├── README.md ├── adventure_of_design └── 1_scalable_PEs_by_Verilog │ ├── README.md │ ├── generate_kernel.v │ ├── report_generate_kernel.txt │ └── report_script_kernel.txt ├── auto_generator ├── .vscode │ └── settings.json ├── generater.cpp ├── main ├── result.v └── source │ ├── input.v │ ├── output_decider.v │ ├── pe_array.v │ └── serialize_deserialize.v ├── basic_components ├── cache │ ├── cache.v │ ├── cache_1.v │ ├── cache_decider.v │ ├── cache_test │ └── util.txt ├── input │ ├── input.v │ └── test │ │ ├── test │ │ ├── test_basic_triangle_shifter.v │ │ ├── test_column_shifter.v │ │ ├── test_input_all.v │ │ ├── test_inverter.v │ │ ├── test_mux.v │ │ └── test_shifter.v ├── pe_array │ ├── pe_array.v │ └── test │ │ ├── output.txt │ │ ├── single_PE_test.v │ │ └── test ├── simple_pe │ ├── single_PE.v │ └── test │ │ ├── output.txt │ │ ├── single_PE_test.v │ │ └── test ├── sparse_pe │ └── sparse_pe.v ├── tile_controller │ └── tile_controller.v └── wrapper │ ├── test │ ├── output.txt │ └── wrapper_test.v │ └── wrapper.v ├── densparsa.v ├── results ├── SA_32_32_16_16_16 │ ├── all_except_input.v │ ├── input_L2.v │ ├── note.txt │ ├── output.v │ └── util │ │ ├── all_except_input.txt │ │ ├── input_L3.txt │ │ ├── output_diagonal.txt │ │ └── output_matrix.txt ├── SA_32_32_16_4_4 │ ├── all_except_input.v │ ├── input_L2.v │ ├── note.txt │ ├── output.v │ └── util │ │ ├── all_except_input.txt │ │ ├── input_L3.txt │ │ ├── output_diagonal.txt │ │ └── output_matrix.txt ├── SA_32_32_16_8_8 │ ├── all_except_input.v │ ├── input_L2.v │ ├── note.txt │ ├── output.v │ └── util │ │ ├── all_except_input.txt │ │ ├── input_L3.txt │ │ ├── output_diagonal.txt │ │ └── output_matrix.txt ├── comparison.md ├── old_reports │ ├── critical_path.md │ └── utilization.md └── plain_SA_32_32_16 │ ├── input_L2.v │ ├── input_L3.v │ ├── output_L3.v │ ├── pe_array.v │ ├── top.v │ └── util │ ├── input_L2.txt │ ├── input_L3.txt │ ├── output.txt │ ├── pe_array.txt │ └── top.txt ├── simu ├── .vscode │ └── 
settings.json ├── README.md ├── main ├── new_sim │ ├── .vscode │ │ └── settings.json │ ├── draft │ ├── saving.txt │ ├── static_sparse_simulation.cpp │ └── test ├── stack_size_monte_carlo.cpp └── test.cpp └── 脉动阵列.drawio /README.md: -------------------------------------------------------------------------------- 1 | # systolic-array-in-verilog 2 | 3 | 用verilog实现的脉动阵列。 4 | 5 | 本代码的目的是用verilog实现可以参数化生成的脉动阵列(包括PE阵列和配套IO),通过eda工具对生成代码进行综合即可估算脉动阵列各部分的资源占用、功耗。 6 | 7 | ### 如何看这坨代码 How to read this pile of code 8 | 9 | 实际上大多是没有封装的半成品。 10 | 如果你恰好被推送了这份仓库,我认为对你来说比较有用的应该是/basic_components/pe_array/pe_array.v。这份代码可以很便捷地自动生成一套不包括IO的纯阵列,如果你想设计、实现自己的脉动阵列,可以在这份代码的基础上自行添加IO模块,或者添加并行功能,etc。除此以外的代码都需要结合具体研究的语境理解。 11 | 12 | 7.15更新: 13 | 阵列本体的自动化脚本 14 | 15 | 7.16更新: 16 | 阵列+输出节拍器的自动化脚本 17 | 18 | 7.23更新: 19 | 发现generate-for实际上比自动化脚本好用。底层模块应该采用generate而非自动化脚本,自动化脚本只应该用来封装最外部的逻辑。 20 | -------------------------------------------------------------------------------- /adventure_of_design/1_scalable_PEs_by_Verilog/README.md: -------------------------------------------------------------------------------- 1 | ### Generating a Scalable PE Array in Verilog 2 | ### 用verilog生成scalable的纯PE阵列 3 | 4 | PE单体非常好写,但复数PE接线很麻烦。一开始的做法是用c++写脚本批量化生成,主要有两个问题: 5 | 1. 调用脚本比代码本身的parameter麻烦 6 | 2. 脚本可拓展性差,改代码需要先研究代码本身怎么改、再研究生成脚本怎么改 7 | 8 | 用脚本做自动化生成固然是必要的,但是如果底层的纯阵列也要靠脚本,脚本就失去了抽象性,起不到控制复杂度的作用。 9 | 因此,在纯阵列这一块上,还是应该转用verilog语言本身的generate来做批量化生成,纯阵列的体积等参数直接使用Verilog本身的parameter来控制。 10 | 11 | ### How to test the correctness of generate-for? 12 | ### generate-for 语句内部的接线如何确保正确? 13 | 14 | 由于之前用脚本写的纯阵列可以确定是靠谱的,我就没有直接测自动生成代码的接线正确性(赶论文没时间了)。 15 | 这里留档了两种写法在Vivado里综合出的不同报告,可见logic LUT、DSP用量相同,memory LUT和register的总用量几乎一致, 16 | 加上接线的时候严格参考了之前写的脚本逻辑,可以认为功能上是等效的,如果日后有问题再说吧。 17 | 18 | 至于比原来的脚本版多用了IOB,则是因为新版代码额外增加了对角线的输出接口,便于后续调用。 19 | 20 | 整个流程给我的经验是 21 | 1. 接线规划成熟的话,generate + assign 比 c++ 高效 22 | 2. 
generate 使用的一维数组和 c++ 脚本使用的零散数组综合效率相同 -------------------------------------------------------------------------------- /adventure_of_design/1_scalable_PEs_by_Verilog/generate_kernel.v: --------------------------------------------------------------------------------
/**
 * Numbering convention (diagram kept from the original source; the column
 * alignment was lost in extraction and is approximate):
 *
 *          f  h
 *       c  e  g
 *       b  d
 *       a
 *
 *    C  B  A      [3_3] [3_2] [3_1]
 *
 *    F  E  D      [2_3] [2_2] [2_1]
 *
 *       H  G      [1_3] [1_2] [1_1]
 */

// One processing element: forwards its inputs and accumulates their product.
//
// Parameters:
//   DATA_WIDTH - width of the operands, the accumulator and the outputs
//   Half_WIDTH - number of low product bits dropped before accumulation
// Ports:
//   clk      - clock; all registers update on the rising edge
//   finish   - when high, o_result captures partial_sum and the accumulator
//              restarts from the current product instead of adding to it
//   i_up     - operand arriving from above; registered onto o_down
//   i_left   - operand arriving from the left; registered onto o_right
//   o_result - last captured accumulator value; holds while finish is low
module single_PE_rounded #(
    parameter DATA_WIDTH = 8,
    parameter Half_WIDTH = 4
)(
    input                         clk,
    input                         finish,
    input      [DATA_WIDTH-1 : 0] i_up,
    input      [DATA_WIDTH-1 : 0] i_left,
    output reg [DATA_WIDTH-1 : 0] o_down,
    output reg [DATA_WIDTH-1 : 0] o_right,
    output reg [DATA_WIDTH-1 : 0] o_result = 0
);
    reg  [DATA_WIDTH-1 : 0]   partial_sum = 0;
    wire [2*DATA_WIDTH-1 : 0] product;
    wire [DATA_WIDTH-1 : 0]   x;

    // The product is formed at full 2*DATA_WIDTH precision before shifting.
    // Writing `(i_up*i_left) >> Half_WIDTH` directly into a DATA_WIDTH-wide
    // net makes Verilog size the multiplication to DATA_WIDTH bits, which
    // throws away the upper product bits *before* the shift.
    assign product = i_up * i_left;
    assign x       = product >> Half_WIDTH;   // keeps the low DATA_WIDTH bits of the shifted product

    always @(posedge clk) begin
        o_down      <= i_up;
        o_right     <= i_left;
        o_result    <= finish ? partial_sum : o_result;
        partial_sum <= finish ? x : (partial_sum + x);
    end
endmodule


// SIZE x SIZE array of single_PE_rounded instances.
//
// PE (i, j) has the serialized 1-based index (i-1)*SIZE + j.  Per the original
// port comments the corners are numbered
//
//     n_n ---- n_1
//      |        |
//      |        |
//     1_n ---- 1_1
//
// so row SIZE is the top row and column SIZE is the left-most column.
//
// Ports (every bus is a flat concatenation of DATA_WIDTH-bit words, word w
// occupying bits [w*DATA_WIDTH-1 -: DATA_WIDTH]):
//   finish       - one finish bit per PE, bit (index-1)
//   in_up        - operands fed into the top row, word j for column j
//   in_left      - operands fed into the left-most column, word i for row i
//   pass_down    - o_down of the PEs in row 1, word k for column k
//   pass_right   - o_right of the PEs in column 1, word k for row k
//   out_matrix   - o_result of every PE, word = serialized index
//   out_diagonal - o_result of PE (k, k), word k
module single_kernel #(
    parameter SIZE = 8,
    parameter DATA_WIDTH = 16
) (
    input                             clk,
    input  [SIZE*SIZE-1:0]            finish,
    input  [SIZE*DATA_WIDTH-1:0]      in_up,
    input  [SIZE*DATA_WIDTH-1:0]      in_left,
    output [SIZE*DATA_WIDTH-1:0]      pass_down,
    output [SIZE*DATA_WIDTH-1:0]      pass_right,
    output [SIZE*SIZE*DATA_WIDTH-1:0] out_matrix,
    output [SIZE*DATA_WIDTH-1:0]      out_diagonal
);
    genvar i, j, k;
    wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_down;
    wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_right;

    generate
        for (i = SIZE; i >= 1; i = i - 1) begin : row
            for (j = SIZE; j >= 1; j = j - 1) begin : col
                // Serialized 1-based index of PE (i, j).
                localparam IDX = (i-1)*SIZE + j;

                wire [DATA_WIDTH-1:0] from_up;
                wire [DATA_WIDTH-1:0] from_left;

                // Top row takes the external in_up word; every other row takes
                // o_down of the PE one row above (index IDX + SIZE).
                if (i == SIZE) begin : top_row
                    assign from_up = in_up[j*DATA_WIDTH-1 -: DATA_WIDTH];
                end else begin : inner_row
                    assign from_up = inner_pass_down[(IDX+SIZE)*DATA_WIDTH-1 -: DATA_WIDTH];
                end

                // Left-most column takes the external in_left word; every other
                // column takes o_right of the PE one column to the left (IDX + 1).
                if (j == SIZE) begin : left_col
                    assign from_left = in_left[i*DATA_WIDTH-1 -: DATA_WIDTH];
                end else begin : inner_col
                    assign from_left = inner_pass_right[(IDX+1)*DATA_WIDTH-1 -: DATA_WIDTH];
                end

                single_PE_rounded #(DATA_WIDTH, DATA_WIDTH/2) pe (
                    clk,
                    finish           [IDX-1],
                    from_up,
                    from_left,
                    inner_pass_down  [IDX*DATA_WIDTH-1 -: DATA_WIDTH],
                    inner_pass_right [IDX*DATA_WIDTH-1 -: DATA_WIDTH],
                    out_matrix       [IDX*DATA_WIDTH-1 -: DATA_WIDTH]
                );
            end
        end
    endgenerate

    generate
        for (k = SIZE; k >= 1; k = k - 1) begin : edge_out
            // Pass data downward to other PE arrays (row 1).
            assign pass_down    [k*DATA_WIDTH-1 -: DATA_WIDTH]
                 = inner_pass_down  [((1-1)*SIZE+k)*DATA_WIDTH-1 -: DATA_WIDTH];
            // Pass data rightward to other PE arrays (column 1).
            assign pass_right   [k*DATA_WIDTH-1 -: DATA_WIDTH]
                 = inner_pass_right [((k-1)*SIZE+1)*DATA_WIDTH-1 -: DATA_WIDTH];
            // Output the results on the diagonal.
            assign out_diagonal [k*DATA_WIDTH-1 -: DATA_WIDTH]
                 = out_matrix       [((k-1)*SIZE+k)*DATA_WIDTH-1 -: DATA_WIDTH];
        end
    endgenerate
endmodule
-------------------------------------------------------------------------------- /adventure_of_design/1_scalable_PEs_by_Verilog/report_generate_kernel.txt: -------------------------------------------------------------------------------- 1 | Utilization Design Information 2 | 3 | Table of Contents 4 | ----------------- 5 | 1. Slice Logic 6 | 1.1 Summary of Registers by Type 7 | 2. Memory 8 | 3. DSP 9 | 4. IO and GT Specific 10 | 5. Clocking 11 | 6. Specific Feature 12 | 7. Primitives 13 | 8. Black Boxes 14 | 9. Instantiated Netlists 15 | 16 | 1. 
Slice Logic 17 | -------------- 18 | 19 | +----------------------------+------+-------+-----------+-------+ 20 | | Site Type | Used | Fixed | Available | Util% | 21 | +----------------------------+------+-------+-----------+-------+ 22 | | Slice LUTs* | 1312 | 0 | 134600 | 0.97 | 23 | | LUT as Logic | 1024 | 0 | 134600 | 0.76 | 24 | | LUT as Memory | 288 | 0 | 46200 | 0.62 | 25 | | LUT as Distributed RAM | 0 | 0 | | | 26 | | LUT as Shift Register | 288 | 0 | | | 27 | | Slice Registers | 2880 | 0 | 269200 | 1.07 | 28 | | Register as Flip Flop | 2880 | 0 | 269200 | 1.07 | 29 | | Register as Latch | 0 | 0 | 269200 | 0.00 | 30 | | F7 Muxes | 0 | 0 | 67300 | 0.00 | 31 | | F8 Muxes | 0 | 0 | 33650 | 0.00 | 32 | +----------------------------+------+-------+-----------+-------+ 33 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 34 | 35 | 36 | 1.1 Summary of Registers by Type 37 | -------------------------------- 38 | 39 | +-------+--------------+-------------+--------------+ 40 | | Total | Clock Enable | Synchronous | Asynchronous | 41 | +-------+--------------+-------------+--------------+ 42 | | 0 | _ | - | - | 43 | | 0 | _ | - | Set | 44 | | 0 | _ | - | Reset | 45 | | 0 | _ | Set | - | 46 | | 0 | _ | Reset | - | 47 | | 0 | Yes | - | - | 48 | | 0 | Yes | - | Set | 49 | | 0 | Yes | - | Reset | 50 | | 0 | Yes | Set | - | 51 | | 2880 | Yes | Reset | - | 52 | +-------+--------------+-------------+--------------+ 53 | 54 | 55 | 2. 
Memory 56 | --------- 57 | 58 | +----------------+------+-------+-----------+-------+ 59 | | Site Type | Used | Fixed | Available | Util% | 60 | +----------------+------+-------+-----------+-------+ 61 | | Block RAM Tile | 0 | 0 | 365 | 0.00 | 62 | | RAMB36/FIFO* | 0 | 0 | 365 | 0.00 | 63 | | RAMB18 | 0 | 0 | 730 | 0.00 | 64 | +----------------+------+-------+-----------+-------+ 65 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 66 | 67 | 68 | 3. DSP 69 | ------ 70 | 71 | +----------------+------+-------+-----------+-------+ 72 | | Site Type | Used | Fixed | Available | Util% | 73 | +----------------+------+-------+-----------+-------+ 74 | | DSPs | 64 | 0 | 740 | 8.65 | 75 | | DSP48E1 only | 64 | | | | 76 | +----------------+------+-------+-----------+-------+ 77 | 78 | 79 | 4. IO and GT Specific 80 | --------------------- 81 | 82 | +-----------------------------+------+-------+-----------+--------+ 83 | | Site Type | Used | Fixed | Available | Util% | 84 | +-----------------------------+------+-------+-----------+--------+ 85 | | Bonded IOB | 1729 | 0 | 500 | 345.80 | 86 | | Bonded IPADs | 0 | 0 | 50 | 0.00 | 87 | | Bonded OPADs | 0 | 0 | 32 | 0.00 | 88 | | PHY_CONTROL | 0 | 0 | 10 | 0.00 | 89 | | PHASER_REF | 0 | 0 | 10 | 0.00 | 90 | | OUT_FIFO | 0 | 0 | 40 | 0.00 | 91 | | IN_FIFO | 0 | 0 | 40 | 0.00 | 92 | | IDELAYCTRL | 0 | 0 | 10 | 0.00 | 93 | | IBUFDS | 0 | 0 | 480 | 0.00 | 94 | | GTPE2_CHANNEL | 0 | 0 | 16 | 0.00 | 95 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 40 | 0.00 | 96 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 40 | 0.00 | 97 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 500 | 0.00 | 98 | | IBUFDS_GTE2 | 0 | 0 | 8 | 0.00 | 99 | | ILOGIC | 0 | 0 | 500 | 0.00 | 100 | | OLOGIC | 0 | 0 | 500 | 0.00 | 101 | +-----------------------------+------+-------+-----------+--------+ 102 | 103 | 104 | 5. 
Clocking 105 | ----------- 106 | 107 | +------------+------+-------+-----------+-------+ 108 | | Site Type | Used | Fixed | Available | Util% | 109 | +------------+------+-------+-----------+-------+ 110 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 111 | | BUFIO | 0 | 0 | 40 | 0.00 | 112 | | MMCME2_ADV | 0 | 0 | 10 | 0.00 | 113 | | PLLE2_ADV | 0 | 0 | 10 | 0.00 | 114 | | BUFMRCE | 0 | 0 | 20 | 0.00 | 115 | | BUFHCE | 0 | 0 | 120 | 0.00 | 116 | | BUFR | 0 | 0 | 40 | 0.00 | 117 | +------------+------+-------+-----------+-------+ 118 | 119 | 120 | 6. Specific Feature 121 | ------------------- 122 | 123 | +-------------+------+-------+-----------+-------+ 124 | | Site Type | Used | Fixed | Available | Util% | 125 | +-------------+------+-------+-----------+-------+ 126 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 127 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 128 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 129 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 130 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 131 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 132 | | PCIE_2_1 | 0 | 0 | 1 | 0.00 | 133 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 134 | | XADC | 0 | 0 | 1 | 0.00 | 135 | +-------------+------+-------+-----------+-------+ 136 | 137 | 138 | 7. Primitives 139 | ------------- 140 | 141 | +----------+------+---------------------+ 142 | | Ref Name | Used | Functional Category | 143 | +----------+------+---------------------+ 144 | | FDRE | 2880 | Flop & Latch | 145 | | OBUF | 1408 | IO | 146 | | LUT3 | 512 | LUT | 147 | | LUT2 | 512 | LUT | 148 | | IBUF | 321 | IO | 149 | | SRL16E | 288 | Distributed Memory | 150 | | CARRY4 | 256 | CarryLogic | 151 | | DSP48E1 | 64 | Block Arithmetic | 152 | | BUFG | 1 | Clock | 153 | +----------+------+---------------------+ 154 | 155 | 156 | 8. Black Boxes 157 | -------------- 158 | 159 | +----------+------+ 160 | | Ref Name | Used | 161 | +----------+------+ 162 | 163 | 164 | 9. 
Instantiated Netlists 165 | ------------------------ 166 | -------------------------------------------------------------------------------- /adventure_of_design/1_scalable_PEs_by_Verilog/report_script_kernel.txt: -------------------------------------------------------------------------------- 1 | Table of Contents 2 | ----------------- 3 | 1. Slice Logic 4 | 1.1 Summary of Registers by Type 5 | 2. Memory 6 | 3. DSP 7 | 4. IO and GT Specific 8 | 5. Clocking 9 | 6. Specific Feature 10 | 7. Primitives 11 | 8. Black Boxes 12 | 9. Instantiated Netlists 13 | 14 | 1. Slice Logic 15 | -------------- 16 | 17 | +----------------------------+------+-------+-----------+-------+ 18 | | Site Type | Used | Fixed | Available | Util% | 19 | +----------------------------+------+-------+-----------+-------+ 20 | | Slice LUTs* | 1280 | 0 | 134600 | 0.95 | 21 | | LUT as Logic | 1024 | 0 | 134600 | 0.76 | 22 | | LUT as Memory | 256 | 0 | 46200 | 0.55 | 23 | | LUT as Distributed RAM | 0 | 0 | | | 24 | | LUT as Shift Register | 256 | 0 | | | 25 | | Slice Registers | 2848 | 0 | 269200 | 1.06 | 26 | | Register as Flip Flop | 2848 | 0 | 269200 | 1.06 | 27 | | Register as Latch | 0 | 0 | 269200 | 0.00 | 28 | | F7 Muxes | 0 | 0 | 67300 | 0.00 | 29 | | F8 Muxes | 0 | 0 | 33650 | 0.00 | 30 | +----------------------------+------+-------+-----------+-------+ 31 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
32 | 33 | 34 | 1.1 Summary of Registers by Type 35 | -------------------------------- 36 | 37 | +-------+--------------+-------------+--------------+ 38 | | Total | Clock Enable | Synchronous | Asynchronous | 39 | +-------+--------------+-------------+--------------+ 40 | | 0 | _ | - | - | 41 | | 0 | _ | - | Set | 42 | | 0 | _ | - | Reset | 43 | | 0 | _ | Set | - | 44 | | 0 | _ | Reset | - | 45 | | 0 | Yes | - | - | 46 | | 0 | Yes | - | Set | 47 | | 0 | Yes | - | Reset | 48 | | 0 | Yes | Set | - | 49 | | 2848 | Yes | Reset | - | 50 | +-------+--------------+-------------+--------------+ 51 | 52 | 53 | 2. Memory 54 | --------- 55 | 56 | +----------------+------+-------+-----------+-------+ 57 | | Site Type | Used | Fixed | Available | Util% | 58 | +----------------+------+-------+-----------+-------+ 59 | | Block RAM Tile | 0 | 0 | 365 | 0.00 | 60 | | RAMB36/FIFO* | 0 | 0 | 365 | 0.00 | 61 | | RAMB18 | 0 | 0 | 730 | 0.00 | 62 | +----------------+------+-------+-----------+-------+ 63 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 64 | 65 | 66 | 3. DSP 67 | ------ 68 | 69 | +----------------+------+-------+-----------+-------+ 70 | | Site Type | Used | Fixed | Available | Util% | 71 | +----------------+------+-------+-----------+-------+ 72 | | DSPs | 64 | 0 | 740 | 8.65 | 73 | | DSP48E1 only | 64 | | | | 74 | +----------------+------+-------+-----------+-------+ 75 | 76 | 77 | 4. 
IO and GT Specific 78 | --------------------- 79 | 80 | +-----------------------------+------+-------+-----------+--------+ 81 | | Site Type | Used | Fixed | Available | Util% | 82 | +-----------------------------+------+-------+-----------+--------+ 83 | | Bonded IOB | 1601 | 0 | 500 | 320.20 | 84 | | Bonded IPADs | 0 | 0 | 50 | 0.00 | 85 | | Bonded OPADs | 0 | 0 | 32 | 0.00 | 86 | | PHY_CONTROL | 0 | 0 | 10 | 0.00 | 87 | | PHASER_REF | 0 | 0 | 10 | 0.00 | 88 | | OUT_FIFO | 0 | 0 | 40 | 0.00 | 89 | | IN_FIFO | 0 | 0 | 40 | 0.00 | 90 | | IDELAYCTRL | 0 | 0 | 10 | 0.00 | 91 | | IBUFDS | 0 | 0 | 480 | 0.00 | 92 | | GTPE2_CHANNEL | 0 | 0 | 16 | 0.00 | 93 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 40 | 0.00 | 94 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 40 | 0.00 | 95 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 500 | 0.00 | 96 | | IBUFDS_GTE2 | 0 | 0 | 8 | 0.00 | 97 | | ILOGIC | 0 | 0 | 500 | 0.00 | 98 | | OLOGIC | 0 | 0 | 500 | 0.00 | 99 | +-----------------------------+------+-------+-----------+--------+ 100 | 101 | 102 | 5. Clocking 103 | ----------- 104 | 105 | +------------+------+-------+-----------+-------+ 106 | | Site Type | Used | Fixed | Available | Util% | 107 | +------------+------+-------+-----------+-------+ 108 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 109 | | BUFIO | 0 | 0 | 40 | 0.00 | 110 | | MMCME2_ADV | 0 | 0 | 10 | 0.00 | 111 | | PLLE2_ADV | 0 | 0 | 10 | 0.00 | 112 | | BUFMRCE | 0 | 0 | 20 | 0.00 | 113 | | BUFHCE | 0 | 0 | 120 | 0.00 | 114 | | BUFR | 0 | 0 | 40 | 0.00 | 115 | +------------+------+-------+-----------+-------+ 116 | 117 | 118 | 6. 
Specific Feature 119 | ------------------- 120 | 121 | +-------------+------+-------+-----------+-------+ 122 | | Site Type | Used | Fixed | Available | Util% | 123 | +-------------+------+-------+-----------+-------+ 124 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 125 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 126 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 127 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 128 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 129 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 130 | | PCIE_2_1 | 0 | 0 | 1 | 0.00 | 131 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 132 | | XADC | 0 | 0 | 1 | 0.00 | 133 | +-------------+------+-------+-----------+-------+ 134 | 135 | 136 | 7. Primitives 137 | ------------- 138 | 139 | +----------+------+---------------------+ 140 | | Ref Name | Used | Functional Category | 141 | +----------+------+---------------------+ 142 | | FDRE | 2848 | Flop & Latch | 143 | | OBUF | 1280 | IO | 144 | | LUT3 | 512 | LUT | 145 | | LUT2 | 512 | LUT | 146 | | IBUF | 321 | IO | 147 | | SRL16E | 256 | Distributed Memory | 148 | | CARRY4 | 256 | CarryLogic | 149 | | DSP48E1 | 64 | Block Arithmetic | 150 | | BUFG | 1 | Clock | 151 | +----------+------+---------------------+ 152 | 153 | 154 | 8. Black Boxes 155 | -------------- 156 | 157 | +----------+------+ 158 | | Ref Name | Used | 159 | +----------+------+ 160 | 161 | 162 | 9. 
Instantiated Netlists 163 | ------------------------ 164 | 165 | +----------+------+ 166 | | Ref Name | Used | 167 | +----------+------+ 168 | 169 | -------------------------------------------------------------------------------- /auto_generator/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "ostream": "cpp", 4 | "iosfwd": "cpp" 5 | } 6 | } -------------------------------------------------------------------------------- /auto_generator/main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ziheng-W/systolic-array-verilog/bc64a45f1468f3e0352c2e921522b767ccf06cd0/auto_generator/main -------------------------------------------------------------------------------- /auto_generator/source/input.v: -------------------------------------------------------------------------------- 1 | module mux #( 2 | parameter LENGTH = 4, 3 | parameter DATA_WIDTH = 16 4 | )( 5 | input flag, 6 | input [LENGTH*DATA_WIDTH-1 : 0] in_0, 7 | input [LENGTH*DATA_WIDTH-1 : 0] in_1, 8 | output [LENGTH*DATA_WIDTH-1 : 0] out 9 | ); 10 | assign out = flag ? in_1 : in_0; 11 | endmodule 12 | 13 | module invert #( 14 | parameter LENGTH = 4, 15 | parameter DATA_WIDTH = 16 16 | ) ( 17 | input [LENGTH*DATA_WIDTH-1 : 0] in, 18 | output [LENGTH*DATA_WIDTH-1 : 0] out 19 | ); 20 | genvar i; 21 | generate for (i=0; i> Half_WIDTH; 16 | always @(posedge clk) begin 17 | o_down <= i_up; 18 | o_right <= i_left; 19 | o_result <= finish ? partial_sum : o_result; 20 | partial_sum <= finish ? 
x : (partial_sum + x); 21 | end 22 | endmodule 23 | 24 | 25 | module singple_kernel #( 26 | parameter DATA_WIDTH = 16, 27 | parameter SIZE = 8 28 | ) ( 29 | input clk, 30 | input [SIZE*SIZE-1:0] finish, // 编号规则: 31 | input [SIZE*DATA_WIDTH-1:0] in_up, // n_n ---- n_1 32 | input [SIZE*DATA_WIDTH-1:0] in_left, // | | 33 | output [SIZE*DATA_WIDTH-1:0] pass_down, // | | 34 | output [SIZE*DATA_WIDTH-1:0] pass_right, // 1_n ---- 1_1 35 | output [SIZE*SIZE*DATA_WIDTH-1:0] out_matrix, // 36 | output [SIZE*DATA_WIDTH-1:0] out_diagonal // serialized_index(i,j) = (i-1)*SIZE + j 37 | ); 38 | genvar i,j,k; 39 | wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_down; 40 | wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_right; 41 | generate 42 | for (i=SIZE; i>=1; i=i-1) begin 43 | for (j=SIZE; j>=1; j=j-1) begin 44 | if (i==SIZE && j==SIZE) begin // 左上角。the upper-left PE 45 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 46 | pe (clk, finish [(i-1)*SIZE+j-1], 47 | in_up [j*DATA_WIDTH-1 -:DATA_WIDTH], 48 | in_left [i*DATA_WIDTH-1 -:DATA_WIDTH], 49 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 50 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 51 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 52 | end else if (i==SIZE && j!=SIZE) begin // 最上一行。PEs in the upper-most row 53 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 54 | pe (clk, finish [(i-1)*SIZE+j-1], 55 | in_up [j*DATA_WIDTH-1 -:DATA_WIDTH], 56 | inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH], 57 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 58 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 59 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 60 | end else if (i!=SIZE && j==SIZE) begin // 最左一列。PEs in the left-most column 61 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 62 | pe (clk, finish [(i-1)*SIZE+j-1], 63 | inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 64 | in_left [i*DATA_WIDTH-1 -:DATA_WIDTH], 65 | 
inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 66 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 67 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 68 | end else begin // 其他PE。all other PEs 69 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 70 | pe (clk, finish [(i-1)*SIZE+j-1], 71 | inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 72 | inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH], 73 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 74 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 75 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 76 | end end end 77 | endgenerate 78 | generate 79 | for (k=SIZE; k>=1; k=k-1) begin 80 | // 向下侧阵列传递。pass data downward to other PE arays 81 | // 向下侧阵列传递。pass data rightward to other PE arays 82 | // 输出对角线值。 output results in diagonal position 83 | assign pass_down [k*DATA_WIDTH-1 -:DATA_WIDTH] 84 | = inner_pass_down [((1-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH]; 85 | assign pass_right [k*DATA_WIDTH-1 -:DATA_WIDTH] 86 | = inner_pass_right [((k-1)*SIZE+1)*DATA_WIDTH-1 -:DATA_WIDTH]; 87 | assign out_diagonal [k*DATA_WIDTH-1 -:DATA_WIDTH] 88 | = out_matrix [((k-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH]; 89 | end 90 | endgenerate 91 | endmodule 92 | 93 | -------------------------------------------------------------------------------- /auto_generator/source/serialize_deserialize.v: --------------------------------------------------------------------------------
// Parallel-to-serial converter; the highest word is shifted out first.
//
// Parameters:
//   LENGTH    - number of words held in the shift register
//   BIT_WIDTH - width of one word in bits
// Ports:
//   clk          - clock; the register updates on the rising edge
//   write_enable - with read_enable low, loads `in` into the register
//   read_enable  - with write_enable low, moves every word up by one position
//   in           - the LENGTH words to load
//   out          - the top word of the register
// When the two enables are both high or both low the register holds its value.
// A shift does not clear the lowest word; it keeps its previous contents.
module serialize #(
    parameter LENGTH = 4,
    parameter BIT_WIDTH = 16
) (
    input                           clk,
    input                           write_enable,
    input                           read_enable,
    input  [LENGTH*BIT_WIDTH-1 : 0] in,
    output [BIT_WIDTH-1 : 0]        out
);
    // Named shift_reg rather than `local`: `local` is a reserved word in
    // SystemVerilog, so the old name broke compilation in SV-mode tool flows.
    reg [BIT_WIDTH*LENGTH-1 : 0] shift_reg = 0;
    integer w;

    always @(posedge clk) begin
        if (write_enable == 1 & read_enable == 0) begin
            shift_reg <= in;
        end else if (write_enable == 0 & read_enable == 1) begin
            // Non-blocking assignments: every word takes the OLD value of the
            // word below it, so the loop order does not matter.
            for (w = LENGTH; w >= 2; w = w - 1) begin
                shift_reg[w*BIT_WIDTH-1 -: BIT_WIDTH]
                    <= shift_reg[(w-1)*BIT_WIDTH-1 -: BIT_WIDTH];
            end
        end
        // Otherwise: no assignment, the register keeps its value.
    end

    assign out = shift_reg[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH];
endmodule

// Serial-to-parallel converter.
//
// Parameters:
//   LENGTH    - number of words collected
//   BIT_WIDTH - width of one word in bits
// Ports:
//   clk         - clock; the register updates on the rising edge
//   read_enable - when high, shifts the register up by one word and stores
//                 `in` in the lowest word; when low the register holds
//   in          - incoming word
//   out         - the whole register; the word received first sits highest
module deserialize #(
    parameter LENGTH = 4,
    parameter BIT_WIDTH = 64
) (
    input                           clk,
    input                           read_enable,
    input  [BIT_WIDTH-1 : 0]        in,
    output [LENGTH*BIT_WIDTH-1 : 0] out
);
    // See serialize: `local` is avoided because it is an SV reserved word.
    reg [LENGTH*BIT_WIDTH-1 : 0] shift_reg = 0;
    integer w;

    always @(posedge clk) begin
        if (read_enable) begin
            for (w = LENGTH; w >= 2; w = w - 1) begin
                shift_reg[w*BIT_WIDTH-1 -: BIT_WIDTH]
                    <= shift_reg[(w-1)*BIT_WIDTH-1 -: BIT_WIDTH];
            end
            shift_reg[BIT_WIDTH-1 -: BIT_WIDTH] <= in;
        end
    end

    assign out = shift_reg;
endmodule
-------------------------------------------------------------------------------- /basic_components/cache/cache.v: -------------------------------------------------------------------------------- 1 | 2 | module test_cache #( 3 | parameter Data_Width = 16, 4 | parameter Index_Width = 5, 5 | parameter Address_Width = 5 6 | ) ( 7 | input clk, 8 | /******** 端口1 ********/ 9 | input WE_0, 10 | input [Data_Width-1:0] data_i_0, 11 | input [Index_Width-1:0] index_i_0, 12 | input [Index_Width-1:0] index_o_0, 13 | output [Data_Width-1:0] data_o_0 14 | ); 15 | reg [Data_Width+Index_Width+3-1:0] ram [19:0]; 16 | reg [19:0] available_list = -1; // 1意味着可选,0意味着被占用 17 | reg [Address_Width-1:0] ptr_i_0 = 0; 18 | reg [Address_Width-1:0] ptr_o_0 = 0; 19 | integer i; 20 | 21 | initial begin 22 | for(i=0; i<20; i=i+1) begin 23 | ram[i] = 0; end end 24 | 25 | /******** 端口1 ********/ 26 | // 写入 27 | always @(posedge clk ) begin 28 | if (WE_0) begin 29 | ram[ptr_i_0] = {3'b000,index_i_0, data_i_0}; 30 | available_list[ptr_i_0] = 0; 31 | end 32 | end 33 | // 读出 34 | always @ (index_o_0) begin 35 | for(i=0; i<20; i++) begin // 检索匹配 36 | if 
(ram[i][Data_Width+Index_Width+3-1 : Data_Width] == {3'b000, index_o_0}) begin 37 | ptr_o_0 = i; 38 | available_list[ptr_i_0] = 1; 39 | end 40 | end 41 | end 42 | assign data_o_0 = ram[ptr_o_0][Data_Width-1:0]; // 输出 43 | // // 更新空闲指针 44 | // always @() begin 45 | 46 | // end 47 | endmodule 48 | 49 | 50 | -------------------------------------------------------------------------------- /basic_components/cache/cache_1.v: -------------------------------------------------------------------------------- 1 | // 命名规则:cache_x 即为x入x出的cache。多pe共用cache,分担存储压力。 2 | 3 | // 按照稀疏比 0.1 vs 0.4 的模拟,平均每个pe额外使用4*Data_Width个reg,一共需要32个word。 4 | // word的组成是data + index 5 | module cache_8 #( 6 | parameter Data_Width = 16, 7 | parameter Address_Width = 5 8 | ) ( 9 | input clk, 10 | // write enable 11 | input WE_0, input WE_1, input WE_2, input WE_3, 12 | input WE_4, input WE_5, input WE_6, input WE_7, 13 | // write address 14 | input [Address_Width-1:0] W_Add_0, input [Address_Width-1:0] W_Add_1, 15 | input [Address_Width-1:0] W_Add_2, input [Address_Width-1:0] W_Add_3, 16 | input [Address_Width-1:0] W_Add_4, input [Address_Width-1:0] W_Add_5, 17 | input [Address_Width-1:0] W_Add_6, input [Address_Width-1:0] W_Add_7, 18 | // read address 19 | input [Address_Width-1:0] R_Add_0, input [Address_Width-1:0] R_Add_1, 20 | input [Address_Width-1:0] R_Add_2, input [Address_Width-1:0] R_Add_3, 21 | input [Address_Width-1:0] R_Add_4, input [Address_Width-1:0] R_Add_5, 22 | input [Address_Width-1:0] R_Add_6, input [Address_Width-1:0] R_Add_7, 23 | // write data 24 | input [Data_Width-1:0] W_Data_0, input [Data_Width-1:0] W_Data_1, 25 | input [Data_Width-1:0] W_Data_2, input [Data_Width-1:0] W_Data_3, 26 | input [Data_Width-1:0] W_Data_4, input [Data_Width-1:0] W_Data_5, 27 | input [Data_Width-1:0] W_Data_6, input [Data_Width-1:0] W_Data_7, 28 | // read data 29 | output reg [Data_Width-1:0] R_Data_0, output reg [Data_Width-1:0] R_Data_1, 30 | output reg [Data_Width-1:0] R_Data_2, output reg 
[Data_Width-1:0] R_Data_3, 31 | output reg [Data_Width-1:0] R_Data_4, output reg [Data_Width-1:0] R_Data_5, 32 | output reg [Data_Width-1:0] R_Data_6, output reg [Data_Width-1:0] R_Data_7 33 | ); 34 | reg [Data_Width+Address_Width-1:0] ram [31:0]; 35 | 36 | 37 | endmodule


// Scans a 20-bit mask on every rising clock edge and records indices of set
// bits in four slots.
//
// The scan runs from bit 0 upwards and fills the slots from available_3 down
// to available_0.  Once slot 0 has been reached the slot pointer stops
// decrementing, so every further set bit overwrites slot 0 and available_0
// ends up holding the highest set bit index.  Slots that are not reached in a
// given cycle keep their previous contents (they are never cleared here).
//
// Parameter `non` is not used by the module body.
module brake_for_ #(
    parameter non = 1
) (
    input clk
    , input  [19:0] bit_mask
    , output [4:0]  available_0
    , output [4:0]  available_1
    , output [4:0]  available_2
    , output [4:0]  available_3
);
    integer i;                   // mask bit being examined, 0 ~ 19
    integer j;                   // slot to fill next, 0 ~ 3
    reg [4:0] availables [3:0];

    // Blocking assignments are intentional: j must change inside the scan.
    always @(posedge clk) begin
        j = 3;
        for (i = 0; i < 20; i = i + 1) begin
            if (bit_mask[i] == 1) begin
                availables[j] = i;
                j = j > 0 ? j - 1 : j;
            end
        end
    end

    assign available_0 = availables[0];
    assign available_1 = availables[1];
    assign available_2 = availables[2];
    assign available_3 = availables[3];
endmodule


// Four-port cache skeleton: only the port list, the storage array and an
// empty write block exist so far.
// NOTE(review): WE_4..WE_7 are declared although only four address/data port
// groups exist; they are kept so existing instantiations still connect.
module cache_4 #(
    parameter Data_Width = 16,
    parameter Address_Width = 5
) (
    input clk,
    // write enable
    input WE_0, input WE_1, input WE_2, input WE_3,
    input WE_4, input WE_5, input WE_6, input WE_7,
    // write address
    input [Address_Width-1:0] W_Add_0, input [Address_Width-1:0] W_Add_1,
    input [Address_Width-1:0] W_Add_2, input [Address_Width-1:0] W_Add_3,
    // read address
    input [Address_Width-1:0] R_Add_0, input [Address_Width-1:0] R_Add_1,
    input [Address_Width-1:0] R_Add_2, input [Address_Width-1:0] R_Add_3,
    // write data
    input [Data_Width-1:0] W_Data_0, input [Data_Width-1:0] W_Data_1,
    input [Data_Width-1:0] W_Data_2, input [Data_Width-1:0] W_Data_3,
    // read data
    output reg [Data_Width-1:0] R_Data_0, output reg [Data_Width-1:0] R_Data_1,
    output reg [Data_Width-1:0] R_Data_2, output reg [Data_Width-1:0] R_Data_3
);
    reg [Data_Width+Address_Width-1:0] ram [31:0];

    // write (body not implemented yet)
    always @(posedge clk) begin
        if (WE_0) begin

        end
    end

endmodule
-------------------------------------------------------------------------------- /basic_components/cache/cache_decider.v: --------------------------------------------------------------------------------
// Scans a 20-bit mask on every rising clock edge and records the indices of
// the first four set bits, lowest index first, in availables[0..3].
//
// Each scan first resets every slot to -1 (5'b11111 in a 5-bit slot), so a
// slot still holding 31 after the scan means "no set bit found for it".
// The module has no result ports: the available_N outputs and their assigns
// were commented out in the original, and the testbench in this file reads
// the result hierarchically through `availables`.
//
// Parameter `non` is not used by the module body.
module cache_decider #(
    parameter non = 1
) (
    input clk
    , input [19:0] bit_mask
);
    integer i;                   // mask bit being examined, 0 ~ 19
    integer j;                   // slot to fill next, 0 ~ 3
    reg [4:0] availables [3:0];

    initial begin
        for (i = 0; i < 4; i = i + 1) begin
            availables[i] = 0;
        end
    end

    // Blocking assignments are intentional: j must change inside the scan.
    always @(posedge clk) begin
        j = 0;
        for (i = 0; i < 4; i = i + 1) begin
            availables[i] = -1;
        end
        for (i = 0; i < 20; i = i + 1) begin
            // `j < 4` keeps the write inside the 4-entry array.  Without it a
            // mask with more than four set bits indexed availables[4] and up,
            // which simulation silently drops but is not a defined memory
            // location for synthesis.
            if (bit_mask[i] == 1 && j < 4) begin
                availables[j] = i;
                j = j + 1;
            end
        end
    end
endmodule

54 | module test(); 55 | integer currTime = 0; 56 | reg clk = 0; 57 | reg [19:0] bitmask = 0; 58 | // wire [4:0] available_0; 59 | // wire [4:0] available_1; 60 | // wire [4:0] 
available_2; 61 | // wire [4:0] available_3; 62 | // wire [4:0] available_4; 63 | // wire [4:0] available_5; 64 | // wire [4:0] available_6; 65 | // wire [4:0] available_7; 66 | // wire [4:0] available_8; 67 | // wire [4:0] available_9; 68 | // wire [4:0] available_10; 69 | // wire [4:0] available_11; 70 | cache_decider # (1) little_cache (clk, bitmask); 71 | 72 | // 修改input、显示初态 73 | initial begin 74 | # 0 75 | # 2 bitmask = 20'b0000_0000_0000_0000_1100; 76 | // # 2 bitmask = 20'b0000_1111_0000_1111_1111; 77 | // # 2 bitmask = 20'b0000_1111_0000_0000_0000; 78 | // # 2 bitmask = 20'b0000_0000_0111_0000_0000; 79 | # 2 $finish; 80 | end 81 | // 计时并间隔输出 82 | always #2 begin 83 | currTime = currTime + 1; 84 | $display(" "); 85 | end 86 | // output 87 | always #1 begin 88 | clk = ~clk; 89 | $display("Time: %d, clk: %d, ava_0: %d, ava_1: %d, ava_2: %d, ava_3: %d", currTime, clk, little_cache.availables[0], little_cache.availables[1], little_cache.availables[2], little_cache.availables[3]); 90 | end 91 | endmodule 92 | -------------------------------------------------------------------------------- /basic_components/cache/cache_test: -------------------------------------------------------------------------------- 1 | #! 
/usr/local/bin/vvp 2 | :ivl_version "13.0 (devel)" "(s20221226-516-g615a01c6c)"; 3 | :ivl_delay_selection "TYPICAL"; 4 | :vpi_time_precision + 0; 5 | :vpi_module "/usr/local/lib/ivl/system.vpi"; 6 | :vpi_module "/usr/local/lib/ivl/vhdl_sys.vpi"; 7 | :vpi_module "/usr/local/lib/ivl/vhdl_textio.vpi"; 8 | :vpi_module "/usr/local/lib/ivl/v2005_math.vpi"; 9 | :vpi_module "/usr/local/lib/ivl/va_math.vpi"; 10 | S_0x559a05fcc4e0 .scope module, "test" "test" 2 54; 11 | .timescale 0 0; 12 | v0x559a05fe1bb0_0 .var "bitmask", 19 0; 13 | v0x559a05fe1c70_0 .var "clk", 0 0; 14 | v0x559a05fe1d40_0 .var/i "currTime", 31 0; 15 | S_0x559a05fcc670 .scope module, "little_cache" "cache_decider" 2 70, 2 1 0, S_0x559a05fcc4e0; 16 | .timescale 0 0; 17 | .port_info 0 /INPUT 1 "clk"; 18 | .port_info 1 /INPUT 20 "bit_mask"; 19 | P_0x559a05fcc850 .param/l "non" 0 2 2, +C4<00000000000000000000000000000001>; 20 | v0x559a05fb9710 .array "availables", 0 3, 4 0; 21 | v0x559a05fb97b0_0 .net "bit_mask", 19 0, v0x559a05fe1bb0_0; 1 drivers 22 | v0x559a05fe1870_0 .net "clk", 0 0, v0x559a05fe1c70_0; 1 drivers 23 | v0x559a05fe1940_0 .var/i "i", 31 0; 24 | v0x559a05fe1a20_0 .var/i "j", 31 0; 25 | E_0x559a05fc8a90 .event posedge, v0x559a05fe1870_0; 26 | .scope S_0x559a05fcc670; 27 | T_0 ; 28 | %pushi/vec4 0, 0, 32; 29 | %store/vec4 v0x559a05fe1940_0, 0, 32; 30 | T_0.0 ; Top of for-loop 31 | %load/vec4 v0x559a05fe1940_0; 32 | %cmpi/s 4, 0, 32; 33 | %jmp/0xz T_0.1, 5; 34 | %pushi/vec4 0, 0, 5; 35 | %ix/getv/s 4, v0x559a05fe1940_0; 36 | %store/vec4a v0x559a05fb9710, 4, 0; 37 | T_0.2 ; for-loop step statement 38 | %load/vec4 v0x559a05fe1940_0; 39 | %addi 1, 0, 32; 40 | %store/vec4 v0x559a05fe1940_0, 0, 32; 41 | %jmp T_0.0; 42 | T_0.1 ; for-loop exit label 43 | %end; 44 | .thread T_0; 45 | .scope S_0x559a05fcc670; 46 | T_1 ; 47 | %wait E_0x559a05fc8a90; 48 | %pushi/vec4 0, 0, 32; 49 | %store/vec4 v0x559a05fe1a20_0, 0, 32; 50 | %pushi/vec4 0, 0, 32; 51 | %store/vec4 v0x559a05fe1940_0, 0, 32; 52 | T_1.0 ; Top of 
for-loop 53 | %load/vec4 v0x559a05fe1940_0; 54 | %cmpi/s 4, 0, 32; 55 | %jmp/0xz T_1.1, 5; 56 | %pushi/vec4 31, 0, 5; 57 | %ix/getv/s 4, v0x559a05fe1940_0; 58 | %store/vec4a v0x559a05fb9710, 4, 0; 59 | T_1.2 ; for-loop step statement 60 | %load/vec4 v0x559a05fe1940_0; 61 | %addi 1, 0, 32; 62 | %store/vec4 v0x559a05fe1940_0, 0, 32; 63 | %jmp T_1.0; 64 | T_1.1 ; for-loop exit label 65 | %pushi/vec4 0, 0, 32; 66 | %store/vec4 v0x559a05fe1940_0, 0, 32; 67 | T_1.3 ; Top of for-loop 68 | %load/vec4 v0x559a05fe1940_0; 69 | %cmpi/s 20, 0, 32; 70 | %jmp/0xz T_1.4, 5; 71 | %load/vec4 v0x559a05fb97b0_0; 72 | %load/vec4 v0x559a05fe1940_0; 73 | %part/s 1; 74 | %pad/u 32; 75 | %cmpi/e 1, 0, 32; 76 | %jmp/0xz T_1.6, 4; 77 | %load/vec4 v0x559a05fe1940_0; 78 | %pad/s 5; 79 | %ix/getv/s 4, v0x559a05fe1a20_0; 80 | %store/vec4a v0x559a05fb9710, 4, 0; 81 | %load/vec4 v0x559a05fe1a20_0; 82 | %addi 1, 0, 32; 83 | %store/vec4 v0x559a05fe1a20_0, 0, 32; 84 | T_1.6 ; 85 | T_1.5 ; for-loop step statement 86 | %load/vec4 v0x559a05fe1940_0; 87 | %addi 1, 0, 32; 88 | %store/vec4 v0x559a05fe1940_0, 0, 32; 89 | %jmp T_1.3; 90 | T_1.4 ; for-loop exit label 91 | %jmp T_1; 92 | .thread T_1; 93 | .scope S_0x559a05fcc4e0; 94 | T_2 ; 95 | %pushi/vec4 0, 0, 32; 96 | %store/vec4 v0x559a05fe1d40_0, 0, 32; 97 | %pushi/vec4 0, 0, 1; 98 | %store/vec4 v0x559a05fe1c70_0, 0, 1; 99 | %pushi/vec4 0, 0, 20; 100 | %store/vec4 v0x559a05fe1bb0_0, 0, 20; 101 | %end; 102 | .thread T_2; 103 | .scope S_0x559a05fcc4e0; 104 | T_3 ; 105 | %delay 0, 0; 106 | %delay 2, 0; 107 | %pushi/vec4 12, 0, 20; 108 | %store/vec4 v0x559a05fe1bb0_0, 0, 20; 109 | %delay 2, 0; 110 | %vpi_call 2 79 "$finish" {0 0 0}; 111 | %end; 112 | .thread T_3; 113 | .scope S_0x559a05fcc4e0; 114 | T_4 ; 115 | %delay 2, 0; 116 | %load/vec4 v0x559a05fe1d40_0; 117 | %addi 1, 0, 32; 118 | %store/vec4 v0x559a05fe1d40_0, 0, 32; 119 | %vpi_call 2 84 "$display", " " {0 0 0}; 120 | %jmp T_4; 121 | .thread T_4; 122 | .scope S_0x559a05fcc4e0; 123 | T_5 ; 124 | %delay 
1, 0; 125 | %load/vec4 v0x559a05fe1c70_0; 126 | %inv; 127 | %store/vec4 v0x559a05fe1c70_0, 0, 1; 128 | %vpi_call 2 89 "$display", "Time: %d, clk: %d, ava_0: %d, ava_1: %d, ava_2: %d, ava_3: %d", v0x559a05fe1d40_0, v0x559a05fe1c70_0, &A, &A, &A, &A {0 0 0}; 129 | %jmp T_5; 130 | .thread T_5; 131 | # The file index is used to find the file name in the following table. 132 | :file_names 3; 133 | "N/A"; 134 | ""; 135 | "cache_decider.v"; 136 | -------------------------------------------------------------------------------- /basic_components/cache/util.txt: -------------------------------------------------------------------------------- 1 | 写了部分代码的数据分析: 2 | 1. 3 | 存入的时候需要查找可用的位置(从比方说20个里面找出8个能存东西的地方) 4 | 从20个里面取出8个0,LUT消耗是246,gate level是20左右 5 | 2. 6 | 取出的时候要匹配index,LUT消耗如下: 7 | n 中取出8的LUT: 8 | n 24 20 16 12 9 | LUT 290 241 182 152 10 | 平均 12.083 12.05 11.375 12.66 11 | 层数 8+-,区别不大 12 | 选16的话,相当于每个pe增加了22.75个LUT。选20的话是30个,有些多了 13 | 3. 14 | 计算不同pe分比的reg消耗 15 | 假设n个pe共用,则需要log2n bit的cache内部地址+16bit数据+5bit index 16 | 相当于一个word 有 (log2n + 5 + 16)位 17 | 若单个一组,则8个pe需要 8 * 6 * (5+16)= 1008 个寄存器,平均每pe多了7.875个16bit 18 | 若4个一组,则需要 2 * 13 * (2 + 5 + 16) = 598 个寄存器,平均每个pe多用了4.67个16bit 19 | 若8个一组,则需要 1 * 20 * (3 + 5 + 16) = 480 个寄存器,平均每个pe多了3.75个16bit 20 | 所以,pe消耗是8个一组划算。 21 | * 考虑到如果8个pe平分20个word,均摊的LUT不太划算(见上),改为16个的话: 22 | 消耗寄存器为 1 * 16 * (3 + 5 + 16) = 384,平均每pe多用3个16bit。这种情况下,需要多层cache 23 | 4. 24 | 大致的critical path 25 | level: 8+- 26 | 读: 接收并匹配index,输出并更新 list 27 | 写: 根据list计算可用位置,存储并更新list 28 | level: 20+- 写reg 29 | 30 | 31 | 一些代码不好写的地方: 32 | 我们显然希望可以做到一个周期内先取出一个,然后旧的立刻填上位置,这样总的内容消耗可以比较小,但是时钟会变慢 33 | verilog对类似多维向量的支持不太好,8个端口到时候基本上得手写。 34 | 35 | 36 | 待定: 37 | pe发出申请是在当前周期内给回复,还是下个周期回复? 
38 | 周期内回复,时钟线基本上必定翻倍(发出申请、收到数据、mac) 39 | 下个周期回复的话,一来要加一个16bit寄存器,二来要俩MAC。 40 | 41 | 42 | -------------------------------------------------------------------------------- /basic_components/input/input.v: -------------------------------------------------------------------------------- 1 | module mux #( 2 | parameter LENGTH = 4, 3 | parameter DATA_WIDTH = 16 4 | )( 5 | input flag, 6 | input [LENGTH*DATA_WIDTH-1 : 0] in_0, 7 | input [LENGTH*DATA_WIDTH-1 : 0] in_1, 8 | output [LENGTH*DATA_WIDTH-1 : 0] out 9 | ); 10 | assign out = flag ? in_1 : in_0; 11 | endmodule 12 | 13 | module invert #( 14 | parameter LENGTH = 4, 15 | parameter DATA_WIDTH = 16 16 | ) ( 17 | input [LENGTH*DATA_WIDTH-1 : 0] in, 18 | output [LENGTH*DATA_WIDTH-1 : 0] out 19 | ); 20 | genvar i; 21 | generate for (i=0; i> Half_WIDTH; 16 | always @(posedge clk) begin 17 | o_down <= i_up; 18 | o_right <= i_left; 19 | o_result <= finish ? partial_sum : o_result; 20 | partial_sum <= finish ? x : (partial_sum + x); 21 | end 22 | endmodule 23 | 24 | 25 |
// single_kernel: a SIZE x SIZE array of single_PE_rounded cells wired together
// with generate-for, without any IO logic.
//
// PE numbering used by all index arithmetic below:
//
//     n_n ---- n_1
//      |        |
//      |        |
//     1_n ---- 1_1
//
// PE (i,j) is row i, column j, with n = SIZE; (SIZE,SIZE) is the upper-left
// corner. serialized_index(i,j) = (i-1)*SIZE + j, and slot s of a flattened
// bus occupies bits [s*DATA_WIDTH-1 -: DATA_WIDTH].
//
// Each PE is instantiated positionally as
//   (clk, finish, i_up, i_left, o_down, o_right, o_result)
// with parameters (DATA_WIDTH, DATA_WIDTH/2).
// NOTE(review): this assumes the port order of the single_PE_rounded
// definitions elsewhere in the repository - confirm against the one compiled
// with this file.
module single_kernel #(
    parameter SIZE = 8,
    parameter DATA_WIDTH = 16
) (
    input clk,
    input [SIZE*SIZE-1:0] finish,                  // one bit per PE; PE (i,j) reads bit (i-1)*SIZE+j-1
    input [SIZE*DATA_WIDTH-1:0] in_up,             // slot j feeds the i_up port of top-row PE (SIZE,j)
    input [SIZE*DATA_WIDTH-1:0] in_left,           // slot i feeds the i_left port of left-column PE (i,SIZE)
    output [SIZE*DATA_WIDTH-1:0] pass_down,        // slot k = o_down of bottom-row PE (1,k)
    output [SIZE*DATA_WIDTH-1:0] pass_right,       // slot k = o_right of right-column PE (k,1)
    output [SIZE*SIZE*DATA_WIDTH-1:0] out_matrix,  // slot (i-1)*SIZE+j = o_result of PE (i,j)
    output [SIZE*DATA_WIDTH-1:0] out_diagonal      // slot k = o_result of diagonal PE (k,k)
);
    genvar i,j,k;
    // o_down / o_right of every PE, indexed by the PE's own serialized index.
    wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_down;
    wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_right;
    generate
        for (i=SIZE; i>=1; i=i-1) begin
            for (j=SIZE; j>=1; j=j-1) begin
                if (i==SIZE && j==SIZE) begin // the upper-left PE: both inputs come from the module ports
                    single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2)
                    pe (clk, finish [(i-1)*SIZE+j-1],
                        in_up [j*DATA_WIDTH-1 -:DATA_WIDTH],
                        in_left [i*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]);
                end else if (i==SIZE && j!=SIZE) begin // PEs in the upper-most row: i_left comes from PE (i,j+1)
                    single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2)
                    pe (clk, finish [(i-1)*SIZE+j-1],
                        in_up [j*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]);
                end else if (i!=SIZE && j==SIZE) begin // PEs in the left-most column: i_up comes from PE (i+1,j)
                    single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2)
                    pe (clk, finish [(i-1)*SIZE+j-1],
                        inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        in_left [i*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]);
                end else begin // all other PEs: i_up from PE (i+1,j), i_left from PE (i,j+1)
                    single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2)
                    pe (clk, finish [(i-1)*SIZE+j-1],
                        inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH],
                        out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]);
                end end end
    endgenerate
    generate
        for (k=SIZE; k>=1; k=k-1) begin
            // Pass data downward to other PE arrays: o_down of row 1.
            assign pass_down [k*DATA_WIDTH-1 -:DATA_WIDTH]
                = inner_pass_down [((1-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH];
            // Pass data rightward to other PE arrays: o_right of column 1.
            assign pass_right [k*DATA_WIDTH-1 -:DATA_WIDTH]
                = inner_pass_right [((k-1)*SIZE+1)*DATA_WIDTH-1 -:DATA_WIDTH];
            // Output the results at the diagonal positions (k,k).
            assign out_diagonal [k*DATA_WIDTH-1 -:DATA_WIDTH]
                = out_matrix [((k-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH];
        end
    endgenerate
endmodule
-------------------------------------------------------------------------------- /basic_components/pe_array/test/output.txt: -------------------------------------------------------------------------------- 1 | single pe test 2 | Time: 0, clk: 0, partial_sum: 0 3 | Time: 0, clk: 1, partial_sum: 0, o_down: x, o_right: x, o_result 0 4 | 5 | Time: 1, clk: 0, partial_sum: 0, o_down: 0, o_right: 0, o_result 0 6 | Time: 1, clk: 1, partial_sum: 0, o_down: 0, o_right: 0, o_result 0 7 | 8 | Time: 2, clk: 0, partial_sum: 7, o_down: 100, o_right: 20, o_result 0 9 | Time: 2, clk: 1, partial_sum: 7, o_down: 100, o_right: 20, o_result 0 10 | 11 | Time: 3, clk: 0, partial_sum: 14, o_down: 100, o_right: 20, o_result 0 12 | Time: 3, clk: 1, partial_sum: 14, o_down: 100, o_right: 20, o_result 0 13 | 14 | Time: 4, clk: 0, partial_sum: 21, o_down: 100, o_right: 20, o_result 0 15 | Time: 4, clk: 1, partial_sum: 21, o_down: 100, o_right: 20, o_result 0 16 | 17 | Time: 5, clk: 0, partial_sum: 36, o_down: 100, o_right: 40, o_result 0 18 | Time: 5, clk: 1, partial_sum: 36, o_down: 100, o_right: 40, o_result 0 19 | 20 | Time: 6, clk: 0, partial_sum: 15, o_down: 100, o_right: 40, o_result 36 21 | Time: 6, clk: 1, partial_sum: 15, o_down: 100, o_right: 40, o_result 36 22 | 23 | Time: 7, clk: 0, partial_sum: 30, o_down: 100, o_right: 40, o_result 36 24 | Time: 7, clk: 1, partial_sum: 30, o_down: 100, o_right: 40, o_result 36 25 | single_PE_test.v:28: $finish called at 16 (1s) 26 | 27 | Time: 8, clk: 0, partial_sum: 45, o_down: 100, o_right: 40, o_result 36 28 | -------------------------------------------------------------------------------- /basic_components/pe_array/test/single_PE_test.v: 
--------------------------------------------------------------------------------
// single_PE_rounded: fixed-point multiply-accumulate processing element.
//
//   clk       clock; all registers update on the rising edge.
//   finish    1: latch partial_sum into o_result and restart the sum with x.
//             0: add x to partial_sum and hold o_result.
//   i_up      operand from above, forwarded to o_down one cycle later.
//   i_left    operand from the left, forwarded to o_right one cycle later.
//   o_result  last completed sum.
//
// x is the product i_up*i_left shifted right by Half_WIDTH and truncated to
// DATA_WIDTH bits. The product goes through a 2*DATA_WIDTH-bit wire first.
// Written as (i_up*i_left) >> Half_WIDTH directly into a DATA_WIDTH-bit net,
// Verilog sizes the multiplication at DATA_WIDTH bits and the upper product
// bits are dropped before the shift (256*256 >> 8 would give 0, not 256).
module single_PE_rounded #(
    parameter DATA_WIDTH = 8,
    parameter Half_WIDTH = 4
)(
    input clk,
    input finish,
    input[DATA_WIDTH-1 : 0] i_up,
    input[DATA_WIDTH-1 : 0] i_left,
    output reg[DATA_WIDTH-1 : 0] o_down,
    output reg[DATA_WIDTH-1 : 0] o_right,
    output reg[DATA_WIDTH-1 : 0] o_result = 0
);
    reg  [DATA_WIDTH-1 : 0]   partial_sum = 0;
    wire [2*DATA_WIDTH-1 : 0] product;
    wire [DATA_WIDTH-1 : 0]   x;
    assign product = i_up * i_left;    // sized by the wide left-hand side
    assign x = product >> Half_WIDTH;  // keeps the low DATA_WIDTH bits of the shifted product
    always @(posedge clk) begin
        o_down <= i_up;
        o_right <= i_left;
        o_result <= finish ? partial_sum : o_result;
        partial_sum <= finish ? x : (partial_sum + x);
    end
endmodule
/*
    What this testbench checks:
    1. the limited-precision multiply-accumulate
    2. whether the finish signal restarts the partial result
*/

// main: testbench for one single_PE_rounded instance (16-bit data, 8-bit shift).
module main #(parameter DATA_WIDTH = 16)();
    integer currTime = 0;
    reg clk = 0;
    reg finish = 0;
    reg [DATA_WIDTH-1 : 0] i_up = 0;
    reg [DATA_WIDTH-1 : 0] i_left = 0;
    // Declared without "= 0": a net initialiser is a continuous assignment and
    // would be a second driver fighting the PE outputs on these nets.
    wire [DATA_WIDTH-1 : 0] o_down;
    wire [DATA_WIDTH-1 : 0] o_right;
    wire [DATA_WIDTH-1 : 0] o_result;
    single_PE_rounded # (16,8) test (clk, finish, i_up, i_left, o_down, o_right, o_result);

    // Drive the inputs and print the initial state.
    initial begin
        # 0
        $display("single pe test");
        $display("Time: %d, clk: %d, partial_sum: %d", currTime, clk, test.partial_sum);
        # 2 i_up = 100; i_left = 20;
        # 6 i_up = 100; i_left = 40;
        # 2 finish = 1;
        # 2 finish = 0;
        # 4 $finish;
    end
    // Count clock periods and print a blank separator line between them.
    always #2 begin
        currTime = currTime + 1;
        $display(" ");
    end
    // Toggle the clock and print the PE state.
    // The format string has no specifier for o_result, so that argument is
    // printed in $display's default format after the text.
    always #1 begin
        clk = ~clk;
        $display("Time: %d, clk: %d, partial_sum: %d, o_down: %d, o_right: %d, o_result", currTime, clk, test.partial_sum, test.o_down, test.o_right, test.o_result);
    end

endmodule
--------------------------------------------------------------------------------
/basic_components/pe_array/test/test: -------------------------------------------------------------------------------- 1 | #! /usr/local/bin/vvp 2 | :ivl_version "13.0 (devel)" "(s20221226-516-g615a01c6c)"; 3 | :ivl_delay_selection "TYPICAL"; 4 | :vpi_time_precision + 0; 5 | :vpi_module "/usr/local/lib/ivl/system.vpi"; 6 | :vpi_module "/usr/local/lib/ivl/vhdl_sys.vpi"; 7 | :vpi_module "/usr/local/lib/ivl/vhdl_textio.vpi"; 8 | :vpi_module "/usr/local/lib/ivl/v2005_math.vpi"; 9 | :vpi_module "/usr/local/lib/ivl/va_math.vpi"; 10 | S_0x55f13a68dd40 .scope module, "main" "main" 2 33; 11 | .timescale 0 0; 12 | P_0x55f13a6cc6d0 .param/l "DATA_WIDTH" 0 2 33, +C4<00000000000000000000000000010000>; 13 | v0x55f13a6e6000_0 .var "clk", 0 0; 14 | v0x55f13a6e60c0_0 .var/i "currTime", 31 0; 15 | v0x55f13a6e6180_0 .var "finish", 0 0; 16 | v0x55f13a6e6280_0 .var "i_left", 15 0; 17 | v0x55f13a6e6350_0 .var "i_up", 15 0; 18 | L_0x7f20bf5c8018 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 19 | RS_0x7f20bf611168 .resolv tri, v0x55f13a6e5ac0_0, L_0x7f20bf5c8018; 20 | v0x55f13a6e63f0_0 .net8 "o_down", 15 0, RS_0x7f20bf611168; 2 drivers 21 | L_0x7f20bf5c80a8 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 22 | RS_0x7f20bf611198 .resolv tri, v0x55f13a6e5ba0_0, L_0x7f20bf5c80a8; 23 | v0x55f13a6e64c0_0 .net8 "o_result", 15 0, RS_0x7f20bf611198; 2 drivers 24 | L_0x7f20bf5c8060 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 25 | RS_0x7f20bf6111c8 .resolv tri, v0x55f13a6e5c80_0, L_0x7f20bf5c8060; 26 | v0x55f13a6e6590_0 .net8 "o_right", 15 0, RS_0x7f20bf6111c8; 2 drivers 27 | S_0x55f13a68df20 .scope module, "test" "single_PE_rounded" 2 42, 2 1 0, S_0x55f13a68dd40; 28 | .timescale 0 0; 29 | .port_info 0 /INPUT 1 "clk"; 30 | .port_info 1 /INPUT 1 "finish"; 31 | .port_info 2 /INPUT 16 "i_up"; 32 | .port_info 3 /INPUT 16 "i_left"; 33 | .port_info 4 /OUTPUT 16 "o_down"; 34 | .port_info 5 /OUTPUT 16 "o_right"; 35 | .port_info 6 /OUTPUT 16 "o_result"; 36 | 
P_0x55f13a6b30d0 .param/l "DATA_WIDTH" 0 2 2, +C4<00000000000000000000000000010000>; 37 | P_0x55f13a6b3110 .param/l "Half_WIDTH" 0 2 3, +C4<00000000000000000000000000001000>; 38 | v0x55f13a68e2e0_0 .net *"_ivl_1", 15 0, L_0x55f13a6f6780; 1 drivers 39 | v0x55f13a6b9a40_0 .net *"_ivl_4", 7 0, L_0x55f13a6f68c0; 1 drivers 40 | L_0x7f20bf5c80f0 .functor BUFT 1, C4<00000000>, C4<0>, C4<0>, C4<0>; 41 | v0x55f13a6e5640_0 .net *"_ivl_6", 7 0, L_0x7f20bf5c80f0; 1 drivers 42 | v0x55f13a6e5730_0 .net "clk", 0 0, v0x55f13a6e6000_0; 1 drivers 43 | v0x55f13a6e57f0_0 .net "finish", 0 0, v0x55f13a6e6180_0; 1 drivers 44 | v0x55f13a6e5900_0 .net "i_left", 15 0, v0x55f13a6e6280_0; 1 drivers 45 | v0x55f13a6e59e0_0 .net "i_up", 15 0, v0x55f13a6e6350_0; 1 drivers 46 | v0x55f13a6e5ac0_0 .var "o_down", 15 0; 47 | v0x55f13a6e5ba0_0 .var "o_result", 15 0; 48 | v0x55f13a6e5c80_0 .var "o_right", 15 0; 49 | v0x55f13a6e5d60_0 .var "partial_sum", 15 0; 50 | v0x55f13a6e5e40_0 .net "x", 15 0, L_0x55f13a6f69b0; 1 drivers 51 | E_0x55f13a6d0d30 .event posedge, v0x55f13a6e5730_0; 52 | L_0x55f13a6f6780 .arith/mult 16, v0x55f13a6e6350_0, v0x55f13a6e6280_0; 53 | L_0x55f13a6f68c0 .part L_0x55f13a6f6780, 8, 8; 54 | L_0x55f13a6f69b0 .concat [ 8 8 0 0], L_0x55f13a6f68c0, L_0x7f20bf5c80f0; 55 | .scope S_0x55f13a68df20; 56 | T_0 ; 57 | %pushi/vec4 0, 0, 16; 58 | %store/vec4 v0x55f13a6e5ba0_0, 0, 16; 59 | %pushi/vec4 0, 0, 16; 60 | %store/vec4 v0x55f13a6e5d60_0, 0, 16; 61 | %end; 62 | .thread T_0; 63 | .scope S_0x55f13a68df20; 64 | T_1 ; 65 | %wait E_0x55f13a6d0d30; 66 | %load/vec4 v0x55f13a6e59e0_0; 67 | %assign/vec4 v0x55f13a6e5ac0_0, 0; 68 | %load/vec4 v0x55f13a6e5900_0; 69 | %assign/vec4 v0x55f13a6e5c80_0, 0; 70 | %load/vec4 v0x55f13a6e57f0_0; 71 | %flag_set/vec4 8; 72 | %jmp/0xz T_1.0, 8; 73 | %load/vec4 v0x55f13a6e5d60_0; 74 | %assign/vec4 v0x55f13a6e5ba0_0, 0; 75 | %load/vec4 v0x55f13a6e5e40_0; 76 | %assign/vec4 v0x55f13a6e5d60_0, 0; 77 | %jmp T_1.1; 78 | T_1.0 ; 79 | %load/vec4 v0x55f13a6e5ba0_0; 80 | 
%assign/vec4 v0x55f13a6e5ba0_0, 0; 81 | %load/vec4 v0x55f13a6e5d60_0; 82 | %load/vec4 v0x55f13a6e5e40_0; 83 | %add; 84 | %assign/vec4 v0x55f13a6e5d60_0, 0; 85 | T_1.1 ; 86 | %jmp T_1; 87 | .thread T_1; 88 | .scope S_0x55f13a68dd40; 89 | T_2 ; 90 | %pushi/vec4 0, 0, 32; 91 | %store/vec4 v0x55f13a6e60c0_0, 0, 32; 92 | %pushi/vec4 0, 0, 1; 93 | %store/vec4 v0x55f13a6e6000_0, 0, 1; 94 | %pushi/vec4 0, 0, 1; 95 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 96 | %pushi/vec4 0, 0, 16; 97 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 98 | %pushi/vec4 0, 0, 16; 99 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 100 | %end; 101 | .thread T_2; 102 | .scope S_0x55f13a68dd40; 103 | T_3 ; 104 | %delay 0, 0; 105 | %vpi_call 2 47 "$display", "single pe test" {0 0 0}; 106 | %vpi_call 2 48 "$display", "Time: %d, clk: %d, partial_sum: %d", v0x55f13a6e60c0_0, v0x55f13a6e6000_0, v0x55f13a6e5d60_0 {0 0 0}; 107 | %delay 2, 0; 108 | %pushi/vec4 100, 0, 16; 109 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 110 | %pushi/vec4 20, 0, 16; 111 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 112 | %delay 6, 0; 113 | %pushi/vec4 100, 0, 16; 114 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 115 | %pushi/vec4 40, 0, 16; 116 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 117 | %delay 2, 0; 118 | %pushi/vec4 1, 0, 1; 119 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 120 | %delay 2, 0; 121 | %pushi/vec4 0, 0, 1; 122 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 123 | %delay 4, 0; 124 | %vpi_call 2 53 "$finish" {0 0 0}; 125 | %end; 126 | .thread T_3; 127 | .scope S_0x55f13a68dd40; 128 | T_4 ; 129 | %delay 2, 0; 130 | %load/vec4 v0x55f13a6e60c0_0; 131 | %addi 1, 0, 32; 132 | %store/vec4 v0x55f13a6e60c0_0, 0, 32; 133 | %vpi_call 2 58 "$display", " " {0 0 0}; 134 | %jmp T_4; 135 | .thread T_4; 136 | .scope S_0x55f13a68dd40; 137 | T_5 ; 138 | %delay 1, 0; 139 | %load/vec4 v0x55f13a6e6000_0; 140 | %inv; 141 | %store/vec4 v0x55f13a6e6000_0, 0, 1; 142 | %vpi_call 2 63 "$display", "Time: %d, clk: %d, partial_sum: %d, o_down: %d, o_right: %d, o_result", v0x55f13a6e60c0_0, 
v0x55f13a6e6000_0, v0x55f13a6e5d60_0, v0x55f13a6e5ac0_0, v0x55f13a6e5c80_0, v0x55f13a6e5ba0_0 {0 0 0}; 143 | %jmp T_5; 144 | .thread T_5; 145 | # The file index is used to find the file name in the following table. 146 | :file_names 3; 147 | "N/A"; 148 | ""; 149 | "single_PE_test.v"; 150 | -------------------------------------------------------------------------------- /basic_components/simple_pe/single_PE.v: --------------------------------------------------------------------------------
// single_PE: full-precision multiply-accumulate processing element.
//
//   clk       clock; all registers update on the rising edge.
//   finish    1: latch partial_sum into o_result and restart the sum with x.
//             0: add x to partial_sum; o_result keeps its value.
//   i_up      operand from above, forwarded to o_down one cycle later.
//   i_left    operand from the left, forwarded to o_right one cycle later.
//   o_result  last completed sum, 2*DATA_WIDTH bits wide.
//
// Half_WIDTH is not used by this module; it is kept so the parameter list
// matches single_PE_rounded.
module single_PE #(
    parameter DATA_WIDTH = 8,
    parameter Half_WIDTH = 4
)(
    input clk,
    input finish,
    input[DATA_WIDTH-1 : 0] i_up,
    input[DATA_WIDTH-1 : 0] i_left,
    output reg[DATA_WIDTH-1 : 0] o_down,
    output reg[DATA_WIDTH-1 : 0] o_right,
    output reg[2*DATA_WIDTH-1 : 0] o_result = 0
);
    reg [2*DATA_WIDTH-1 : 0] partial_sum = 0;
    // x is as wide as the accumulator. A DATA_WIDTH-bit x makes Verilog size
    // the multiplication at DATA_WIDTH bits, which drops the upper half of
    // every product before it is accumulated.
    wire [2*DATA_WIDTH-1 : 0] x;
    assign x = i_up*i_left;
    always @(posedge clk) begin
        o_down <= i_up;
        o_right <= i_left;
        if(finish) begin
            o_result <= partial_sum;
            partial_sum <= x;
        end else begin
            // o_result is not assigned here, so it holds its value.
            partial_sum <= partial_sum + x;
        end
    end
endmodule

// single_PE_rounded: fixed-point variant with DATA_WIDTH-bit accumulator and
// result. Ports behave as in single_PE.
//
// x is the product i_up*i_left shifted right by Half_WIDTH and truncated to
// DATA_WIDTH bits. The product goes through a 2*DATA_WIDTH-bit wire first so
// the upper product bits are still present when the shift is applied;
// (i_up*i_left) >> Half_WIDTH assigned straight to a DATA_WIDTH-bit net is
// evaluated at DATA_WIDTH bits and loses them (256*256 >> 8 would give 0).
module single_PE_rounded #(
    parameter DATA_WIDTH = 8,
    parameter Half_WIDTH = 4
)(
    input clk,
    input finish,
    input[DATA_WIDTH-1 : 0] i_up,
    input[DATA_WIDTH-1 : 0] i_left,
    output reg[DATA_WIDTH-1 : 0] o_down,
    output reg[DATA_WIDTH-1 : 0] o_right,
    output reg[DATA_WIDTH-1 : 0] o_result = 0
);
    reg  [DATA_WIDTH-1 : 0]   partial_sum = 0;
    wire [2*DATA_WIDTH-1 : 0] product;
    wire [DATA_WIDTH-1 : 0]   x;
    assign product = i_up * i_left;    // sized by the wide left-hand side
    assign x = product >> Half_WIDTH;  // keeps the low DATA_WIDTH bits of the shifted product
    always @(posedge clk) begin
        o_down <= i_up;
        o_right <= i_left;
        o_result <= finish ? partial_sum : o_result;
        partial_sum <= finish ? x : (partial_sum + x);
    end
endmodule
-------------------------------------------------------------------------------- /basic_components/simple_pe/test/output.txt: -------------------------------------------------------------------------------- 1 | single pe test 2 | Time: 0, clk: 0, partial_sum: 0 3 | Time: 0, clk: 1, partial_sum: 0, o_down: x, o_right: x, o_result 0 4 | 5 | Time: 1, clk: 0, partial_sum: 0, o_down: 0, o_right: 0, o_result 0 6 | Time: 1, clk: 1, partial_sum: 0, o_down: 0, o_right: 0, o_result 0 7 | 8 | Time: 2, clk: 0, partial_sum: 7, o_down: 100, o_right: 20, o_result 0 9 | Time: 2, clk: 1, partial_sum: 7, o_down: 100, o_right: 20, o_result 0 10 | 11 | Time: 3, clk: 0, partial_sum: 14, o_down: 100, o_right: 20, o_result 0 12 | Time: 3, clk: 1, partial_sum: 14, o_down: 100, o_right: 20, o_result 0 13 | 14 | Time: 4, clk: 0, partial_sum: 21, o_down: 100, o_right: 20, o_result 0 15 | Time: 4, clk: 1, partial_sum: 21, o_down: 100, o_right: 20, o_result 0 16 | 17 | Time: 5, clk: 0, partial_sum: 36, o_down: 100, o_right: 40, o_result 0 18 | Time: 5, clk: 1, partial_sum: 36, o_down: 100, o_right: 40, o_result 0 19 | 20 | Time: 6, clk: 0, partial_sum: 15, o_down: 100, o_right: 40, o_result 36 21 | Time: 6, clk: 1, partial_sum: 15, o_down: 100, o_right: 40, o_result 36 22 | 23 | Time: 7, clk: 0, partial_sum: 30, o_down: 100, o_right: 40, o_result 36 24 | Time: 7, clk: 1, partial_sum: 30, o_down: 100, o_right: 40, o_result 36 25 | single_PE_test.v:28: $finish called at 16 (1s) 26 | 27 | Time: 8, clk: 0, partial_sum: 45, o_down: 100, o_right: 40, o_result 36 28 | -------------------------------------------------------------------------------- /basic_components/simple_pe/test/single_PE_test.v: -------------------------------------------------------------------------------- 1 | module single_PE_rounded #( 2 | parameter DATA_WIDTH = 8, 3 | parameter Half_WIDTH = 4 4 | )( 5 | input clk, 6 | input finish, 7 | input[DATA_WIDTH-1 : 0] i_up, 8 |
input[DATA_WIDTH-1 : 0] i_left, 9 | output reg[DATA_WIDTH-1 : 0] o_down, 10 | output reg[DATA_WIDTH-1 : 0] o_right, 11 | output reg[DATA_WIDTH-1 : 0] o_result = 0 12 | ); 13 | reg [DATA_WIDTH-1 : 0] partial_sum = 0; 14 | wire [DATA_WIDTH-1 : 0] x; 15 | assign x = (i_up*i_left) >> Half_WIDTH; 16 | always @(posedge clk) begin 17 | o_down <= i_up; 18 | o_right <= i_left; 19 | o_result <= finish ? partial_sum : o_result; 20 | partial_sum <= finish ? x : (partial_sum + x); 21 | end 22 | endmodule 23 | /* 24 | 1. 限制精度的乘累加计算 25 | 2. finish信号能否重置partial result 26 | */ 27 | 28 | module main #(parameter DATA_WIDTH = 16)(); 29 | integer currTime = 0; 30 | reg clk = 0; 31 | reg finish = 0; 32 | reg [DATA_WIDTH-1 : 0] i_up = 0; 33 | reg [DATA_WIDTH-1 : 0] i_left = 0; 34 | wire [DATA_WIDTH-1 : 0] o_down = 0; 35 | wire [DATA_WIDTH-1 : 0] o_right = 0; 36 | wire [DATA_WIDTH-1 : 0] o_result = 0; 37 | single_PE_rounded # (16,8) test (clk, finish, i_up, i_left, o_down, o_right, o_result); 38 | 39 | // 修改input、显示初态 40 | initial begin 41 | # 0 42 | $display("single pe test"); 43 | $display("Time: %d, clk: %d, partial_sum: %d", currTime, clk, test.partial_sum); 44 | # 2 i_up = 100; i_left = 20; 45 | # 6 i_up = 100; i_left = 40; 46 | # 2 finish = 1; 47 | # 2 finish = 0; 48 | # 4 $finish; 49 | end 50 | // 计时并间隔输出 51 | always #2 begin 52 | currTime = currTime + 1; 53 | $display(" "); 54 | end 55 | // output 56 | always #1 begin 57 | clk = ~clk; 58 | $display("Time: %d, clk: %d, partial_sum: %d, o_down: %d, o_right: %d, o_result", currTime, clk, test.partial_sum, test.o_down, test.o_right, test.o_result); 59 | end 60 | 61 | endmodule -------------------------------------------------------------------------------- /basic_components/simple_pe/test/test: -------------------------------------------------------------------------------- 1 | #! 
/usr/local/bin/vvp 2 | :ivl_version "13.0 (devel)" "(s20221226-516-g615a01c6c)"; 3 | :ivl_delay_selection "TYPICAL"; 4 | :vpi_time_precision + 0; 5 | :vpi_module "/usr/local/lib/ivl/system.vpi"; 6 | :vpi_module "/usr/local/lib/ivl/vhdl_sys.vpi"; 7 | :vpi_module "/usr/local/lib/ivl/vhdl_textio.vpi"; 8 | :vpi_module "/usr/local/lib/ivl/v2005_math.vpi"; 9 | :vpi_module "/usr/local/lib/ivl/va_math.vpi"; 10 | S_0x55f13a68dd40 .scope module, "main" "main" 2 33; 11 | .timescale 0 0; 12 | P_0x55f13a6cc6d0 .param/l "DATA_WIDTH" 0 2 33, +C4<00000000000000000000000000010000>; 13 | v0x55f13a6e6000_0 .var "clk", 0 0; 14 | v0x55f13a6e60c0_0 .var/i "currTime", 31 0; 15 | v0x55f13a6e6180_0 .var "finish", 0 0; 16 | v0x55f13a6e6280_0 .var "i_left", 15 0; 17 | v0x55f13a6e6350_0 .var "i_up", 15 0; 18 | L_0x7f20bf5c8018 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 19 | RS_0x7f20bf611168 .resolv tri, v0x55f13a6e5ac0_0, L_0x7f20bf5c8018; 20 | v0x55f13a6e63f0_0 .net8 "o_down", 15 0, RS_0x7f20bf611168; 2 drivers 21 | L_0x7f20bf5c80a8 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 22 | RS_0x7f20bf611198 .resolv tri, v0x55f13a6e5ba0_0, L_0x7f20bf5c80a8; 23 | v0x55f13a6e64c0_0 .net8 "o_result", 15 0, RS_0x7f20bf611198; 2 drivers 24 | L_0x7f20bf5c8060 .functor BUFT 1, C4<0000000000000000>, C4<0>, C4<0>, C4<0>; 25 | RS_0x7f20bf6111c8 .resolv tri, v0x55f13a6e5c80_0, L_0x7f20bf5c8060; 26 | v0x55f13a6e6590_0 .net8 "o_right", 15 0, RS_0x7f20bf6111c8; 2 drivers 27 | S_0x55f13a68df20 .scope module, "test" "single_PE_rounded" 2 42, 2 1 0, S_0x55f13a68dd40; 28 | .timescale 0 0; 29 | .port_info 0 /INPUT 1 "clk"; 30 | .port_info 1 /INPUT 1 "finish"; 31 | .port_info 2 /INPUT 16 "i_up"; 32 | .port_info 3 /INPUT 16 "i_left"; 33 | .port_info 4 /OUTPUT 16 "o_down"; 34 | .port_info 5 /OUTPUT 16 "o_right"; 35 | .port_info 6 /OUTPUT 16 "o_result"; 36 | P_0x55f13a6b30d0 .param/l "DATA_WIDTH" 0 2 2, +C4<00000000000000000000000000010000>; 37 | P_0x55f13a6b3110 .param/l "Half_WIDTH" 0 2 
3, +C4<00000000000000000000000000001000>; 38 | v0x55f13a68e2e0_0 .net *"_ivl_1", 15 0, L_0x55f13a6f6780; 1 drivers 39 | v0x55f13a6b9a40_0 .net *"_ivl_4", 7 0, L_0x55f13a6f68c0; 1 drivers 40 | L_0x7f20bf5c80f0 .functor BUFT 1, C4<00000000>, C4<0>, C4<0>, C4<0>; 41 | v0x55f13a6e5640_0 .net *"_ivl_6", 7 0, L_0x7f20bf5c80f0; 1 drivers 42 | v0x55f13a6e5730_0 .net "clk", 0 0, v0x55f13a6e6000_0; 1 drivers 43 | v0x55f13a6e57f0_0 .net "finish", 0 0, v0x55f13a6e6180_0; 1 drivers 44 | v0x55f13a6e5900_0 .net "i_left", 15 0, v0x55f13a6e6280_0; 1 drivers 45 | v0x55f13a6e59e0_0 .net "i_up", 15 0, v0x55f13a6e6350_0; 1 drivers 46 | v0x55f13a6e5ac0_0 .var "o_down", 15 0; 47 | v0x55f13a6e5ba0_0 .var "o_result", 15 0; 48 | v0x55f13a6e5c80_0 .var "o_right", 15 0; 49 | v0x55f13a6e5d60_0 .var "partial_sum", 15 0; 50 | v0x55f13a6e5e40_0 .net "x", 15 0, L_0x55f13a6f69b0; 1 drivers 51 | E_0x55f13a6d0d30 .event posedge, v0x55f13a6e5730_0; 52 | L_0x55f13a6f6780 .arith/mult 16, v0x55f13a6e6350_0, v0x55f13a6e6280_0; 53 | L_0x55f13a6f68c0 .part L_0x55f13a6f6780, 8, 8; 54 | L_0x55f13a6f69b0 .concat [ 8 8 0 0], L_0x55f13a6f68c0, L_0x7f20bf5c80f0; 55 | .scope S_0x55f13a68df20; 56 | T_0 ; 57 | %pushi/vec4 0, 0, 16; 58 | %store/vec4 v0x55f13a6e5ba0_0, 0, 16; 59 | %pushi/vec4 0, 0, 16; 60 | %store/vec4 v0x55f13a6e5d60_0, 0, 16; 61 | %end; 62 | .thread T_0; 63 | .scope S_0x55f13a68df20; 64 | T_1 ; 65 | %wait E_0x55f13a6d0d30; 66 | %load/vec4 v0x55f13a6e59e0_0; 67 | %assign/vec4 v0x55f13a6e5ac0_0, 0; 68 | %load/vec4 v0x55f13a6e5900_0; 69 | %assign/vec4 v0x55f13a6e5c80_0, 0; 70 | %load/vec4 v0x55f13a6e57f0_0; 71 | %flag_set/vec4 8; 72 | %jmp/0xz T_1.0, 8; 73 | %load/vec4 v0x55f13a6e5d60_0; 74 | %assign/vec4 v0x55f13a6e5ba0_0, 0; 75 | %load/vec4 v0x55f13a6e5e40_0; 76 | %assign/vec4 v0x55f13a6e5d60_0, 0; 77 | %jmp T_1.1; 78 | T_1.0 ; 79 | %load/vec4 v0x55f13a6e5ba0_0; 80 | %assign/vec4 v0x55f13a6e5ba0_0, 0; 81 | %load/vec4 v0x55f13a6e5d60_0; 82 | %load/vec4 v0x55f13a6e5e40_0; 83 | %add; 84 | %assign/vec4 
v0x55f13a6e5d60_0, 0; 85 | T_1.1 ; 86 | %jmp T_1; 87 | .thread T_1; 88 | .scope S_0x55f13a68dd40; 89 | T_2 ; 90 | %pushi/vec4 0, 0, 32; 91 | %store/vec4 v0x55f13a6e60c0_0, 0, 32; 92 | %pushi/vec4 0, 0, 1; 93 | %store/vec4 v0x55f13a6e6000_0, 0, 1; 94 | %pushi/vec4 0, 0, 1; 95 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 96 | %pushi/vec4 0, 0, 16; 97 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 98 | %pushi/vec4 0, 0, 16; 99 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 100 | %end; 101 | .thread T_2; 102 | .scope S_0x55f13a68dd40; 103 | T_3 ; 104 | %delay 0, 0; 105 | %vpi_call 2 47 "$display", "single pe test" {0 0 0}; 106 | %vpi_call 2 48 "$display", "Time: %d, clk: %d, partial_sum: %d", v0x55f13a6e60c0_0, v0x55f13a6e6000_0, v0x55f13a6e5d60_0 {0 0 0}; 107 | %delay 2, 0; 108 | %pushi/vec4 100, 0, 16; 109 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 110 | %pushi/vec4 20, 0, 16; 111 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 112 | %delay 6, 0; 113 | %pushi/vec4 100, 0, 16; 114 | %store/vec4 v0x55f13a6e6350_0, 0, 16; 115 | %pushi/vec4 40, 0, 16; 116 | %store/vec4 v0x55f13a6e6280_0, 0, 16; 117 | %delay 2, 0; 118 | %pushi/vec4 1, 0, 1; 119 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 120 | %delay 2, 0; 121 | %pushi/vec4 0, 0, 1; 122 | %store/vec4 v0x55f13a6e6180_0, 0, 1; 123 | %delay 4, 0; 124 | %vpi_call 2 53 "$finish" {0 0 0}; 125 | %end; 126 | .thread T_3; 127 | .scope S_0x55f13a68dd40; 128 | T_4 ; 129 | %delay 2, 0; 130 | %load/vec4 v0x55f13a6e60c0_0; 131 | %addi 1, 0, 32; 132 | %store/vec4 v0x55f13a6e60c0_0, 0, 32; 133 | %vpi_call 2 58 "$display", " " {0 0 0}; 134 | %jmp T_4; 135 | .thread T_4; 136 | .scope S_0x55f13a68dd40; 137 | T_5 ; 138 | %delay 1, 0; 139 | %load/vec4 v0x55f13a6e6000_0; 140 | %inv; 141 | %store/vec4 v0x55f13a6e6000_0, 0, 1; 142 | %vpi_call 2 63 "$display", "Time: %d, clk: %d, partial_sum: %d, o_down: %d, o_right: %d, o_result", v0x55f13a6e60c0_0, v0x55f13a6e6000_0, v0x55f13a6e5d60_0, v0x55f13a6e5ac0_0, v0x55f13a6e5c80_0, v0x55f13a6e5ba0_0 {0 0 0}; 143 | %jmp T_5; 144 | .thread 
// Sparse multiply-accumulate processing element (PE) for a systolic array.
//
// Every rising clock edge the PE forwards its up/left data words and their
// indices to the down/right neighbours. While neither data input is zero it
// compares the two indices:
//   * equal index        -> accumulate (i_up * i_left) >> Half_WIDTH
//   * index_left smaller -> remember the up operand in `cache`
//   * index_up smaller   -> remember the left operand in `cache`
// An operand remembered earlier (visible through `cache_use`) is multiplied
// with the matching input arriving from the other direction when its stored
// index equals that input's index.
//
// Parameters:
//   DATA_WIDTH - width of data words and of the accumulator
//   Half_WIDTH - right shift applied to every product before accumulation
//   INDEX_SIZE - width of the element index that accompanies each data word
//
// NOTE(review): o_result is only ever assigned its own value in this module,
// so it stays at its initial 0; presumably it should be loaded from
// partial_sum once `finished` is asserted -- confirm the intended protocol.
// NOTE(review): products are evaluated in DATA_WIDTH bits (Verilog expression
// sizing), so the upper product bits are dropped before the shift -- confirm
// whether a 2*DATA_WIDTH intermediate product was intended.
module sparse_PE_rounded #(
    parameter DATA_WIDTH = 8,
    parameter Half_WIDTH = 4,
    parameter INDEX_SIZE = 3 // 8*8 PE
)(
    input clk,
    input[DATA_WIDTH-1 : 0] i_up,
    input[DATA_WIDTH-1 : 0] i_left,
    input [INDEX_SIZE-1 : 0] index_up,
    input [INDEX_SIZE-1 : 0] index_left,
    output reg [DATA_WIDTH-1 : 0] o_down,
    output reg [DATA_WIDTH-1 : 0] o_right,
    output reg [INDEX_SIZE-1 : 0] index_down,
    output reg [INDEX_SIZE-1 : 0] index_right,
    output reg[DATA_WIDTH-1 : 0] o_result = 0,
    output finished // a plain PE receives the finish signal as an input; the sparse PE derives it itself and outputs it
);
    // Position of the direction flag inside the cache word.
    localparam DIR_BIT = INDEX_SIZE + DATA_WIDTH;

    reg [DATA_WIDTH-1 : 0] partial_sum = 0;
    // Cache word layout: {direction (0 = from up, 1 = from left),
    //                     element index (INDEX_SIZE bits),
    //                     data (DATA_WIDTH bits)}
    reg [DIR_BIT : 0] cache = 0;
    // One-cycle-delayed copy of `cache`; this is the copy the compute logic reads.
    reg [DIR_BIT : 0] cache_use = 0;
    wire [DATA_WIDTH-1 : 0] delta;

    // Named views of the fields of cache_use.
    wire                      cached_dir   = cache_use[DIR_BIT];
    wire [INDEX_SIZE-1 : 0]   cached_index = cache_use[DIR_BIT-1 -: INDEX_SIZE];
    wire [DATA_WIDTH-1 : 0]   cached_data  = cache_use[DATA_WIDTH-1 : 0];

    // Transfer: unconditionally pass data and indices on every rising edge.
    always @(posedge clk) begin
        cache_use   <= cache;
        o_down      <= i_up;
        o_right     <= i_left;
        index_down  <= index_up;
        index_right <= index_left;
    end

    // Multiply-accumulate increment for the equal-index case.
    assign delta = (i_up*i_left) >> Half_WIDTH;
    // The PE reports "finished" as soon as either data input is zero.
    assign finished = (i_up==0) | (i_left==0);

    // Compute: only active while not finished; otherwise all state holds.
    // All registers are written with non-blocking assignments so that the
    // `cache_use <= cache` sample in the transfer block always sees the value
    // from before this clock edge (the original mixed `=` and `<=` on `cache`,
    // which made that sample order-dependent in simulation).
    always @(posedge clk) begin
        if (finished == 0) begin
            // Case 1: computation still running, result register is held.
            o_result <= o_result;
            if (index_left == index_up) begin
                // Case 1.1: indices match, accumulate directly and clear the cache.
                partial_sum <= partial_sum + delta;
                cache <= 0;
            end
            else if (index_left < index_up) begin
                // Case 1.2: up index is larger; drop the left input and
                // remember the up input.
                cache <= {1'b0, index_up, i_up};
                if (cached_dir == 1'b0) begin
                    if (cached_index == index_left) begin
                        // Parenthesised so only the product is shifted.
                        // `+` binds tighter than `>>`, so the original
                        // expression shifted (partial_sum + product).
                        partial_sum <= partial_sum + ((cached_data * i_left) >> Half_WIDTH);
                    end
                end else begin
                    // todo:
                    partial_sum <= partial_sum;
                end
            end
            else begin
                // Case 1.3: left index is larger; drop the up input and
                // remember the left input.
                cache <= {1'b1, index_left, i_left};
                if (cached_dir == 1'b1) begin
                    if (cached_index == index_up) begin
                        partial_sum <= partial_sum + ((cached_data * i_up) >> Half_WIDTH);
                    end
                end else begin
                    // todo:
                    partial_sum <= partial_sum;
                end
            end
        end
    end
endmodule

// Placeholder for the tile controller: the ports are declared but the module
// contains no logic yet, so out_signal is undriven.
module tile_controller_4_4 #(
    parameter TILE_CNT = 4 // the whole array is split into TILE_CNT * TILE_CNT tiles
) (
    input [4:0] in_signal,
    output [TILE_CNT*TILE_CNT-1:0] out_signal
);

endmodule
`include "/home/toko/onesa/code/wrapper.v"
// Testbench for finish_decider configured as an 8x8 array (parameters 8, 4).
// Prints the 64-bit `signal` register of the instance as an 8x8 grid of bits
// every 2 time units and stops the simulation at time 10.
module test #(parameter SIZE = 8) ();
    integer currTime = 0;   // number of grid dumps printed so far
    reg clk = 0;
    reg tile = 0;
    wire [63 : 0]out;
    integer row;            // grid row currently being printed

    finish_decider #(8,4) test(clk, tile, out);

    // Drive the input and print the initial state.
    initial begin
        # 0 tile = 1;
        $display("single pe test");
        $display("Time: %d, clk: %d", currTime, clk);

        #10 $finish;
    end

    // Periodic dump: one line per row of 8 bits, followed by an empty line.
    // This process is kept ahead of the clock generator so the ordering of
    // the two processes in the source is unchanged.
    always #2 begin
        currTime = currTime + 1;
        for (row = 0; row < 8; row = row + 1) begin
            $display("Time: %d, clk: %d, _ %d %d %d %d %d %d %d %d _", currTime, clk,
                     test.signal[row*8 + 0], test.signal[row*8 + 1],
                     test.signal[row*8 + 2], test.signal[row*8 + 3],
                     test.signal[row*8 + 4], test.signal[row*8 + 5],
                     test.signal[row*8 + 6], test.signal[row*8 + 7]);
        end
        $display("");
    end

    // Free-running clock with a period of 2 time units.
    always #1 begin
        clk = ~clk;
    end

endmodule
// used 4 times

// Parallel-in / serial-out word shifter.
//
//   write_enable = 1, read_enable = 0 : load all LENGTH words from `in`
//   write_enable = 0, read_enable = 1 : move every word one slot towards the
//                                       top; the lowest word keeps its value
//   any other combination             : hold
//
// `out` always shows the top word (bits [LENGTH*BIT_WIDTH-1 -: BIT_WIDTH]).
//
// Parameters:
//   LENGTH    - number of words held in the register
//   BIT_WIDTH - width of one word
module serialize_ #(
    parameter LENGTH = 16,
    parameter BIT_WIDTH = 16
) (
    input clk,
    input write_enable,
    input read_enable,
    input [LENGTH*BIT_WIDTH -1 : 0] in,
    output [BIT_WIDTH-1 : 0] out
);
    // Named shift_reg rather than `local`: `local` is a reserved word in
    // SystemVerilog, so the old name breaks tools that parse .v files as SV.
    reg [BIT_WIDTH*LENGTH-1 : 0] shift_reg = 0;
    integer i;

    always @(posedge clk) begin
        if (write_enable == 1 && read_enable == 0) begin
            shift_reg <= in;
        end else if (write_enable == 0 && read_enable == 1) begin
            // Word i takes the value of word i-1; word 1 is not reassigned.
            for (i = LENGTH; i >= 2; i = i-1) begin
                shift_reg[i*BIT_WIDTH-1 -: BIT_WIDTH]
                    <= shift_reg[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH];
            end
        end
        // Otherwise the register simply holds its value.
    end

    assign out = shift_reg[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH];
endmodule
8 of this stuff is used for top 2 | 3 | 4 | | Device : 7vx485tffg1158-2L 5 | | Design State : Synthesized 6 | ----------------------------------------------------------------------------------------------------------------- 7 | 8 | Utilization Design Information 9 | 10 | Table of Contents 11 | ----------------- 12 | 1. Slice Logic 13 | 1.1 Summary of Registers by Type 14 | 2. Memory 15 | 3. DSP 16 | 4. IO and GT Specific 17 | 5. Clocking 18 | 6. Specific Feature 19 | 7. Primitives 20 | 8. Black Boxes 21 | 9. Instantiated Netlists 22 | 23 | 1. Slice Logic 24 | -------------- 25 | 26 | +-------------------------+------+-------+-----------+-------+ 27 | | Site Type | Used | Fixed | Available | Util% | 28 | +-------------------------+------+-------+-----------+-------+ 29 | | Slice LUTs* | 0 | 0 | 303600 | 0.00 | 30 | | LUT as Logic | 0 | 0 | 303600 | 0.00 | 31 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 32 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 33 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 34 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 35 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 36 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 37 | +-------------------------+------+-------+-----------+-------+ 38 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
39 | 40 | 41 | 1.1 Summary of Registers by Type 42 | -------------------------------- 43 | 44 | +-------+--------------+-------------+--------------+ 45 | | Total | Clock Enable | Synchronous | Asynchronous | 46 | +-------+--------------+-------------+--------------+ 47 | | 0 | _ | - | - | 48 | | 0 | _ | - | Set | 49 | | 0 | _ | - | Reset | 50 | | 0 | _ | Set | - | 51 | | 0 | _ | Reset | - | 52 | | 0 | Yes | - | - | 53 | | 0 | Yes | - | Set | 54 | | 0 | Yes | - | Reset | 55 | | 0 | Yes | Set | - | 56 | | 512 | Yes | Reset | - | 57 | +-------+--------------+-------------+--------------+ 58 | 59 | 60 | 2. Memory 61 | --------- 62 | 63 | +----------------+------+-------+-----------+-------+ 64 | | Site Type | Used | Fixed | Available | Util% | 65 | +----------------+------+-------+-----------+-------+ 66 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 67 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 68 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 69 | +----------------+------+-------+-----------+-------+ 70 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 71 | 72 | 73 | 3. DSP 74 | ------ 75 | 76 | +-----------+------+-------+-----------+-------+ 77 | | Site Type | Used | Fixed | Available | Util% | 78 | +-----------+------+-------+-----------+-------+ 79 | | DSPs | 0 | 0 | 2800 | 0.00 | 80 | +-----------+------+-------+-----------+-------+ 81 | 82 | 83 | 4. 
IO and GT Specific 84 | --------------------- 85 | 86 | +-----------------------------+------+-------+-----------+--------+ 87 | | Site Type | Used | Fixed | Available | Util% | 88 | +-----------------------------+------+-------+-----------+--------+ 89 | | Bonded IOB | 578 | 0 | 350 | 165.14 | 90 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 91 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 92 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 93 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 94 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 96 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 97 | | IBUFDS | 0 | 0 | 336 | 0.00 | 98 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 99 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 100 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 101 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 102 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 104 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 105 | | ILOGIC | 0 | 0 | 350 | 0.00 | 106 | | OLOGIC | 0 | 0 | 350 | 0.00 | 107 | +-----------------------------+------+-------+-----------+--------+ 108 | 109 | 110 | 5. Clocking 111 | ----------- 112 | 113 | +------------+------+-------+-----------+-------+ 114 | | Site Type | Used | Fixed | Available | Util% | 115 | +------------+------+-------+-----------+-------+ 116 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 117 | | BUFIO | 0 | 0 | 56 | 0.00 | 118 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 119 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 120 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 121 | | BUFHCE | 0 | 0 | 168 | 0.00 | 122 | | BUFR | 0 | 0 | 56 | 0.00 | 123 | +------------+------+-------+-----------+-------+ 124 | 125 | 126 | 6. 
Specific Feature 127 | ------------------- 128 | 129 | +-------------+------+-------+-----------+-------+ 130 | | Site Type | Used | Fixed | Available | Util% | 131 | +-------------+------+-------+-----------+-------+ 132 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 133 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 134 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 135 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 136 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 137 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 138 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 139 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 140 | | XADC | 0 | 0 | 1 | 0.00 | 141 | +-------------+------+-------+-----------+-------+ 142 | 143 | 144 | 7. Primitives 145 | ------------- 146 | 147 | +----------+------+---------------------+ 148 | | Ref Name | Used | Functional Category | 149 | +----------+------+---------------------+ 150 | | OBUF | 512 | IO | 151 | | FDRE | 512 | Flop & Latch | 152 | | IBUF | 66 | IO | 153 | | BUFG | 1 | Clock | 154 | +----------+------+---------------------+ 155 | 156 | 157 | 8. Black Boxes 158 | -------------- 159 | 160 | +----------+------+ 161 | | Ref Name | Used | 162 | +----------+------+ 163 | 164 | 165 | 9. Instantiated Netlists 166 | ------------------------ 167 | 168 | +----------+------+ 169 | | Ref Name | Used | 170 | +----------+------+ 171 | 172 | 173 | -------------------------------------------------------------------------------- /results/SA_32_32_16_16_16/util/output_matrix.txt: -------------------------------------------------------------------------------- 1 | // used 32+2 times 2 | 3 | | Device : 7vx485tffg1158-2L 4 | | Design State : Synthesized 5 | ------------------------------------------------------------------------------------------------------------- 6 | 7 | Utilization Design Information 8 | 9 | Table of Contents 10 | ----------------- 11 | 1. Slice Logic 12 | 1.1 Summary of Registers by Type 13 | 2. Memory 14 | 3. DSP 15 | 4. IO and GT Specific 16 | 5. Clocking 17 | 6. Specific Feature 18 | 7. Primitives 19 | 8. 
Black Boxes 20 | 9. Instantiated Netlists 21 | 22 | 1. Slice Logic 23 | -------------- 24 | 25 | +-------------------------+------+-------+-----------+-------+ 26 | | Site Type | Used | Fixed | Available | Util% | 27 | +-------------------------+------+-------+-----------+-------+ 28 | | Slice LUTs* | 498 | 0 | 303600 | 0.16 | 29 | | LUT as Logic | 498 | 0 | 303600 | 0.16 | 30 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 31 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 32 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +-------------------------+------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 512 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. 
Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +-----------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +-----------+------+-------+-----------+-------+ 78 | | DSPs | 0 | 0 | 2800 | 0.00 | 79 | +-----------+------+-------+-----------+-------+ 80 | 81 | 82 | 4. IO and GT Specific 83 | --------------------- 84 | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Site Type | Used | Fixed | Available | Util% | 87 | +-----------------------------+------+-------+-----------+--------+ 88 | | Bonded IOB | 531 | 0 | 350 | 151.71 | 89 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 90 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 91 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 92 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 93 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 94 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 96 | | IBUFDS | 0 | 0 | 336 | 0.00 | 97 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 98 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 99 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 100 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 101 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 102 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 104 | | ILOGIC | 0 | 0 | 350 | 0.00 | 105 | | OLOGIC | 0 | 0 | 350 | 0.00 | 106 | 
+-----------------------------+------+-------+-----------+--------+ 107 | 108 | 109 | 5. Clocking 110 | ----------- 111 | 112 | +------------+------+-------+-----------+-------+ 113 | | Site Type | Used | Fixed | Available | Util% | 114 | +------------+------+-------+-----------+-------+ 115 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 116 | | BUFIO | 0 | 0 | 56 | 0.00 | 117 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 118 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 119 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 120 | | BUFHCE | 0 | 0 | 168 | 0.00 | 121 | | BUFR | 0 | 0 | 56 | 0.00 | 122 | +------------+------+-------+-----------+-------+ 123 | 124 | 125 | 6. Specific Feature 126 | ------------------- 127 | 128 | +-------------+------+-------+-----------+-------+ 129 | | Site Type | Used | Fixed | Available | Util% | 130 | +-------------+------+-------+-----------+-------+ 131 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 132 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 133 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 134 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 135 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 136 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 137 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 138 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 139 | | XADC | 0 | 0 | 1 | 0.00 | 140 | +-------------+------+-------+-----------+-------+ 141 | 142 | 143 | 7. Primitives 144 | ------------- 145 | 146 | +----------+------+---------------------+ 147 | | Ref Name | Used | Functional Category | 148 | +----------+------+---------------------+ 149 | | IBUF | 515 | IO | 150 | | FDRE | 512 | Flop & Latch | 151 | | LUT4 | 496 | LUT | 152 | | OBUF | 16 | IO | 153 | | LUT2 | 2 | LUT | 154 | | BUFG | 1 | Clock | 155 | +----------+------+---------------------+ 156 | 157 | 158 | 8. Black Boxes 159 | -------------- 160 | 161 | +----------+------+ 162 | | Ref Name | Used | 163 | +----------+------+ 164 | 165 | 166 | 9. 
// Two-way multiplexer over a packed bus of LENGTH words of DATA_WIDTH bits:
// out follows in_1 when flag is 1 and in_0 otherwise.
module mux #(
    parameter LENGTH = 4,
    parameter DATA_WIDTH = 16
)(
    input flag,
    input [LENGTH*DATA_WIDTH-1 : 0] in_0,
    input [LENGTH*DATA_WIDTH-1 : 0] in_1,
    output [LENGTH*DATA_WIDTH-1 : 0] out
);
    assign out = flag ? in_1 : in_0;
endmodule

// used 16 times

// Parallel-in / serial-out word shifter.
//
//   write_enable = 1, read_enable = 0 : load all LENGTH words from `in`
//   write_enable = 0, read_enable = 1 : move every word one slot towards the
//                                       top; the lowest word keeps its value
//   any other combination             : hold
//
// `out` always shows the top word (bits [LENGTH*BIT_WIDTH-1 -: BIT_WIDTH]).
//
// Parameters:
//   LENGTH    - number of words held in the register
//   BIT_WIDTH - width of one word
module serialize_ #(
    parameter LENGTH = 8,
    parameter BIT_WIDTH = 16
) (
    input clk,
    input write_enable,
    input read_enable,
    input [LENGTH*BIT_WIDTH -1 : 0] in,
    output [BIT_WIDTH-1 : 0] out
);
    // Named shift_reg rather than `local`: `local` is a reserved word in
    // SystemVerilog, so the old name breaks tools that parse .v files as SV.
    reg [BIT_WIDTH*LENGTH-1 : 0] shift_reg = 0;
    integer i;

    always @(posedge clk) begin
        if (write_enable == 1 && read_enable == 0) begin
            shift_reg <= in;
        end else if (write_enable == 0 && read_enable == 1) begin
            // Word i takes the value of word i-1; word 1 is not reassigned.
            for (i = LENGTH; i >= 2; i = i-1) begin
                shift_reg[i*BIT_WIDTH-1 -: BIT_WIDTH]
                    <= shift_reg[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH];
            end
        end
        // Otherwise the register simply holds its value.
    end

    assign out = shift_reg[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH];
endmodule
-------------------------------------------------------------------------------- 1 | ! 8 of this stuff is used for top 2 | 3 | 4 | | Device : 7vx485tffg1158-2L 5 | | Design State : Synthesized 6 | ----------------------------------------------------------------------------------------------------------------- 7 | 8 | Utilization Design Information 9 | 10 | Table of Contents 11 | ----------------- 12 | 1. Slice Logic 13 | 1.1 Summary of Registers by Type 14 | 2. Memory 15 | 3. DSP 16 | 4. IO and GT Specific 17 | 5. Clocking 18 | 6. Specific Feature 19 | 7. Primitives 20 | 8. Black Boxes 21 | 9. Instantiated Netlists 22 | 23 | 1. Slice Logic 24 | -------------- 25 | 26 | +-------------------------+------+-------+-----------+-------+ 27 | | Site Type | Used | Fixed | Available | Util% | 28 | +-------------------------+------+-------+-----------+-------+ 29 | | Slice LUTs* | 0 | 0 | 303600 | 0.00 | 30 | | LUT as Logic | 0 | 0 | 303600 | 0.00 | 31 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 32 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 33 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 34 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 35 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 36 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 37 | +-------------------------+------+-------+-----------+-------+ 38 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
39 | 40 | 41 | 1.1 Summary of Registers by Type 42 | -------------------------------- 43 | 44 | +-------+--------------+-------------+--------------+ 45 | | Total | Clock Enable | Synchronous | Asynchronous | 46 | +-------+--------------+-------------+--------------+ 47 | | 0 | _ | - | - | 48 | | 0 | _ | - | Set | 49 | | 0 | _ | - | Reset | 50 | | 0 | _ | Set | - | 51 | | 0 | _ | Reset | - | 52 | | 0 | Yes | - | - | 53 | | 0 | Yes | - | Set | 54 | | 0 | Yes | - | Reset | 55 | | 0 | Yes | Set | - | 56 | | 512 | Yes | Reset | - | 57 | +-------+--------------+-------------+--------------+ 58 | 59 | 60 | 2. Memory 61 | --------- 62 | 63 | +----------------+------+-------+-----------+-------+ 64 | | Site Type | Used | Fixed | Available | Util% | 65 | +----------------+------+-------+-----------+-------+ 66 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 67 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 68 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 69 | +----------------+------+-------+-----------+-------+ 70 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 71 | 72 | 73 | 3. DSP 74 | ------ 75 | 76 | +-----------+------+-------+-----------+-------+ 77 | | Site Type | Used | Fixed | Available | Util% | 78 | +-----------+------+-------+-----------+-------+ 79 | | DSPs | 0 | 0 | 2800 | 0.00 | 80 | +-----------+------+-------+-----------+-------+ 81 | 82 | 83 | 4. 
IO and GT Specific 84 | --------------------- 85 | 86 | +-----------------------------+------+-------+-----------+--------+ 87 | | Site Type | Used | Fixed | Available | Util% | 88 | +-----------------------------+------+-------+-----------+--------+ 89 | | Bonded IOB | 578 | 0 | 350 | 165.14 | 90 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 91 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 92 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 93 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 94 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 96 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 97 | | IBUFDS | 0 | 0 | 336 | 0.00 | 98 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 99 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 100 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 101 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 102 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 104 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 105 | | ILOGIC | 0 | 0 | 350 | 0.00 | 106 | | OLOGIC | 0 | 0 | 350 | 0.00 | 107 | +-----------------------------+------+-------+-----------+--------+ 108 | 109 | 110 | 5. Clocking 111 | ----------- 112 | 113 | +------------+------+-------+-----------+-------+ 114 | | Site Type | Used | Fixed | Available | Util% | 115 | +------------+------+-------+-----------+-------+ 116 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 117 | | BUFIO | 0 | 0 | 56 | 0.00 | 118 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 119 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 120 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 121 | | BUFHCE | 0 | 0 | 168 | 0.00 | 122 | | BUFR | 0 | 0 | 56 | 0.00 | 123 | +------------+------+-------+-----------+-------+ 124 | 125 | 126 | 6. 
Specific Feature 127 | ------------------- 128 | 129 | +-------------+------+-------+-----------+-------+ 130 | | Site Type | Used | Fixed | Available | Util% | 131 | +-------------+------+-------+-----------+-------+ 132 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 133 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 134 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 135 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 136 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 137 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 138 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 139 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 140 | | XADC | 0 | 0 | 1 | 0.00 | 141 | +-------------+------+-------+-----------+-------+ 142 | 143 | 144 | 7. Primitives 145 | ------------- 146 | 147 | +----------+------+---------------------+ 148 | | Ref Name | Used | Functional Category | 149 | +----------+------+---------------------+ 150 | | OBUF | 512 | IO | 151 | | FDRE | 512 | Flop & Latch | 152 | | IBUF | 66 | IO | 153 | | BUFG | 1 | Clock | 154 | +----------+------+---------------------+ 155 | 156 | 157 | 8. Black Boxes 158 | -------------- 159 | 160 | +----------+------+ 161 | | Ref Name | Used | 162 | +----------+------+ 163 | 164 | 165 | 9. Instantiated Netlists 166 | ------------------------ 167 | 168 | +----------+------+ 169 | | Ref Name | Used | 170 | +----------+------+ 171 | 172 | 173 | -------------------------------------------------------------------------------- /results/SA_32_32_16_4_4/util/output_matrix.txt: -------------------------------------------------------------------------------- 1 | // used 32+2 times 2 | 3 | | Device : 7vx485tffg1158-2L 4 | | Design State : Synthesized 5 | ------------------------------------------------------------------------------------------------------------- 6 | 7 | Utilization Design Information 8 | 9 | Table of Contents 10 | ----------------- 11 | 1. Slice Logic 12 | 1.1 Summary of Registers by Type 13 | 2. Memory 14 | 3. DSP 15 | 4. IO and GT Specific 16 | 5. Clocking 17 | 6. Specific Feature 18 | 7. Primitives 19 | 8. 
Black Boxes 20 | 9. Instantiated Netlists 21 | 22 | 1. Slice Logic 23 | -------------- 24 | 25 | +-------------------------+------+-------+-----------+-------+ 26 | | Site Type | Used | Fixed | Available | Util% | 27 | +-------------------------+------+-------+-----------+-------+ 28 | | Slice LUTs* | 498 | 0 | 303600 | 0.16 | 29 | | LUT as Logic | 498 | 0 | 303600 | 0.16 | 30 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 31 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 32 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +-------------------------+------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 512 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. 
Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +-----------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +-----------+------+-------+-----------+-------+ 78 | | DSPs | 0 | 0 | 2800 | 0.00 | 79 | +-----------+------+-------+-----------+-------+ 80 | 81 | 82 | 4. IO and GT Specific 83 | --------------------- 84 | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Site Type | Used | Fixed | Available | Util% | 87 | +-----------------------------+------+-------+-----------+--------+ 88 | | Bonded IOB | 531 | 0 | 350 | 151.71 | 89 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 90 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 91 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 92 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 93 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 94 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 96 | | IBUFDS | 0 | 0 | 336 | 0.00 | 97 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 98 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 99 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 100 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 101 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 102 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 104 | | ILOGIC | 0 | 0 | 350 | 0.00 | 105 | | OLOGIC | 0 | 0 | 350 | 0.00 | 106 | 
+-----------------------------+------+-------+-----------+--------+ 107 | 108 | 109 | 5. Clocking 110 | ----------- 111 | 112 | +------------+------+-------+-----------+-------+ 113 | | Site Type | Used | Fixed | Available | Util% | 114 | +------------+------+-------+-----------+-------+ 115 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 116 | | BUFIO | 0 | 0 | 56 | 0.00 | 117 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 118 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 119 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 120 | | BUFHCE | 0 | 0 | 168 | 0.00 | 121 | | BUFR | 0 | 0 | 56 | 0.00 | 122 | +------------+------+-------+-----------+-------+ 123 | 124 | 125 | 6. Specific Feature 126 | ------------------- 127 | 128 | +-------------+------+-------+-----------+-------+ 129 | | Site Type | Used | Fixed | Available | Util% | 130 | +-------------+------+-------+-----------+-------+ 131 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 132 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 133 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 134 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 135 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 136 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 137 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 138 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 139 | | XADC | 0 | 0 | 1 | 0.00 | 140 | +-------------+------+-------+-----------+-------+ 141 | 142 | 143 | 7. Primitives 144 | ------------- 145 | 146 | +----------+------+---------------------+ 147 | | Ref Name | Used | Functional Category | 148 | +----------+------+---------------------+ 149 | | IBUF | 515 | IO | 150 | | FDRE | 512 | Flop & Latch | 151 | | LUT4 | 496 | LUT | 152 | | OBUF | 16 | IO | 153 | | LUT2 | 2 | LUT | 154 | | BUFG | 1 | Clock | 155 | +----------+------+---------------------+ 156 | 157 | 158 | 8. Black Boxes 159 | -------------- 160 | 161 | +----------+------+ 162 | | Ref Name | Used | 163 | +----------+------+ 164 | 165 | 166 | 9. 
Instantiated Netlists 167 | ------------------------ 168 | 169 | +----------+------+ 170 | | Ref Name | Used | 171 | +----------+------+ 172 | 173 | 174 | -------------------------------------------------------------------------------- /results/SA_32_32_16_8_8/input_L2.v: -------------------------------------------------------------------------------- 1 | module mux #( 2 | parameter LENGTH = 4, 3 | parameter DATA_WIDTH = 16 4 | )( 5 | input flag, 6 | input [LENGTH*DATA_WIDTH-1 : 0] in_0, 7 | input [LENGTH*DATA_WIDTH-1 : 0] in_1, 8 | output [LENGTH*DATA_WIDTH-1 : 0] out 9 | ); 10 | assign out = flag ? in_1 : in_0; 11 | endmodule 12 | 13 | module invert #( 14 | parameter LENGTH = 4, 15 | parameter DATA_WIDTH = 16 16 | ) ( 17 | input [LENGTH*DATA_WIDTH-1 : 0] in, 18 | output [LENGTH*DATA_WIDTH-1 : 0] out 19 | ); 20 | genvar i; 21 | generate for (i=0; i=2; i=i-1) begin 20 | local [i*BIT_WIDTH-1 -: BIT_WIDTH] 21 | <= local[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH]; 22 | end 23 | end else local <= local; 24 | end 25 | assign out = local[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH]; 26 | endmodule 27 | 28 | // used 16 times. serialize_ is a parallel-load, serial-out shift register of LENGTH words, each BIT_WIDTH bits wide. On a rising clk edge: write_enable=1 with read_enable=0 loads `in` into `local`; write_enable=0 with read_enable=1 moves every word up one slot (word i takes word i-1, the lowest word keeps its value); any other enable combination holds `local`. `out` is combinational and always shows the top word of `local`. 29 | 30 | module serialize_ #( 31 | parameter LENGTH = 8, 32 | parameter BIT_WIDTH = 16 33 | ) ( 34 | input clk, 35 | input write_enable, 36 | input read_enable, 37 | input [LENGTH*BIT_WIDTH -1 : 0] in, 38 | output [BIT_WIDTH-1 : 0] out 39 | ); 40 | reg [BIT_WIDTH*LENGTH-1 : 0] local = 0; 41 | integer i; 42 | always @(posedge clk) begin 43 | if(write_enable == 1 & read_enable == 0) begin 44 | local <= in; 45 | end else if(write_enable == 0 & read_enable == 1) begin 46 | for (i = LENGTH; i>=2; i=i-1) begin 47 | local [i*BIT_WIDTH-1 -: BIT_WIDTH] 48 | <= local[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH]; 49 | end 50 | end else local <= local; 51 | end 52 | assign out = local[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH]; 53 | endmodule -------------------------------------------------------------------------------- /results/SA_32_32_16_8_8/util/input_L3.txt: 
-------------------------------------------------------------------------------- 1 | ! 8 of this stuff is used for top 2 | 3 | 4 | | Device : 7vx485tffg1158-2L 5 | | Design State : Synthesized 6 | ----------------------------------------------------------------------------------------------------------------- 7 | 8 | Utilization Design Information 9 | 10 | Table of Contents 11 | ----------------- 12 | 1. Slice Logic 13 | 1.1 Summary of Registers by Type 14 | 2. Memory 15 | 3. DSP 16 | 4. IO and GT Specific 17 | 5. Clocking 18 | 6. Specific Feature 19 | 7. Primitives 20 | 8. Black Boxes 21 | 9. Instantiated Netlists 22 | 23 | 1. Slice Logic 24 | -------------- 25 | 26 | +-------------------------+------+-------+-----------+-------+ 27 | | Site Type | Used | Fixed | Available | Util% | 28 | +-------------------------+------+-------+-----------+-------+ 29 | | Slice LUTs* | 0 | 0 | 303600 | 0.00 | 30 | | LUT as Logic | 0 | 0 | 303600 | 0.00 | 31 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 32 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 33 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 34 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 35 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 36 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 37 | +-------------------------+------+-------+-----------+-------+ 38 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
39 | 40 | 41 | 1.1 Summary of Registers by Type 42 | -------------------------------- 43 | 44 | +-------+--------------+-------------+--------------+ 45 | | Total | Clock Enable | Synchronous | Asynchronous | 46 | +-------+--------------+-------------+--------------+ 47 | | 0 | _ | - | - | 48 | | 0 | _ | - | Set | 49 | | 0 | _ | - | Reset | 50 | | 0 | _ | Set | - | 51 | | 0 | _ | Reset | - | 52 | | 0 | Yes | - | - | 53 | | 0 | Yes | - | Set | 54 | | 0 | Yes | - | Reset | 55 | | 0 | Yes | Set | - | 56 | | 512 | Yes | Reset | - | 57 | +-------+--------------+-------------+--------------+ 58 | 59 | 60 | 2. Memory 61 | --------- 62 | 63 | +----------------+------+-------+-----------+-------+ 64 | | Site Type | Used | Fixed | Available | Util% | 65 | +----------------+------+-------+-----------+-------+ 66 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 67 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 68 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 69 | +----------------+------+-------+-----------+-------+ 70 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 71 | 72 | 73 | 3. DSP 74 | ------ 75 | 76 | +-----------+------+-------+-----------+-------+ 77 | | Site Type | Used | Fixed | Available | Util% | 78 | +-----------+------+-------+-----------+-------+ 79 | | DSPs | 0 | 0 | 2800 | 0.00 | 80 | +-----------+------+-------+-----------+-------+ 81 | 82 | 83 | 4. 
IO and GT Specific 84 | --------------------- 85 | 86 | +-----------------------------+------+-------+-----------+--------+ 87 | | Site Type | Used | Fixed | Available | Util% | 88 | +-----------------------------+------+-------+-----------+--------+ 89 | | Bonded IOB | 578 | 0 | 350 | 165.14 | 90 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 91 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 92 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 93 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 94 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 96 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 97 | | IBUFDS | 0 | 0 | 336 | 0.00 | 98 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 99 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 100 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 101 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 102 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 104 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 105 | | ILOGIC | 0 | 0 | 350 | 0.00 | 106 | | OLOGIC | 0 | 0 | 350 | 0.00 | 107 | +-----------------------------+------+-------+-----------+--------+ 108 | 109 | 110 | 5. Clocking 111 | ----------- 112 | 113 | +------------+------+-------+-----------+-------+ 114 | | Site Type | Used | Fixed | Available | Util% | 115 | +------------+------+-------+-----------+-------+ 116 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 117 | | BUFIO | 0 | 0 | 56 | 0.00 | 118 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 119 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 120 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 121 | | BUFHCE | 0 | 0 | 168 | 0.00 | 122 | | BUFR | 0 | 0 | 56 | 0.00 | 123 | +------------+------+-------+-----------+-------+ 124 | 125 | 126 | 6. 
Specific Feature 127 | ------------------- 128 | 129 | +-------------+------+-------+-----------+-------+ 130 | | Site Type | Used | Fixed | Available | Util% | 131 | +-------------+------+-------+-----------+-------+ 132 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 133 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 134 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 135 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 136 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 137 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 138 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 139 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 140 | | XADC | 0 | 0 | 1 | 0.00 | 141 | +-------------+------+-------+-----------+-------+ 142 | 143 | 144 | 7. Primitives 145 | ------------- 146 | 147 | +----------+------+---------------------+ 148 | | Ref Name | Used | Functional Category | 149 | +----------+------+---------------------+ 150 | | OBUF | 512 | IO | 151 | | FDRE | 512 | Flop & Latch | 152 | | IBUF | 66 | IO | 153 | | BUFG | 1 | Clock | 154 | +----------+------+---------------------+ 155 | 156 | 157 | 8. Black Boxes 158 | -------------- 159 | 160 | +----------+------+ 161 | | Ref Name | Used | 162 | +----------+------+ 163 | 164 | 165 | 9. Instantiated Netlists 166 | ------------------------ 167 | 168 | +----------+------+ 169 | | Ref Name | Used | 170 | +----------+------+ 171 | 172 | 173 | -------------------------------------------------------------------------------- /results/SA_32_32_16_8_8/util/output_matrix.txt: -------------------------------------------------------------------------------- 1 | // used 32+2 times 2 | 3 | | Device : 7vx485tffg1158-2L 4 | | Design State : Synthesized 5 | ------------------------------------------------------------------------------------------------------------- 6 | 7 | Utilization Design Information 8 | 9 | Table of Contents 10 | ----------------- 11 | 1. Slice Logic 12 | 1.1 Summary of Registers by Type 13 | 2. Memory 14 | 3. DSP 15 | 4. IO and GT Specific 16 | 5. Clocking 17 | 6. Specific Feature 18 | 7. Primitives 19 | 8. 
Black Boxes 20 | 9. Instantiated Netlists 21 | 22 | 1. Slice Logic 23 | -------------- 24 | 25 | +-------------------------+------+-------+-----------+-------+ 26 | | Site Type | Used | Fixed | Available | Util% | 27 | +-------------------------+------+-------+-----------+-------+ 28 | | Slice LUTs* | 498 | 0 | 303600 | 0.16 | 29 | | LUT as Logic | 498 | 0 | 303600 | 0.16 | 30 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 31 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 32 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +-------------------------+------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 512 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. 
Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +-----------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +-----------+------+-------+-----------+-------+ 78 | | DSPs | 0 | 0 | 2800 | 0.00 | 79 | +-----------+------+-------+-----------+-------+ 80 | 81 | 82 | 4. IO and GT Specific 83 | --------------------- 84 | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Site Type | Used | Fixed | Available | Util% | 87 | +-----------------------------+------+-------+-----------+--------+ 88 | | Bonded IOB | 531 | 0 | 350 | 151.71 | 89 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 90 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 91 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 92 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 93 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 94 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 96 | | IBUFDS | 0 | 0 | 336 | 0.00 | 97 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 98 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 99 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 100 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 101 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 102 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 104 | | ILOGIC | 0 | 0 | 350 | 0.00 | 105 | | OLOGIC | 0 | 0 | 350 | 0.00 | 106 | 
+-----------------------------+------+-------+-----------+--------+ 107 | 108 | 109 | 5. Clocking 110 | ----------- 111 | 112 | +------------+------+-------+-----------+-------+ 113 | | Site Type | Used | Fixed | Available | Util% | 114 | +------------+------+-------+-----------+-------+ 115 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 116 | | BUFIO | 0 | 0 | 56 | 0.00 | 117 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 118 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 119 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 120 | | BUFHCE | 0 | 0 | 168 | 0.00 | 121 | | BUFR | 0 | 0 | 56 | 0.00 | 122 | +------------+------+-------+-----------+-------+ 123 | 124 | 125 | 6. Specific Feature 126 | ------------------- 127 | 128 | +-------------+------+-------+-----------+-------+ 129 | | Site Type | Used | Fixed | Available | Util% | 130 | +-------------+------+-------+-----------+-------+ 131 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 132 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 133 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 134 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 135 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 136 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 137 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 138 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 139 | | XADC | 0 | 0 | 1 | 0.00 | 140 | +-------------+------+-------+-----------+-------+ 141 | 142 | 143 | 7. Primitives 144 | ------------- 145 | 146 | +----------+------+---------------------+ 147 | | Ref Name | Used | Functional Category | 148 | +----------+------+---------------------+ 149 | | IBUF | 515 | IO | 150 | | FDRE | 512 | Flop & Latch | 151 | | LUT4 | 496 | LUT | 152 | | OBUF | 16 | IO | 153 | | LUT2 | 2 | LUT | 154 | | BUFG | 1 | Clock | 155 | +----------+------+---------------------+ 156 | 157 | 158 | 8. Black Boxes 159 | -------------- 160 | 161 | +----------+------+ 162 | | Ref Name | Used | 163 | +----------+------+ 164 | 165 | 166 | 9. 
Instantiated Netlists 167 | ------------------------ 168 | 169 | +----------+------+ 170 | | Ref Name | Used | 171 | +----------+------+ 172 | 173 | 174 | -------------------------------------------------------------------------------- /results/comparison.md: -------------------------------------------------------------------------------- 1 | | 普通SA | BRAM | LUT | FF | DSP | 2 | | ------------ | ---- | ----- | ----- | ---- | 3 | | 共计 | 0 | 37127 | 70829 | 1024 | 4 | | PE & L1 | 0 | 19296 | 49824 | 1024 | 5 | | L2 ( I ) | 0 | 928 | 2048 | 0 | 6 | | L3 ( I & O ) | 0 | 16434 | 17920 | 0 | 7 | | input_L3 | 0 | 0 | 1024 | 0 | 8 | | output_L3 | 0 | 17430 | 17920 | 0 | 9 | 10 | | SA_16*16 | BRAM | LUT | FF | DSP | 11 | | ------------ | ---- | ----- | ----- | ---- | 12 | | 共计 | 0 | 40236 | 71488 | 1024 | 13 | | PE & L1 | 0 | 19584 | 47360 | 1024 | 14 | | L2 ( I ) | 0 | 2752 | 3648 | 0 | 15 | | L3 ( I & O ) | 0 | 17900 | 20480 | 0 | 16 | | input_L3 | 0 | 0 | 2048 | 0 | 17 | | output_L3 | 0 | 17900 | 18432 | 0 | 18 | 19 | | SA_8*8 | BRAM | LUT | FF | DSP | 20 | | ------------ | ---- | ----- | ----- | ---- | 21 | | 共计 | 0 | 46990 | 75974 | 1024 | 22 | | PE & L1 | 0 | 22218 | 45830 | 1024 | 23 | | L2 ( I ) | 0 | 6016 | 6592 | 0 | 24 | | L3 ( I & O ) | 0 | 18756 | 23552 | 0 | 25 | | input_L3 | 0 | 0 | 4096 | 0 | 26 | | output_L3 | 0 | 18756 | 19456 | 0 | 27 | 28 | | SA_4*4 | BRAM | LUT | FF | DSP | 29 | | ------------ | ---- | ----- | ----- | ---- | 30 | | 共计 | 0 | 60452 | 85056 | 1024 | 31 | | PE & L1 | 0 | 28416 | 43008 | 1024 | 32 | | L2 ( I ) | 0 | 11904 | 12352 | 0 | 33 | | L3 ( I & O ) | 0 | 20132 | 29696 | 0 | 34 | | input_L3 | 0 | 0 | 8192 | 0 | 35 | | output_L3 | 0 | 20132 | 21504 | 0 | 36 | -------------------------------------------------------------------------------- /results/old_reports/critical_path.md: -------------------------------------------------------------------------------- 1 | ### PE with rounding 2 | 3 | | bitwidth | critical path | total delay | logic 
delay | 4 | | --- | --- | --- | --- | 5 | | 32 | 14 + 1 = 15 | 16.381 + 0.281 =  16.622 ns | 6.644 + 0.164 = 6.808 ns | 6 | | 16 | 7 + 1 = 8 | 10.203 + 0.276  = 10.879 ns | 5.802 + 0.141 = 5.943 ns | 7 | | 8 | 10 + 1 = 11 | 8.599 + 0.324  = 8.923 ns | 3.870 + 0.141 = 4.011 ns | 8 | 9 | ### finish decider 10 | 11 | | size | critical path | total delay (ns) | logic delay (ns) | 12 | | --- | --- | --- | --- | 13 | | 32*32 | | | | 14 | | 16*16 | 2 + 2 = 4 | 3.312 + 0.394 = 3.706 | 2.741 + 0.239 = 2.98 | 15 | | 8*8 | 2 + 2 = 4 | 3.312 + 0.388 = 3.7 | 2.741 + 0.239 = 2.98 | 16 | | 4*4 | 2 + 2 = 4 | 3.312 + 0.381 = 3.693 | 2.741 + 0.239 = 2.98 | 17 | 18 | -------------------------------------------------------------------------------- /results/old_reports/utilization.md: -------------------------------------------------------------------------------- 1 | ### each PE with rounding 2 | 3 | | bitwidth | LUT | DSP48E1 | register | 4 | | --- | --- | --- | --- | 5 | | 32 | 47 | 3 | 128 | 6 | | 16 | 16 | 1 | 64 | 7 | | 8 | 41 | 0 | 32 | 8 | | 4 | 10 | 0 | 16 | 9 | 10 | 规整且符合预期,打包成阵列直接乘上PE数目就可以。 11 | 12 | ### finish decider 13 | 14 | | size | LUT | DSP48E1 | register | 15 | | --- | --- | --- | --- | 16 | | 32*32 | 41 | 0 | 38 | 17 | | 16*16 | 19 | 0 | 21 | 18 | | 8*8 | 7 | 0 | 14 | 19 | | 4*4 | 4 | 0 | 8 | -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/input_L2.v: -------------------------------------------------------------------------------- 1 | // two of this stuff is used for one top module. 
2 | 3 | module shifter #( 4 | parameter LENGTH = 3, 5 | parameter DATA_WIDTH = 16 6 | ) ( 7 | input clk, 8 | input enable, 9 | input [DATA_WIDTH-1 : 0] in, 10 | output reg [DATA_WIDTH-1 : 0] out = 0 11 | ); 12 | reg [DATA_WIDTH*LENGTH-1 : 0] inner_shifters = 0; 13 | integer i; 14 | always @(posedge clk ) begin 15 | if (enable) begin 16 | inner_shifters[DATA_WIDTH-1 : 0] <= in; 17 | out <= inner_shifters[DATA_WIDTH*LENGTH-1 -: DATA_WIDTH]; 18 | for (i=1; i=2; i=i-1) begin 17 | local [i*BIT_WIDTH-1 -: BIT_WIDTH] 18 | <= local[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH]; 19 | end 20 | local[BIT_WIDTH-1 -: BIT_WIDTH] <= in; 21 | end else local <= local; 22 | end 23 | assign out = local; 24 | endmodule -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/output_L3.v: -------------------------------------------------------------------------------- 1 | // LENGTH+3 of this stuff is used in the top module. serialize is a parallel-load, serial-out shift register of LENGTH words, each BIT_WIDTH bits wide. On a rising clk edge: write_enable=1 with read_enable=0 loads `in` into `local`; write_enable=0 with read_enable=1 moves every word up one slot (word i takes word i-1, the lowest word keeps its value); any other enable combination holds `local`. `out` is combinational and always shows the top word of `local`. 2 | 3 | module serialize #( 4 | parameter LENGTH = 32, 5 | parameter BIT_WIDTH = 16 6 | ) ( 7 | input clk, 8 | input write_enable, 9 | input read_enable, 10 | input [LENGTH*BIT_WIDTH -1 : 0] in, 11 | output [BIT_WIDTH-1 : 0] out 12 | ); 13 | reg [BIT_WIDTH*LENGTH-1 : 0] local = 0; 14 | integer i; 15 | always @(posedge clk) begin 16 | if(write_enable == 1 & read_enable == 0) begin 17 | local <= in; 18 | end else if(write_enable == 0 & read_enable == 1) begin 19 | for (i = LENGTH; i>=2; i=i-1) begin 20 | local [i*BIT_WIDTH-1 -: BIT_WIDTH] 21 | <= local[(i-1)*BIT_WIDTH-1 -: BIT_WIDTH]; 22 | end 23 | end else local <= local; 24 | end 25 | assign out = local[LENGTH*BIT_WIDTH-1 -: BIT_WIDTH]; 26 | endmodule -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/pe_array.v: -------------------------------------------------------------------------------- 1 | module single_PE_rounded #( // Multiply-accumulate processing element. Every rising clk edge it forwards i_up to o_down and i_left to o_right. x = (i_up*i_left) >> Half_WIDTH is accumulated into partial_sum; when finish is high, o_result captures the current partial_sum and partial_sum restarts from x, otherwise o_result holds. NOTE(review): x and both operands are DATA_WIDTH bits wide, so under Verilog expression-sizing rules the product appears to be evaluated at DATA_WIDTH bits and its upper half dropped before the shift - confirm this is the intended fixed-point behaviour. 2 | parameter DATA_WIDTH = 8, 3 | parameter Half_WIDTH = 4 4 | )( 5 | input clk, 6 | input finish, 7 |
input [DATA_WIDTH-1 : 0] i_up, 8 | input [DATA_WIDTH-1 : 0] i_left, 9 | output reg [DATA_WIDTH-1 : 0] o_down, 10 | output reg [DATA_WIDTH-1 : 0] o_right, 11 | output reg [DATA_WIDTH-1 : 0] o_result = 0 12 | ); 13 | reg [DATA_WIDTH-1 : 0] partial_sum = 0; 14 | wire [DATA_WIDTH-1 : 0] x; 15 | assign x = (i_up*i_left) >> Half_WIDTH; 16 | always @(posedge clk) begin 17 | o_down <= i_up; 18 | o_right <= i_left; 19 | o_result <= finish ? partial_sum : o_result; 20 | partial_sum <= finish ? x : (partial_sum + x); 21 | end 22 | endmodule 23 | 24 | 25 | module singple_kernel #( 26 | parameter DATA_WIDTH = 16, 27 | parameter SIZE = 32 28 | ) ( 29 | input clk, 30 | input [SIZE*SIZE-1:0] finish, // PE numbering rule (row_column): 31 | input [SIZE*DATA_WIDTH-1:0] in_up, // n_n ---- n_1 32 | input [SIZE*DATA_WIDTH-1:0] in_left, // | | 33 | output [SIZE*DATA_WIDTH-1:0] pass_down, // | | 34 | output [SIZE*DATA_WIDTH-1:0] pass_right, // 1_n ---- 1_1 35 | output [SIZE*SIZE*DATA_WIDTH-1:0] out_matrix, // 36 | output [SIZE*DATA_WIDTH-1:0] out_diagonal // serialized_index(i,j) = (i-1)*SIZE + j; PE(i,j) reads finish bit serialized_index(i,j)-1 and drives word serialized_index(i,j) of out_matrix 37 | ); 38 | genvar i,j,k; 39 | wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_down; 40 | wire [SIZE*SIZE*DATA_WIDTH-1:0] inner_pass_right; 41 | generate 42 | for (i=SIZE; i>=1; i=i-1) begin 43 | for (j=SIZE; j>=1; j=j-1) begin 44 | if (i==SIZE && j==SIZE) begin // the upper-left PE: both inputs come from the module ports in_up / in_left 45 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 46 | pe (clk, finish [(i-1)*SIZE+j-1], 47 | in_up [j*DATA_WIDTH-1 -:DATA_WIDTH], 48 | in_left [i*DATA_WIDTH-1 -:DATA_WIDTH], 49 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 50 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 51 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 52 | end else if (i==SIZE && j!=SIZE) begin // PEs in the upper-most row: up input from in_up, left input from PE(i,j+1) 53 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 54 | pe (clk, finish [(i-1)*SIZE+j-1], 55 | in_up [j*DATA_WIDTH-1 -:DATA_WIDTH], 56 | inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH], 57
| inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 58 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 59 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 60 | end else if (i!=SIZE && j==SIZE) begin // PEs in the left-most column: up input from PE(i+1,j), left input from in_left 61 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 62 | pe (clk, finish [(i-1)*SIZE+j-1], 63 | inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 64 | in_left [i*DATA_WIDTH-1 -:DATA_WIDTH], 65 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 66 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 67 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 68 | end else begin // all other PEs: up input from PE(i+1,j), left input from PE(i,j+1) 69 | single_PE_rounded # (DATA_WIDTH, DATA_WIDTH/2) 70 | pe (clk, finish [(i-1)*SIZE+j-1], 71 | inner_pass_down [((i-1+1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 72 | inner_pass_right [((i-1)*SIZE+j+1)*DATA_WIDTH-1 -:DATA_WIDTH], 73 | inner_pass_down [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 74 | inner_pass_right [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH], 75 | out_matrix [((i-1)*SIZE+j)*DATA_WIDTH-1 -:DATA_WIDTH]); 76 | end end end 77 | endgenerate 78 | generate 79 | for (k=SIZE; k>=1; k=k-1) begin 80 | // pass data downward to other PE arrays: pass_down word k is the o_down of PE(1,k) 81 | // pass data rightward to other PE arrays: pass_right word k is the o_right of PE(k,1) 82 | // output results in diagonal position: out_diagonal word k is the result of PE(k,k) 83 | assign pass_down [k*DATA_WIDTH-1 -:DATA_WIDTH] 84 | = inner_pass_down [((1-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH]; 85 | assign pass_right [k*DATA_WIDTH-1 -:DATA_WIDTH] 86 | = inner_pass_right [((k-1)*SIZE+1)*DATA_WIDTH-1 -:DATA_WIDTH]; 87 | assign out_diagonal [k*DATA_WIDTH-1 -:DATA_WIDTH] 88 | = out_matrix [((k-1)*SIZE+k)*DATA_WIDTH-1 -:DATA_WIDTH]; 89 | end 90 | endgenerate 91 | endmodule 92 | 93 | -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/util/input_L2.txt: -------------------------------------------------------------------------------- 1 | | Device : 
7vx485tffg1158-2L 2 | | Design State : Synthesized 3 | --------------------------------------------------------------------------------------------------------------------------------------- 4 | 5 | Utilization Design Information 6 | 7 | Table of Contents 8 | ----------------- 9 | 1. Slice Logic 10 | 1.1 Summary of Registers by Type 11 | 2. Memory 12 | 3. DSP 13 | 4. IO and GT Specific 14 | 5. Clocking 15 | 6. Specific Feature 16 | 7. Primitives 17 | 8. Black Boxes 18 | 9. Instantiated Netlists 19 | 20 | 1. Slice Logic 21 | -------------- 22 | 23 | +----------------------------+------+-------+-----------+-------+ 24 | | Site Type | Used | Fixed | Available | Util% | 25 | +----------------------------+------+-------+-----------+-------+ 26 | | Slice LUTs* | 464 | 0 | 303600 | 0.15 | 27 | | LUT as Logic | 0 | 0 | 303600 | 0.00 | 28 | | LUT as Memory | 464 | 0 | 130800 | 0.35 | 29 | | LUT as Distributed RAM | 0 | 0 | | | 30 | | LUT as Shift Register | 464 | 0 | | | 31 | | Slice Registers | 1024 | 0 | 607200 | 0.17 | 32 | | Register as Flip Flop | 1024 | 0 | 607200 | 0.17 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +----------------------------+------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 1024 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +-----------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +-----------+------+-------+-----------+-------+ 78 | | DSPs | 0 | 0 | 2800 | 0.00 | 79 | +-----------+------+-------+-----------+-------+ 80 | 81 | 82 | 4. 
IO and GT Specific 83 | --------------------- 84 | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Site Type | Used | Fixed | Available | Util% | 87 | +-----------------------------+------+-------+-----------+--------+ 88 | | Bonded IOB | 1026 | 0 | 350 | 293.14 | 89 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 90 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 91 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 92 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 93 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 94 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 96 | | IBUFDS | 0 | 0 | 336 | 0.00 | 97 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 98 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 99 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 100 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 101 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 102 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 104 | | ILOGIC | 0 | 0 | 350 | 0.00 | 105 | | OLOGIC | 0 | 0 | 350 | 0.00 | 106 | +-----------------------------+------+-------+-----------+--------+ 107 | 108 | 109 | 5. Clocking 110 | ----------- 111 | 112 | +------------+------+-------+-----------+-------+ 113 | | Site Type | Used | Fixed | Available | Util% | 114 | +------------+------+-------+-----------+-------+ 115 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 116 | | BUFIO | 0 | 0 | 56 | 0.00 | 117 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 118 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 119 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 120 | | BUFHCE | 0 | 0 | 168 | 0.00 | 121 | | BUFR | 0 | 0 | 56 | 0.00 | 122 | +------------+------+-------+-----------+-------+ 123 | 124 | 125 | 6. 
Specific Feature 126 | ------------------- 127 | 128 | +-------------+------+-------+-----------+-------+ 129 | | Site Type | Used | Fixed | Available | Util% | 130 | +-------------+------+-------+-----------+-------+ 131 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 132 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 133 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 134 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 135 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 136 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 137 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 138 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 139 | | XADC | 0 | 0 | 1 | 0.00 | 140 | +-------------+------+-------+-----------+-------+ 141 | 142 | 143 | 7. Primitives 144 | ------------- 145 | 146 | +----------+------+---------------------+ 147 | | Ref Name | Used | Functional Category | 148 | +----------+------+---------------------+ 149 | | FDRE | 1024 | Flop & Latch | 150 | | IBUF | 514 | IO | 151 | | OBUF | 512 | IO | 152 | | SRL16E | 240 | Distributed Memory | 153 | | SRLC32E | 224 | Distributed Memory | 154 | | BUFG | 1 | Clock | 155 | +----------+------+---------------------+ 156 | 157 | 158 | 8. Black Boxes 159 | -------------- 160 | 161 | +----------+------+ 162 | | Ref Name | Used | 163 | +----------+------+ 164 | 165 | 166 | 9. Instantiated Netlists 167 | ------------------------ 168 | 169 | +----------+------+ 170 | | Ref Name | Used | 171 | +----------+------+ -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/util/input_L3.txt: -------------------------------------------------------------------------------- 1 | | Device : 7vx485tffg1158-2L 2 | | Design State : Synthesized 3 | ----------------------------------------------------------------------------------------------------------------- 4 | 5 | Utilization Design Information 6 | 7 | Table of Contents 8 | ----------------- 9 | 1. Slice Logic 10 | 1.1 Summary of Registers by Type 11 | 2. Memory 12 | 3. DSP 13 | 4. IO and GT Specific 14 | 5. Clocking 15 | 6. 
Specific Feature 16 | 7. Primitives 17 | 8. Black Boxes 18 | 9. Instantiated Netlists 19 | 20 | 1. Slice Logic 21 | -------------- 22 | 23 | +-------------------------+------+-------+-----------+-------+ 24 | | Site Type | Used | Fixed | Available | Util% | 25 | +-------------------------+------+-------+-----------+-------+ 26 | | Slice LUTs* | 0 | 0 | 303600 | 0.00 | 27 | | LUT as Logic | 0 | 0 | 303600 | 0.00 | 28 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 29 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 30 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 31 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 32 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 33 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 34 | +-------------------------+------+-------+-----------+-------+ 35 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 36 | 37 | 38 | 1.1 Summary of Registers by Type 39 | -------------------------------- 40 | 41 | +-------+--------------+-------------+--------------+ 42 | | Total | Clock Enable | Synchronous | Asynchronous | 43 | +-------+--------------+-------------+--------------+ 44 | | 0 | _ | - | - | 45 | | 0 | _ | - | Set | 46 | | 0 | _ | - | Reset | 47 | | 0 | _ | Set | - | 48 | | 0 | _ | Reset | - | 49 | | 0 | Yes | - | - | 50 | | 0 | Yes | - | Set | 51 | | 0 | Yes | - | Reset | 52 | | 0 | Yes | Set | - | 53 | | 512 | Yes | Reset | - | 54 | +-------+--------------+-------------+--------------+ 55 | 56 | 57 | 2. 
Memory 58 | --------- 59 | 60 | +----------------+------+-------+-----------+-------+ 61 | | Site Type | Used | Fixed | Available | Util% | 62 | +----------------+------+-------+-----------+-------+ 63 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 64 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 65 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 66 | +----------------+------+-------+-----------+-------+ 67 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 68 | 69 | 70 | 3. DSP 71 | ------ 72 | 73 | +-----------+------+-------+-----------+-------+ 74 | | Site Type | Used | Fixed | Available | Util% | 75 | +-----------+------+-------+-----------+-------+ 76 | | DSPs | 0 | 0 | 2800 | 0.00 | 77 | +-----------+------+-------+-----------+-------+ 78 | 79 | 80 | 4. IO and GT Specific 81 | --------------------- 82 | 83 | +-----------------------------+------+-------+-----------+--------+ 84 | | Site Type | Used | Fixed | Available | Util% | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Bonded IOB | 578 | 0 | 350 | 165.14 | 87 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 88 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 89 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 90 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 91 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 92 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 93 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 94 | | IBUFDS | 0 | 0 | 336 | 0.00 | 95 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 96 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 97 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 98 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 99 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 100 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 101 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 102 | | ILOGIC | 0 | 0 | 350 | 0.00 | 103 | | OLOGIC | 0 | 0 | 350 | 0.00 | 104 | 
+-----------------------------+------+-------+-----------+--------+ 105 | 106 | 107 | 5. Clocking 108 | ----------- 109 | 110 | +------------+------+-------+-----------+-------+ 111 | | Site Type | Used | Fixed | Available | Util% | 112 | +------------+------+-------+-----------+-------+ 113 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 114 | | BUFIO | 0 | 0 | 56 | 0.00 | 115 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 116 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 117 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 118 | | BUFHCE | 0 | 0 | 168 | 0.00 | 119 | | BUFR | 0 | 0 | 56 | 0.00 | 120 | +------------+------+-------+-----------+-------+ 121 | 122 | 123 | 6. Specific Feature 124 | ------------------- 125 | 126 | +-------------+------+-------+-----------+-------+ 127 | | Site Type | Used | Fixed | Available | Util% | 128 | +-------------+------+-------+-----------+-------+ 129 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 130 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 131 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 132 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 133 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 134 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 135 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 136 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 137 | | XADC | 0 | 0 | 1 | 0.00 | 138 | +-------------+------+-------+-----------+-------+ 139 | 140 | 141 | 7. Primitives 142 | ------------- 143 | 144 | +----------+------+---------------------+ 145 | | Ref Name | Used | Functional Category | 146 | +----------+------+---------------------+ 147 | | OBUF | 512 | IO | 148 | | FDRE | 512 | Flop & Latch | 149 | | IBUF | 66 | IO | 150 | | BUFG | 1 | Clock | 151 | +----------+------+---------------------+ 152 | 153 | 154 | 8. Black Boxes 155 | -------------- 156 | 157 | +----------+------+ 158 | | Ref Name | Used | 159 | +----------+------+ 160 | 161 | 162 | 9. 
Instantiated Netlists 163 | ------------------------ 164 | 165 | +----------+------+ 166 | | Ref Name | Used | 167 | +----------+------+ 168 | 169 | 170 | -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/util/output.txt: -------------------------------------------------------------------------------- 1 | // used 32+3 times 2 | 3 | | Device : 7vx485tffg1158-2L 4 | | Design State : Synthesized 5 | ------------------------------------------------------------------------------------------------------------- 6 | 7 | Utilization Design Information 8 | 9 | Table of Contents 10 | ----------------- 11 | 1. Slice Logic 12 | 1.1 Summary of Registers by Type 13 | 2. Memory 14 | 3. DSP 15 | 4. IO and GT Specific 16 | 5. Clocking 17 | 6. Specific Feature 18 | 7. Primitives 19 | 8. Black Boxes 20 | 9. Instantiated Netlists 21 | 22 | 1. Slice Logic 23 | -------------- 24 | 25 | +-------------------------+------+-------+-----------+-------+ 26 | | Site Type | Used | Fixed | Available | Util% | 27 | +-------------------------+------+-------+-----------+-------+ 28 | | Slice LUTs* | 498 | 0 | 303600 | 0.16 | 29 | | LUT as Logic | 498 | 0 | 303600 | 0.16 | 30 | | LUT as Memory | 0 | 0 | 130800 | 0.00 | 31 | | Slice Registers | 512 | 0 | 607200 | 0.08 | 32 | | Register as Flip Flop | 512 | 0 | 607200 | 0.08 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +-------------------------+------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 
38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 512 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +-----------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +-----------+------+-------+-----------+-------+ 78 | | DSPs | 0 | 0 | 2800 | 0.00 | 79 | +-----------+------+-------+-----------+-------+ 80 | 81 | 82 | 4. 
IO and GT Specific 83 | --------------------- 84 | 85 | +-----------------------------+------+-------+-----------+--------+ 86 | | Site Type | Used | Fixed | Available | Util% | 87 | +-----------------------------+------+-------+-----------+--------+ 88 | | Bonded IOB | 531 | 0 | 350 | 151.71 | 89 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 90 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 91 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 92 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 93 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 94 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 96 | | IBUFDS | 0 | 0 | 336 | 0.00 | 97 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 98 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 99 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 100 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 101 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 102 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 104 | | ILOGIC | 0 | 0 | 350 | 0.00 | 105 | | OLOGIC | 0 | 0 | 350 | 0.00 | 106 | +-----------------------------+------+-------+-----------+--------+ 107 | 108 | 109 | 5. Clocking 110 | ----------- 111 | 112 | +------------+------+-------+-----------+-------+ 113 | | Site Type | Used | Fixed | Available | Util% | 114 | +------------+------+-------+-----------+-------+ 115 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 116 | | BUFIO | 0 | 0 | 56 | 0.00 | 117 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 118 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 119 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 120 | | BUFHCE | 0 | 0 | 168 | 0.00 | 121 | | BUFR | 0 | 0 | 56 | 0.00 | 122 | +------------+------+-------+-----------+-------+ 123 | 124 | 125 | 6. 
Specific Feature 126 | ------------------- 127 | 128 | +-------------+------+-------+-----------+-------+ 129 | | Site Type | Used | Fixed | Available | Util% | 130 | +-------------+------+-------+-----------+-------+ 131 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 132 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 133 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 134 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 135 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 136 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 137 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 138 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 139 | | XADC | 0 | 0 | 1 | 0.00 | 140 | +-------------+------+-------+-----------+-------+ 141 | 142 | 143 | 7. Primitives 144 | ------------- 145 | 146 | +----------+------+---------------------+ 147 | | Ref Name | Used | Functional Category | 148 | +----------+------+---------------------+ 149 | | IBUF | 515 | IO | 150 | | FDRE | 512 | Flop & Latch | 151 | | LUT4 | 496 | LUT | 152 | | OBUF | 16 | IO | 153 | | LUT2 | 2 | LUT | 154 | | BUFG | 1 | Clock | 155 | +----------+------+---------------------+ 156 | 157 | 158 | 8. Black Boxes 159 | -------------- 160 | 161 | +----------+------+ 162 | | Ref Name | Used | 163 | +----------+------+ 164 | 165 | 166 | 9. Instantiated Netlists 167 | ------------------------ 168 | 169 | +----------+------+ 170 | | Ref Name | Used | 171 | +----------+------+ 172 | 173 | 174 | -------------------------------------------------------------------------------- /results/plain_SA_32_32_16/util/pe_array.txt: -------------------------------------------------------------------------------- 1 | | Device : 7vx485tffg1158-2L 2 | | Design State : Synthesized 3 | ----------------------------------------------------------------------------------------------------------------------- 4 | 5 | Utilization Design Information 6 | 7 | Table of Contents 8 | ----------------- 9 | 1. Slice Logic 10 | 1.1 Summary of Registers by Type 11 | 2. Memory 12 | 3. DSP 13 | 4. IO and GT Specific 14 | 5. Clocking 15 | 6. Specific Feature 16 | 7. 
Primitives 17 | 8. Black Boxes 18 | 9. Instantiated Netlists 19 | 20 | 1. Slice Logic 21 | -------------- 22 | 23 | +----------------------------+-------+-------+-----------+-------+ 24 | | Site Type | Used | Fixed | Available | Util% | 25 | +----------------------------+-------+-------+-----------+-------+ 26 | | Slice LUTs* | 19296 | 0 | 303600 | 6.36 | 27 | | LUT as Logic | 16384 | 0 | 303600 | 5.40 | 28 | | LUT as Memory | 2912 | 0 | 130800 | 2.23 | 29 | | LUT as Distributed RAM | 0 | 0 | | | 30 | | LUT as Shift Register | 2912 | 0 | | | 31 | | Slice Registers | 49824 | 0 | 607200 | 8.21 | 32 | | Register as Flip Flop | 49824 | 0 | 607200 | 8.21 | 33 | | Register as Latch | 0 | 0 | 607200 | 0.00 | 34 | | F7 Muxes | 0 | 0 | 151800 | 0.00 | 35 | | F8 Muxes | 0 | 0 | 75900 | 0.00 | 36 | +----------------------------+-------+-------+-----------+-------+ 37 | * Warning! The Final LUT count, after physical optimizations and full implementation, is typically lower. Run opt_design after synthesis, if not already completed, for a more realistic count. 38 | 39 | 40 | 1.1 Summary of Registers by Type 41 | -------------------------------- 42 | 43 | +-------+--------------+-------------+--------------+ 44 | | Total | Clock Enable | Synchronous | Asynchronous | 45 | +-------+--------------+-------------+--------------+ 46 | | 0 | _ | - | - | 47 | | 0 | _ | - | Set | 48 | | 0 | _ | - | Reset | 49 | | 0 | _ | Set | - | 50 | | 0 | _ | Reset | - | 51 | | 0 | Yes | - | - | 52 | | 0 | Yes | - | Set | 53 | | 0 | Yes | - | Reset | 54 | | 0 | Yes | Set | - | 55 | | 49824 | Yes | Reset | - | 56 | +-------+--------------+-------------+--------------+ 57 | 58 | 59 | 2. 
Memory 60 | --------- 61 | 62 | +----------------+------+-------+-----------+-------+ 63 | | Site Type | Used | Fixed | Available | Util% | 64 | +----------------+------+-------+-----------+-------+ 65 | | Block RAM Tile | 0 | 0 | 1030 | 0.00 | 66 | | RAMB36/FIFO* | 0 | 0 | 1030 | 0.00 | 67 | | RAMB18 | 0 | 0 | 2060 | 0.00 | 68 | +----------------+------+-------+-----------+-------+ 69 | * Note: Each Block RAM Tile only has one FIFO logic available and therefore can accommodate only one FIFO36E1 or one FIFO18E1. However, if a FIFO18E1 occupies a Block RAM Tile, that tile can still accommodate a RAMB18E1 70 | 71 | 72 | 3. DSP 73 | ------ 74 | 75 | +----------------+------+-------+-----------+-------+ 76 | | Site Type | Used | Fixed | Available | Util% | 77 | +----------------+------+-------+-----------+-------+ 78 | | DSPs | 1024 | 0 | 2800 | 36.57 | 79 | | DSP48E1 only | 1024 | | | | 80 | +----------------+------+-------+-----------+-------+ 81 | 82 | 83 | 4. IO and GT Specific 84 | --------------------- 85 | 86 | +-----------------------------+-------+-------+-----------+---------+ 87 | | Site Type | Used | Fixed | Available | Util% | 88 | +-----------------------------+-------+-------+-----------+---------+ 89 | | Bonded IOB | 19969 | 0 | 350 | 5705.43 | 90 | | Bonded IPADs | 0 | 0 | 146 | 0.00 | 91 | | Bonded OPADs | 0 | 0 | 96 | 0.00 | 92 | | PHY_CONTROL | 0 | 0 | 14 | 0.00 | 93 | | PHASER_REF | 0 | 0 | 14 | 0.00 | 94 | | OUT_FIFO | 0 | 0 | 56 | 0.00 | 95 | | IN_FIFO | 0 | 0 | 56 | 0.00 | 96 | | IDELAYCTRL | 0 | 0 | 14 | 0.00 | 97 | | IBUFDS | 0 | 0 | 336 | 0.00 | 98 | | GTXE2_COMMON | 0 | 0 | 12 | 0.00 | 99 | | GTXE2_CHANNEL | 0 | 0 | 48 | 0.00 | 100 | | PHASER_OUT/PHASER_OUT_PHY | 0 | 0 | 56 | 0.00 | 101 | | PHASER_IN/PHASER_IN_PHY | 0 | 0 | 56 | 0.00 | 102 | | IDELAYE2/IDELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 103 | | ODELAYE2/ODELAYE2_FINEDELAY | 0 | 0 | 700 | 0.00 | 104 | | IBUFDS_GTE2 | 0 | 0 | 24 | 0.00 | 105 | | ILOGIC | 0 | 0 | 350 | 0.00 | 106 | | 
OLOGIC | 0 | 0 | 350 | 0.00 | 107 | +-----------------------------+-------+-------+-----------+---------+ 108 | 109 | 110 | 5. Clocking 111 | ----------- 112 | 113 | +------------+------+-------+-----------+-------+ 114 | | Site Type | Used | Fixed | Available | Util% | 115 | +------------+------+-------+-----------+-------+ 116 | | BUFGCTRL | 1 | 0 | 32 | 3.13 | 117 | | BUFIO | 0 | 0 | 56 | 0.00 | 118 | | MMCME2_ADV | 0 | 0 | 14 | 0.00 | 119 | | PLLE2_ADV | 0 | 0 | 14 | 0.00 | 120 | | BUFMRCE | 0 | 0 | 28 | 0.00 | 121 | | BUFHCE | 0 | 0 | 168 | 0.00 | 122 | | BUFR | 0 | 0 | 56 | 0.00 | 123 | +------------+------+-------+-----------+-------+ 124 | 125 | 126 | 6. Specific Feature 127 | ------------------- 128 | 129 | +-------------+------+-------+-----------+-------+ 130 | | Site Type | Used | Fixed | Available | Util% | 131 | +-------------+------+-------+-----------+-------+ 132 | | BSCANE2 | 0 | 0 | 4 | 0.00 | 133 | | CAPTUREE2 | 0 | 0 | 1 | 0.00 | 134 | | DNA_PORT | 0 | 0 | 1 | 0.00 | 135 | | EFUSE_USR | 0 | 0 | 1 | 0.00 | 136 | | FRAME_ECCE2 | 0 | 0 | 1 | 0.00 | 137 | | ICAPE2 | 0 | 0 | 2 | 0.00 | 138 | | PCIE_2_1 | 0 | 0 | 4 | 0.00 | 139 | | STARTUPE2 | 0 | 0 | 1 | 0.00 | 140 | | XADC | 0 | 0 | 1 | 0.00 | 141 | +-------------+------+-------+-----------+-------+ 142 | 143 | 144 | 7. Primitives 145 | ------------- 146 | 147 | +----------+-------+---------------------+ 148 | | Ref Name | Used | Functional Category | 149 | +----------+-------+---------------------+ 150 | | FDRE | 49824 | Flop & Latch | 151 | | OBUF | 17920 | IO | 152 | | LUT3 | 8192 | LUT | 153 | | LUT2 | 8192 | LUT | 154 | | CARRY4 | 4096 | CarryLogic | 155 | | SRL16E | 2864 | Distributed Memory | 156 | | IBUF | 2049 | IO | 157 | | DSP48E1 | 1024 | Block Arithmetic | 158 | | SRLC32E | 48 | Distributed Memory | 159 | | BUFG | 1 | Clock | 160 | +----------+-------+---------------------+ 161 | 162 | 163 | 8. 
Black Boxes 164 | -------------- 165 | 166 | +----------+------+ 167 | | Ref Name | Used | 168 | +----------+------+ 169 | 170 | 171 | 9. Instantiated Netlists 172 | ------------------------ 173 | 174 | +----------+------+ 175 | | Ref Name | Used | 176 | +----------+------+ 177 | 178 | 179 | -------------------------------------------------------------------------------- /simu/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "ostream": "cpp" 4 | } 5 | } -------------------------------------------------------------------------------- /simu/README.md: -------------------------------------------------------------------------------- 1 | 用蒙地卡罗法对PE共用cache的必要体积进行模拟。 2 | 3 | 这个模拟的逻辑是:尽管我不知道神经网络中每条行/列向量的0是怎么分布的,但对于大量的含0向量,其分布必然趋近于随机(大数定理)。 4 | 因此,我直接用大量次数的随机0/1来模拟处理一对稀疏向量的内积时,需要多大体积的cache(即有多大的cache可以确保不丢失任何数据)。 5 | 6 | 显然,对于固定稀疏比例、固定向量长度的单个PE,所需的cache体积是一个概率分布:比方说,大部分情况下4组reg就够了,极少部分情况下要5个,极端罕见情况下要6个,等等。 7 | 8 | 单个pe的模拟结果是这样的: 9 | 向量长度50,0的比例10%,模拟次数一千次,最多需要5个额外的寄存器。 10 | 向量长度50,0的比例50%,模拟次数一千次,最多需要13+个额外的寄存器。 11 | 很悲观,因为不做稀疏计算的话,一个pe内有4个寄存器,相当于翻了好几倍。而且,随着模拟次数增加(一万次,十万次),极端情况会越来越吓人,可能动不动要20+个额外寄存器。 12 | 13 | 而对于多个pe共用cache,就很乐观:(每次模拟会有轻微浮动) 14 | 向量长度50,比例20%,8个pe共用cache,模拟1000次,平均每个pe需要2.625个额外寄存器。 15 | 向量长度50,比例50%,8个pe共用cache,模拟100次,平均每个pe需要4个额外寄存器。 16 | 向量长度50,比例50%,8个pe共用cache,模拟100000次,平均每个pe需要6个额外寄存器。 17 | 可以看出共用有两个好处: 18 | 1. 相同模拟次数下,平均单个pe的cache消耗变小了。 19 | 2. 
随着模拟次数变多,cache消耗不会爆炸,会收敛在某个值上,这个值的tradeoff可以接受。 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /simu/main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ziheng-W/systolic-array-verilog/bc64a45f1468f3e0352c2e921522b767ccf06cd0/simu/main -------------------------------------------------------------------------------- /simu/new_sim/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "array": "cpp", 4 | "atomic": "cpp", 5 | "bit": "cpp", 6 | "*.tcc": "cpp", 7 | "bitset": "cpp", 8 | "cctype": "cpp", 9 | "chrono": "cpp", 10 | "clocale": "cpp", 11 | "cmath": "cpp", 12 | "compare": "cpp", 13 | "concepts": "cpp", 14 | "condition_variable": "cpp", 15 | "cstdarg": "cpp", 16 | "cstddef": "cpp", 17 | "cstdint": "cpp", 18 | "cstdio": "cpp", 19 | "cstdlib": "cpp", 20 | "cstring": "cpp", 21 | "ctime": "cpp", 22 | "cwchar": "cpp", 23 | "cwctype": "cpp", 24 | "deque": "cpp", 25 | "string": "cpp", 26 | "unordered_map": "cpp", 27 | "vector": "cpp", 28 | "exception": "cpp", 29 | "algorithm": "cpp", 30 | "functional": "cpp", 31 | "iterator": "cpp", 32 | "memory": "cpp", 33 | "memory_resource": "cpp", 34 | "numeric": "cpp", 35 | "optional": "cpp", 36 | "random": "cpp", 37 | "ratio": "cpp", 38 | "string_view": "cpp", 39 | "system_error": "cpp", 40 | "tuple": "cpp", 41 | "type_traits": "cpp", 42 | "utility": "cpp", 43 | "initializer_list": "cpp", 44 | "iomanip": "cpp", 45 | "iosfwd": "cpp", 46 | "iostream": "cpp", 47 | "istream": "cpp", 48 | "limits": "cpp", 49 | "mutex": "cpp", 50 | "new": "cpp", 51 | "numbers": "cpp", 52 | "ostream": "cpp", 53 | "semaphore": "cpp", 54 | "sstream": "cpp", 55 | "stdexcept": "cpp", 56 | "stop_token": "cpp", 57 | "streambuf": "cpp", 58 | "thread": "cpp", 59 | "typeinfo": "cpp", 60 | "__nullptr": "cpp" 61 | } 62 | } 
-------------------------------------------------------------------------------- /simu/new_sim/draft: -------------------------------------------------------------------------------- 1 | 6 6 8 8 12 12 21 21 28 28 2 | 0 1 2 6 8 9 12 13 14 15 16 21 22 26 28 29 3 | 4 | 0 1 1 2 1 1 5 | 0 0 -1 -1 -1 0 0 1 0 1 0 0 0 0 -1 -2 maximum: 1 -------------------------------------------------------------------------------- /simu/new_sim/saving.txt: -------------------------------------------------------------------------------- 1 | # include 2 | # include 3 | # include 4 | # include 5 | # include 6 | # include 7 | # include 8 | # include 9 | using namespace std; 10 | 11 | // v_size: 原向量长度 12 | // v_cnt: 同侧向量个数 13 | // p: 稀疏度 14 | int monte_carlo(int v_size, int v_cnt, float p); 15 | void print_vector(vector &v, bool show_size); 16 | void print_vector(vector &v, bool show_size); 17 | bool random_bit(float p); 18 | 19 | int main(int argc, char* argv[]){ 20 | srand((unsigned)time(NULL)); 21 | // vector> v{}; 22 | // cout<{}); 24 | // cout< &v, bool show_size = 0){ 38 | for(int i=0; i &v, bool show_size = 0){ 46 | for(int i=0; i> sparse_vectors{}; 56 | vector masked_dense{}; 57 | vector> individual_masks{}; 58 | vector universal_mask{}; 59 | for (int i=0; i{}); 61 | individual_masks.push_back(vector{}); 62 | for (int j=0; j> left_input{}; 81 | for(int i=0; i{}); 83 | for(int j=0; j masked_dense[i]) { 116 | if (i%2 == 1 && left_input[k][i]!=99) 117 | curr_mem_size++; 118 | if (individual_masks[0][masked_dense[i]]) 119 | pop_request = true; 120 | } 121 | if (left_input[k][i] < masked_dense[i]) { 122 | if (i%2 == 0) 123 | pop_request = true; 124 | if (individual_masks[0][masked_dense[i]]) 125 | curr_mem_size++; 126 | } 127 | maxi_mem_size = max(maxi_mem_size, curr_mem_size); 128 | cout< 2 | # include 3 | # include 4 | # include 5 | # include 6 | # include 7 | # include 8 | # include 9 | using namespace std; 10 | 11 | // v_size: 原向量长度 12 | // v_cnt: 同侧向量个数 13 | // p: 稀疏度 14 | int 
monte_carlo(int v_size, int v_cnt, float p, bool display = 0); 15 | void print_vector(vector &v, bool show_size); 16 | void print_vector(vector &v, bool show_size); 17 | bool random_bit(float p); 18 | 19 | int main(int argc, char* argv[]){ 20 | srand((unsigned)time(NULL)); 21 | int size = atoi(argv[1]); 22 | int cnt = atoi(argv[2]); 23 | float sparsity = atof(argv[3]); 24 | float n =0; 25 | for( int i=0; i<100; i++){ 26 | // n += monte_carlo(size, cnt, sparsity, 1); 27 | n = max((int)n, monte_carlo(size, cnt, sparsity, 1)); 28 | 29 | } 30 | cout<<"n: "; 31 | cout< &v, bool show_size = 0){ 41 | for(int i=0; i &v, bool show_size = 0){ 49 | for(int i=0; i> curr_mem_sizes{}; 60 | bool pop_request{}; 61 | int maxi_length{}; 62 | 63 | 64 | 65 | // 生成稀疏向量、稠密向量和对应掩码 66 | vector> sparse_vectors{}; 67 | vector masked_dense{}; 68 | vector> individual_masks{}; 69 | vector universal_mask{}; 70 | for (int i=0; i{}); 72 | individual_masks.push_back(vector{}); 73 | for (int j=0; j> left_input{}; 92 | for(int i=0; i{}); 94 | if(sparse_vectors[i].size()*3 > masked_dense.size()){ 95 | for(int j=0; j{}); 122 | for(int kk=0; kk masked_dense[i]) { 142 | if (last_left == left_input[k][i] && left_input[k][i]!=99) 143 | curr_mem_size++; 144 | if (individual_masks[k][masked_dense[i]]) 145 | pop_request = true; 146 | } 147 | if (left_input[k][i] < masked_dense[i]) { 148 | if (last_left != left_input[k][i]) 149 | pop_request = true; 150 | if (individual_masks[k][masked_dense[i]]) 151 | curr_mem_size++; 152 | } 153 | curr_mem_sizes[k].push_back(curr_mem_size); 154 | last_left = left_input[k][i]; 155 | } 156 | // maxi_length = min(maxi_length, (int)curr_mem_sizes[k].size()); 157 | } 158 | 159 | if(display){ 160 | for(int k = 0; k 2 | # include 3 | # include 4 | # include 5 | # include 6 | # include 7 | #include 8 | 9 | using namespace std; 10 | 11 | int main(){ 12 | vector> v{}; 13 | v.push_back({}); 14 | v.push_back({}); 15 | v[0].push_back(1); 16 | v[1].push_back(2); 17 | cout<{}); 19 | // 
v[0].push_back(1); 20 | // v.push_back(vector{2}); 21 | // cout<