├── MAC module.v
├── MAC_scalable.sv
├── MMU module(4x4).v
├── MMU_test.v
├── Main module.png
├── Output verification.jpg
├── Problem Statement.pdf
├── README.md
├── Results.pdf
└── Systolicflow.png


/MAC module.v:
--------------------------------------------------------------------------------
1 | // MAC: one multiply-accumulate cell of the systolic array.
2 | //
3 | // On every rising clock edge with reset low the cell registers
4 | //   acc_out     <= acc_in + data_in * wt_path_in
5 | //   data_out    <= data_in
6 | //   wt_path_out <= wt_path_in
7 | // Reset is synchronous and clears all three output registers.
8 | //
9 | // NOTE(review): `control` is kept for interface compatibility but is not read
10 | // anywhere in this module. The README describes it as a weight-load control;
11 | // confirm whether a weight-hold register was intended.
12 | module MAC #(
13 |     parameter bit_width = 8,   // width of the data and weight words
14 |     parameter acc_width = 32   // width of the accumulator path
15 | )(
16 |     input clk,
17 |     input control,
18 |     input reset,
19 |     input [acc_width - 1:0] acc_in,
20 |     input [bit_width - 1:0] data_in,
21 |     input [bit_width - 1:0] wt_path_in,
22 |     output reg [acc_width - 1:0] acc_out,
23 |     output reg [bit_width - 1:0] data_out,
24 |     output reg [bit_width - 1:0] wt_path_out
25 | );
26 | 
27 |     // Full-width product and next accumulator value (purely combinational).
28 |     wire [2*bit_width - 1:0] product  = data_in * wt_path_in;
29 |     wire [acc_width - 1:0]   acc_next = acc_in + product;
30 | 
31 |     always @(posedge clk) begin
32 |         if (reset) begin
33 |             acc_out     <= {acc_width{1'b0}};
34 |             // data_out is cleared as well so no X leaves the cell after reset.
35 |             data_out    <= {bit_width{1'b0}};
36 |             wt_path_out <= {bit_width{1'b0}};
37 |         end
38 |         else begin
39 |             acc_out     <= acc_next;
40 |             data_out    <= data_in;
41 |             wt_path_out <= wt_path_in;
42 |         end
43 |     end
44 | endmodule
45 | 
46 | 
--------------------------------------------------------------------------------
/MAC_scalable.sv:
--------------------------------------------------------------------------------
1 | // TPU: scalable depth x depth systolic array built from MAC cells.
2 | //
3 | // Cell (i, j) receives
4 | //   - data:        data_arr slice i when j == 0, otherwise from cell (i, j-1);
5 | //   - weight:      wt_arr slice j when i == 0, otherwise from cell (i-1, j);
6 | //   - partial sum: zero when i == 0, otherwise from cell (i-1, j).
7 | // The partial sums of row depth-1 are registered into acc_out, column k in
8 | // bits [k*acc_width +: acc_width].
9 | //
10 | // The module has no reset port, so the MAC reset inputs are tied low.
11 | // NOTE(review): acc_out is acc_width*size bits wide but depth slices are
12 | // written, so size is presumably expected to equal depth -- confirm.
13 | module TPU #(parameter depth=4, bit_width=8, acc_width=24, size=4)
14 | (
15 |     clk,
16 |     control,
17 |     data_arr,
18 |     wt_arr,
19 |     acc_out
20 | );
21 |     input clk;
22 |     input control;
23 |     input [(bit_width*depth)-1:0] data_arr;
24 |     input [(bit_width*depth)-1:0] wt_arr;
25 |     output reg [acc_width*size-1:0] acc_out;
26 | 
27 |     // Per-cell outputs, indexed [row i][column j].
28 |     wire [bit_width-1:0] data_out[depth-1:0][depth-1:0];
29 |     wire [bit_width-1:0] wt_out[depth-1:0][depth-1:0];
30 |     wire [acc_width-1:0] acc_out_temp[depth-1:0][depth-1:0];
31 | 
32 |     // Every MAC is given this module's bit_width/acc_width so the port widths
33 |     // match the nets above instead of falling back to the MAC defaults.
34 |     generate
35 |         for (genvar i = 0; i < depth; i++) begin
36 |             for (genvar j = 0; j < depth; j++) begin
37 |                 // Top-left corner: data and weight both come from the inputs.
38 |                 if (i == 0 && j == 0) begin
39 |                     MAC #(.bit_width(bit_width), .acc_width(acc_width)) mac_instance (
40 |                         .clk(clk),
41 |                         .control(control),
42 |                         .reset(1'b0),
43 |                         .acc_in({acc_width{1'b0}}),
44 |                         .acc_out(acc_out_temp[i][j]),
45 |                         .data_in(data_arr[i*bit_width+:bit_width]),
46 |                         .wt_path_in(wt_arr[j*bit_width+:bit_width]),
47 |                         .data_out(data_out[i][j]),
48 |                         .wt_path_out(wt_out[i][j])
49 |                     );
50 |                 end
51 | 
52 |                 // Rest of the top row: weight from wt_arr, data from the left.
53 |                 if (i == 0 && j != 0) begin
54 |                     MAC #(.bit_width(bit_width), .acc_width(acc_width)) mac_instance (
55 |                         .clk(clk),
56 |                         .control(control),
57 |                         .reset(1'b0),
58 |                         .acc_in({acc_width{1'b0}}),
59 |                         .acc_out(acc_out_temp[i][j]),
60 |                         .data_in(data_out[i][j-1]),
61 |                         .wt_path_in(wt_arr[j*bit_width+:bit_width]),
62 |                         .data_out(data_out[i][j]),
63 |                         .wt_path_out(wt_out[i][j])
64 |                     );
65 |                 end
66 | 
67 |                 // Rest of the left column: data from data_arr, weight from above.
68 |                 if (i != 0 && j == 0) begin
69 |                     MAC #(.bit_width(bit_width), .acc_width(acc_width)) mac_instance (
70 |                         .clk(clk),
71 |                         .control(control),
72 |                         .reset(1'b0),
73 |                         .acc_in(acc_out_temp[i-1][j]),
74 |                         .acc_out(acc_out_temp[i][j]),
75 |                         .data_in(data_arr[i*bit_width+:bit_width]),
76 |                         .wt_path_in(wt_out[i-1][j]),
77 |                         .data_out(data_out[i][j]),
78 |                         .wt_path_out(wt_out[i][j])
79 |                     );
80 |                 end
81 | 
82 |                 // Interior cells: data from the left, weight and sum from above.
83 |                 if (i != 0 && j != 0) begin
84 |                     MAC #(.bit_width(bit_width), .acc_width(acc_width)) mac_instance (
85 |                         .clk(clk),
86 |                         .control(control),
87 |                         .reset(1'b0),
88 |                         .acc_in(acc_out_temp[i-1][j]),
89 |                         .acc_out(acc_out_temp[i][j]),
90 |                         .data_in(data_out[i][j-1]),
91 |                         .wt_path_in(wt_out[i-1][j]),
92 |                         .data_out(data_out[i][j]),
93 |                         .wt_path_out(wt_out[i][j])
94 |                     );
95 |                 end
96 |             end
97 |         end
98 |     endgenerate
99 | 
100 |     // Register the bottom row of partial sums into the packed output.
101 |     generate
102 |         for (genvar k = 0; k < depth; k++) begin
103 |             always @(posedge clk) begin
104 |                 acc_out[k*acc_width+:acc_width] <= acc_out_temp[depth-1][k];
105 |             end
106 |         end
107 |     endgenerate
108 | 
109 | endmodule
--------------------------------------------------------------------------------
/MMU module(4x4).v:
--------------------------------------------------------------------------------
1 | // MMU module: fixed 4x4 systolic array of MAC cells.
2 | //
3 | // Instance mXY: X is the position along the data path, Y the position along
4 | // the weight / partial-sum path.
5 | //   - data:        m0Y takes data_arr slice Y, mXY (X > 0) takes it from m(X-1)Y;
6 | //   - weight:      mX0 takes wt_arr slice X,  mXY (Y > 0) takes it from mX(Y-1);
7 | //   - partial sum: mX0 starts from zero,      mXY (Y > 0) takes it from mX(Y-1).
8 | // acc_out registers {acc_out33, acc_out23, acc_out13, acc_out03}.
9 | //
10 | // All MAC ports are connected by name: the MAC port order is
11 | // (clk, control, reset, acc_in, data_in, wt_path_in, acc_out, data_out,
12 | // wt_path_out), so an ordered list written as (..., acc_in, acc_out, data_in,
13 | // wt_path_in, ...) would cross inputs and outputs.
14 | //
15 | // NOTE(review): the array is written out for 4x4 only; depth and size are
16 | // presumably expected to stay at 4 -- confirm before overriding them.
17 | module MMU #(parameter depth=4, bit_width=8, acc_width=32, size=4)(
18 |     input clk,
19 |     input control,
20 |     input reset,
21 |     input [(bit_width*depth)-1:0] data_arr,
22 |     input [(bit_width*depth)-1:0] wt_arr,
23 |     output reg [acc_width*size-1:0] acc_out
24 | );
25 | 
26 |     wire [bit_width-1:0] data_out00, data_out01, data_out02, data_out03,
27 |                          data_out10, data_out11, data_out12, data_out13,
28 |                          data_out20, data_out21, data_out22, data_out23,
29 |                          data_out30, data_out31, data_out32, data_out33;
30 |     wire [bit_width-1:0] wt_out00, wt_out01, wt_out02, wt_out03,
31 |                          wt_out10, wt_out11, wt_out12, wt_out13,
32 |                          wt_out20, wt_out21, wt_out22, wt_out23,
33 |                          wt_out30, wt_out31, wt_out32, wt_out33;
34 |     wire [acc_width-1:0] acc_out00, acc_out01, acc_out02, acc_out03,
35 |                          acc_out10, acc_out11, acc_out12, acc_out13,
36 |                          acc_out20, acc_out21, acc_out22, acc_out23,
37 |                          acc_out30, acc_out31, acc_out32, acc_out33;
38 | 
39 |     // ---- Y = 0: weights enter from wt_arr, partial sums start at zero ----
40 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m00 (
41 |         .clk(clk), .control(control), .reset(reset),
42 |         .acc_in({acc_width{1'b0}}), .acc_out(acc_out00),
43 |         .data_in(data_arr[0*bit_width +: bit_width]), .wt_path_in(wt_arr[0*bit_width +: bit_width]),
44 |         .data_out(data_out00), .wt_path_out(wt_out00));
45 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m10 (
46 |         .clk(clk), .control(control), .reset(reset),
47 |         .acc_in({acc_width{1'b0}}), .acc_out(acc_out10),
48 |         .data_in(data_out00), .wt_path_in(wt_arr[1*bit_width +: bit_width]),
49 |         .data_out(data_out10), .wt_path_out(wt_out10));
50 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m20 (
51 |         .clk(clk), .control(control), .reset(reset),
52 |         .acc_in({acc_width{1'b0}}), .acc_out(acc_out20),
53 |         .data_in(data_out10), .wt_path_in(wt_arr[2*bit_width +: bit_width]),
54 |         .data_out(data_out20), .wt_path_out(wt_out20));
55 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m30 (
56 |         .clk(clk), .control(control), .reset(reset),
57 |         .acc_in({acc_width{1'b0}}), .acc_out(acc_out30),
58 |         .data_in(data_out20), .wt_path_in(wt_arr[3*bit_width +: bit_width]),
59 |         .data_out(data_out30), .wt_path_out(wt_out30));
60 | 
61 |     // ---- Y = 1 ----
62 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m01 (
63 |         .clk(clk), .control(control), .reset(reset),
64 |         .acc_in(acc_out00), .acc_out(acc_out01),
65 |         .data_in(data_arr[1*bit_width +: bit_width]), .wt_path_in(wt_out00),
66 |         .data_out(data_out01), .wt_path_out(wt_out01));
67 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m11 (
68 |         .clk(clk), .control(control), .reset(reset),
69 |         .acc_in(acc_out10), .acc_out(acc_out11),
70 |         .data_in(data_out01), .wt_path_in(wt_out10),
71 |         .data_out(data_out11), .wt_path_out(wt_out11));
72 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m21 (
73 |         .clk(clk), .control(control), .reset(reset),
74 |         .acc_in(acc_out20), .acc_out(acc_out21),
75 |         .data_in(data_out11), .wt_path_in(wt_out20),
76 |         .data_out(data_out21), .wt_path_out(wt_out21));
77 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m31 (
78 |         .clk(clk), .control(control), .reset(reset),
79 |         .acc_in(acc_out30), .acc_out(acc_out31),
80 |         .data_in(data_out21), .wt_path_in(wt_out30),
81 |         .data_out(data_out31), .wt_path_out(wt_out31));
82 | 
83 |     // ---- Y = 2 ----
84 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m02 (
85 |         .clk(clk), .control(control), .reset(reset),
86 |         .acc_in(acc_out01), .acc_out(acc_out02),
87 |         .data_in(data_arr[2*bit_width +: bit_width]), .wt_path_in(wt_out01),
88 |         .data_out(data_out02), .wt_path_out(wt_out02));
89 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m12 (
90 |         .clk(clk), .control(control), .reset(reset),
91 |         .acc_in(acc_out11), .acc_out(acc_out12),
92 |         .data_in(data_out02), .wt_path_in(wt_out11),
93 |         .data_out(data_out12), .wt_path_out(wt_out12));
94 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m22 (
95 |         .clk(clk), .control(control), .reset(reset),
96 |         .acc_in(acc_out21), .acc_out(acc_out22),
97 |         .data_in(data_out12), .wt_path_in(wt_out21),
98 |         .data_out(data_out22), .wt_path_out(wt_out22));
99 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m32 (
100 |         .clk(clk), .control(control), .reset(reset),
101 |         .acc_in(acc_out31), .acc_out(acc_out32),
102 |         .data_in(data_out22), .wt_path_in(wt_out31),
103 |         .data_out(data_out32), .wt_path_out(wt_out32));
104 | 
105 |     // ---- Y = 3: these partial sums feed the registered output ----
106 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m03 (
107 |         .clk(clk), .control(control), .reset(reset),
108 |         .acc_in(acc_out02), .acc_out(acc_out03),
109 |         .data_in(data_arr[3*bit_width +: bit_width]), .wt_path_in(wt_out02),
110 |         .data_out(data_out03), .wt_path_out(wt_out03));
111 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m13 (
112 |         .clk(clk), .control(control), .reset(reset),
113 |         .acc_in(acc_out12), .acc_out(acc_out13),
114 |         .data_in(data_out03), .wt_path_in(wt_out12),
115 |         .data_out(data_out13), .wt_path_out(wt_out13));
116 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m23 (
117 |         .clk(clk), .control(control), .reset(reset),
118 |         .acc_in(acc_out22), .acc_out(acc_out23),
119 |         .data_in(data_out13), .wt_path_in(wt_out22),
120 |         .data_out(data_out23), .wt_path_out(wt_out23));
121 |     MAC #(.bit_width(bit_width), .acc_width(acc_width)) m33 (
122 |         .clk(clk), .control(control), .reset(reset),
123 |         .acc_in(acc_out32), .acc_out(acc_out33),
124 |         .data_in(data_out23), .wt_path_in(wt_out32),
125 |         .data_out(data_out33), .wt_path_out(wt_out33));
126 | 
127 |     // Pack the four final partial sums, m03 in the least significant slice.
128 |     always @(posedge clk) begin
129 |         acc_out <= {acc_out33, acc_out23, acc_out13, acc_out03};
130 |     end
131 | 
132 | endmodule
133 | 
--------------------------------------------------------------------------------
/MMU_test.v:
--------------------------------------------------------------------------------
1 | `timescale 1ns / 1ps
2 | 
3 | // Sample testbench for the 4x4 systolic array (MMU).
4 | //
5 | // Sequence: one clock edge with reset high, four cycles of weight words with
6 | // control high, then seven cycles of skewed data words with control low.
7 | // Stimulus uses nonblocking assignments so the DUT always samples the value
8 | // from before the clock edge, never a value changed on that same edge.
9 | module test_TPU;
10 | 
11 |     // Inputs
12 |     reg clk;
13 |     reg control;
14 |     reg reset;
15 |     reg [31:0] data_arr;
16 |     reg [31:0] wt_arr;
17 | 
18 |     // Outputs
19 |     wire [127:0] acc_out;
20 | 
21 |     // Instantiate the Unit Under Test (UUT)
22 |     MMU uut (
23 |         .clk(clk),
24 |         .control(control),
25 |         .reset(reset),
26 |         .data_arr(data_arr),
27 |         .wt_arr(wt_arr),
28 |         .acc_out(acc_out)
29 |     );
30 | 
31 |     // 500 ns clock period
32 |     always
33 |         #250 clk = !clk;
34 | 
35 |     initial begin
36 |         // Initialize inputs; reset is held high across the first clock edge
37 |         // so the MAC registers start from a known state.
38 |         clk = 0;
39 |         control = 0;
40 |         reset = 1;
41 |         data_arr = 0;
42 |         wt_arr = 0;
43 | 
44 |         // Weight words
45 |         @(posedge clk);
46 |         reset <= 0;
47 |         control <= 1;
48 |         wt_arr <= 32'h05020304;
49 | 
50 |         @(posedge clk);
51 |         wt_arr <= 32'h03010203;
52 | 
53 |         @(posedge clk);
54 |         wt_arr <= 32'h07040102;
55 | 
56 |         @(posedge clk);
57 |         wt_arr <= 32'h01020403;
58 | 
59 |         // Data words
60 |         @(posedge clk);
61 |         control <= 0;
62 |         data_arr <= 32'h00000001;
63 | 
64 |         @(posedge clk);
65 |         data_arr <= 32'h00000102;
66 | 
67 |         @(posedge clk);
68 |         data_arr <= 32'h00010200;
69 | 
70 |         @(posedge clk);
71 |         data_arr <= 32'h00010100;
72 | 
73 |         @(posedge clk);
74 |         data_arr <= 32'h02030200;
75 | 
76 |         @(posedge clk);
77 |         data_arr <= 32'h04010000;
78 | 
79 |         @(posedge clk);
80 |         data_arr <= 32'h05000000;
81 | 
82 |         // Let the last words drain through the array, then end the run.
83 |         repeat (10) @(posedge clk);
84 |         $finish;
85 |     end
86 | 
87 | endmodule
88 | 
--------------------------------------------------------------------------------
/Main
module.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvgprasanth/Systolic-Array-Matrix-Multiplication/ca98ebbab9b8579867003b8f8adb5bfd5e6e63bb/Main module.png -------------------------------------------------------------------------------- /Output verification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvgprasanth/Systolic-Array-Matrix-Multiplication/ca98ebbab9b8579867003b8f8adb5bfd5e6e63bb/Output verification.jpg -------------------------------------------------------------------------------- /Problem Statement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvgprasanth/Systolic-Array-Matrix-Multiplication/ca98ebbab9b8579867003b8f8adb5bfd5e6e63bb/Problem Statement.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Systolic-Array-Matrix-Multiplication 2 | Implementation of a weight-stationary systolic array with a base size of 4x4, scalable up to 256x256. 3 | 4 | The MMU (Matrix Multiplication Unit) module is the top-level module that represents a systolic array for 5 | matrix multiplication. It takes several inputs, processes them systolically through multiple MAC (Multiply-Accumulate) 6 | units arranged in a 2D array, and produces an output accumulator result. 7 | The MAC (Multiply-Accumulate) module represents a single multiply-accumulate unit. It takes inputs, 8 | multiplies data with weight, accumulates the results, and produces output data and accumulation. Overall, 9 | the MMU module orchestrates the interaction between multiple MAC modules, arranging them in a 10 | systolic array fashion to perform matrix multiplication. 
The MAC module represents a single multiply-accumulate operation, with control for weight loading and accumulator reset. The design as a whole is intended for matrix multiplication operations in a systolic array configuration. Careful data and weight flow management ensures system correctness, confirmed through rigorous testing and verification procedures. Moving forward, DC synthesis using the ASAP7 PDK delivered comprehensive reports on area, timing, power, synthesis, and potential violations. Post-synthesis gate-level simulation results were also verified. This analysis unveils deeper insights into the design's performance and behavior. The detailed explanation can be found in Results.pdf. 11 | 12 | -------------------------------------------------------------------------------- /Results.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvgprasanth/Systolic-Array-Matrix-Multiplication/ca98ebbab9b8579867003b8f8adb5bfd5e6e63bb/Results.pdf -------------------------------------------------------------------------------- /Systolicflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvgprasanth/Systolic-Array-Matrix-Multiplication/ca98ebbab9b8579867003b8f8adb5bfd5e6e63bb/Systolicflow.png --------------------------------------------------------------------------------