├── .github ├── verible.waiver └── workflows │ ├── ci.yml │ ├── gitlab-ci.yml~ │ └── lint.yml ├── .gitlab-ci.yml ├── Bender.local ├── Bender.lock ├── Bender.yml ├── LICENSE_HW ├── Makefile ├── README.md ├── bender_common.mk ├── bender_sim.mk ├── bender_synth.mk ├── doc ├── RedmuleSubsystem-CoreComplex.png ├── redmule_complex_testbench.png ├── redmule_overview.png └── redmule_testbench.png ├── golden-model ├── FP16 │ └── scripts │ │ ├── addmax.py │ │ ├── addmin.py │ │ ├── dump_utils.py │ │ ├── gemm.py │ │ ├── maxmin.py │ │ ├── minmax.py │ │ ├── mulmax.py │ │ └── mulmin.py ├── FP8 │ └── scripts │ │ ├── addmax.py │ │ ├── addmin.py │ │ ├── dump_utils.py │ │ ├── gemm.py │ │ ├── maxmin.py │ │ ├── minmax.py │ │ ├── mulmax.py │ │ └── mulmin.py ├── Makefile └── setup-py.sh ├── rtl ├── deprecated │ ├── redmule_complex_wrap.sv │ └── redmule_wrap.sv ├── redmule_castin.sv ├── redmule_castout.sv ├── redmule_ce.sv ├── redmule_complex.sv ├── redmule_ctrl.sv ├── redmule_engine.sv ├── redmule_fma.sv ├── redmule_inst_decoder.sv ├── redmule_memory_scheduler.sv ├── redmule_noncomp.sv ├── redmule_pkg.sv ├── redmule_row.sv ├── redmule_scheduler.sv ├── redmule_streamer.sv ├── redmule_tiler.sv ├── redmule_top.sv ├── w_buffer │ ├── redmule_w_buffer.sv │ └── redmule_w_buffer_scm.sv ├── x_buffer │ ├── redmule_x_buffer.sv │ ├── redmule_x_buffer_scm.sv │ └── redmule_x_pad_scm.sv └── z_buffer │ ├── redmule_z_buffer.sv │ └── redmule_z_buffer_scm.sv ├── scripts ├── parse_s19.py ├── regression-list.sh ├── s19tomem.py ├── setup-complex.sh ├── setup-hwpe.sh ├── setup64.sh └── stack_init.py ├── sw ├── archi_redmule.h ├── hal_redmule.h ├── kernel │ ├── crt0.S │ └── link.ld ├── redmule.c └── utils │ ├── redmule_utils.h │ └── tinyprintf.h └── target └── sim ├── src ├── redmule_tb.cpp ├── redmule_tb.sv ├── redmule_tb_wrap.sv └── tb_dummy_memory.sv ├── verilator └── verilator.mk └── vsim ├── vsim.mk └── wave.tcl /.github/verible.waiver: 
-------------------------------------------------------------------------------- 1 | # Copyright 2022 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Disable line length check 6 | waive --rule=line-length 7 | # Disable parameter style check 8 | waive --rule=parameter-name-style 9 | # Disable default check in case statements 10 | waive --rule=case-missing-default 11 | # Disable default check typedef structure parameters 12 | waive --rule=typedef-structs-unions -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 OpenHW Group 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # Run functional regression checks 6 | name: ci 7 | on: [push, pull_request] 8 | 9 | jobs: 10 | install-tools: 11 | name: install-tools 12 | runs-on: ubuntu-latest 13 | env: 14 | NUM_JOBS: 8 15 | steps: 16 | - uses: actions/checkout@v4 17 | with: 18 | submodules: recursive 19 | 20 | - name: Verify Verilator installation 21 | run: | 22 | make verilator 23 | 24 | - name: Verify GCC installation 25 | run: | 26 | make riscv32-gcc 27 | 28 | - name: Verify bender installation 29 | run: | 30 | make bender 31 | 32 | - name: Verify Python tools installation 33 | run: | 34 | cd golden-model && source setup-py.sh 35 | 36 | # - name: Install 37 | # run: | 38 | # make verilator riscv32-gcc bender; cd golden-model && source setup-py.sh 39 | 40 | run-hwpe-tests: 41 | name: run-hwpe-tests 42 | runs-on: ubuntu-latest 43 | env: 44 | Target: verilator 45 | Gcc: vendor/install/riscv/bin/ 46 | UseXif: 0 47 | 48 | needs: 49 | install-tools 50 | steps: 51 | - uses: actions/checkout@v4 52 | with: 53 | submodules: recursive 54 | 55 | - name: Install required tools 56 | 
run: | 57 | make bender 58 | make riscv32-gcc 59 | make verilator 60 | pip3 install numpy 61 | cd golden-model && source setup-py.sh && cd .. 62 | 63 | - name: Run Tests 64 | run: | 65 | source scripts/regression-list.sh 66 | 67 | run-complex-tests: 68 | name: run-complex-tests 69 | runs-on: ubuntu-latest 70 | env: 71 | Target: verilator 72 | Gcc: vendor/install/riscv/bin/ 73 | UseXif: 1 74 | 75 | needs: 76 | install-tools 77 | steps: 78 | - uses: actions/checkout@v4 79 | with: 80 | submodules: recursive 81 | 82 | - name: Install required tools 83 | run: | 84 | make bender 85 | make riscv32-gcc 86 | make verilator 87 | pip3 install numpy 88 | cd golden-model && source setup-py.sh && cd .. 89 | 90 | - name: Run Tests 91 | run: | 92 | source scripts/regression-list.sh 93 | -------------------------------------------------------------------------------- /.github/workflows/gitlab-ci.yml~: -------------------------------------------------------------------------------- 1 | # Copyright 2022 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: gitlab-ci 6 | 7 | on: [ push, pull_request, workflow_dispatch ] 8 | 9 | jobs: 10 | 11 | check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - 15 | name: Mirror and check 16 | uses: pulp-platform/pulp-actions/gitlab-ci@v2 17 | # Skip on forks or pull requests from forks due to missing secrets. 
18 | if: > 19 | github.repository == 'pulp-platform/redmule' && 20 | (github.event_name != 'pull_request' || 21 | github.event.pull_request.head.repo.full_name == github.repository) 22 | with: 23 | domain: iis-git.ee.ethz.ch 24 | repo: github-mirror/redmule 25 | token: ${{ secrets.GITLAB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2022 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | name: lint 6 | 7 | on: [ push, pull_request, workflow_dispatch ] 8 | 9 | jobs: 10 | 11 | lint-license: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - 15 | name: Checkout 16 | uses: actions/checkout@v3 17 | - 18 | name: Check license 19 | uses: pulp-platform/pulp-actions/lint-license@v2 20 | with: 21 | license: | 22 | Copyright (\d{4}(-\d{4})?\s)?.* 23 | (Solderpad Hardware License, Version 0.51|Licensed under the Apache License, Version 2.0), see LICENSE for details. 
24 | SPDX-License-Identifier: (SHL-0.51|Apache-2.0) 25 | # Exclude generated headers (no license checker support for optional lines) 26 | exclude_paths: | 27 | sw/utils/tinyprintf.h 28 | sw/inc/* 29 | *.md 30 | *.do 31 | *.lock 32 | LICENSE* 33 | 34 | lint-sv: 35 | runs-on: ubuntu-latest 36 | steps: 37 | - 38 | name: Checkout 39 | uses: actions/checkout@v3 40 | - 41 | name: Run Verible 42 | uses: chipsalliance/verible-linter-action@main 43 | with: 44 | paths: rtl 45 | exclude_paths: | 46 | rtl/redmule_noncomp.sv 47 | rtl/redmule_fma.sv 48 | extra_args: "--waiver_files .github/verible.waiver" 49 | github_token: ${{ secrets.GITHUB_TOKEN }} 50 | fail_on_error: true 51 | reviewdog_reporter: github-check 52 | 53 | lint-cxx: 54 | runs-on: ubuntu-latest 55 | steps: 56 | - 57 | name: Checkout 58 | uses: actions/checkout@v3 59 | - 60 | name: Run Clang-format 61 | uses: DoozyX/clang-format-lint-action@v0.14 62 | with: 63 | extensions: 'c,h,cpp' 64 | clangFormatVersion: 14 65 | style: > 66 | { 67 | IndentWidth: 2, 68 | ColumnLimit: 100, 69 | AlignEscapedNewlines: DontAlign, 70 | SortIncludes: false, 71 | AllowShortFunctionsOnASingleLine: true, 72 | AllowShortIfStatementsOnASingleLine: true, 73 | AllowShortLoopsOnASingleLine: true 74 | } 75 | exclude: | 76 | ./sw/inc/* 77 | ./sw/utils/tinyprintf.h 78 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2022 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | 7 | .base: 8 | artifacts: 9 | when: always 10 | expire_in: 1 week 11 | 12 | stages: 13 | - init 14 | - build 15 | - test 16 | 17 | init: 18 | stage: init 19 | extends: .base 20 | script: 21 | - cd golden-model; source setup-py.sh 22 | artifacts: 23 | when: always 24 | expire_in: 1 week 25 | paths: 26 | - ./golden-model/venv 27 | 28 | .redmule-build-tpl: 29 | extends: .base 30 | stage: build 31 | dependencies: 32 | - init 33 | script: 34 | - SETUP_CONFIG=${SETUP_CONFIG} 35 | - source scripts/${SETUP_CONFIG}.sh 36 | - make update-ips 37 | - make hw-build 38 | artifacts: 39 | when: always 40 | expire_in: 1 week 41 | paths: 42 | - ./.bender 43 | - ./scripts/compile.tcl 44 | - ./vsim/* 45 | 46 | redmule-build-hwpe: 47 | extends: .redmule-build-tpl 48 | variables: 49 | SETUP_CONFIG: "setup-hwpe" 50 | 51 | redmule-build-complex: 52 | extends: .redmule-build-tpl 53 | variables: 54 | SETUP_CONFIG: "setup-complex" 55 | 56 | .redmule-vsim-tpl: 57 | extends: .base 58 | stage: test 59 | script: 60 | - SETUP_CONFIG=${SETUP_CONFIG} 61 | - source scripts/${SETUP_CONFIG}.sh 62 | - make golden OP=${OP} M=${M} N=${N} K=${K} fp_fmt=${FMT} 63 | - make sw-clean sw-build 64 | - make run 65 | - '(grep -rn "Success!" ./vsim/transcript)' 66 | - (! grep -rn "Fail!" 
./vsim/transcript) 67 | 68 | hwpe-test: 69 | extends: .redmule-vsim-tpl 70 | dependencies: 71 | - redmule-build-hwpe 72 | variables: 73 | SETUP_CONFIG: "setup-hwpe" 74 | parallel: 75 | matrix: 76 | - { OP: gemm, M: 96, N: 96, K: 96, FMT: FP16 } 77 | - { OP: gemm, M: 128, N: 128, K: 128, FMT: FP16 } 78 | - { OP: gemm, M: 12, N: 16, K: 16, FMT: FP16 } 79 | - { OP: gemm, M: 24, N: 16, K: 16, FMT: FP16 } 80 | - { OP: gemm, M: 48, N: 32, K: 32, FMT: FP16 } 81 | - { OP: gemm, M: 30, N: 32, K: 17, FMT: FP16 } 82 | - { OP: gemm, M: 24, N: 32, K: 1, FMT: FP16 } 83 | - { OP: gemm, M: 31, N: 32, K: 16, FMT: FP16 } 84 | - { OP: gemm, M: 17, N: 32, K: 16, FMT: FP16 } 85 | - { OP: gemm, M: 31, N: 32, K: 31, FMT: FP16 } 86 | - { OP: gemm, M: 17, N: 32, K: 3, FMT: FP16 } 87 | - { OP: gemm, M: 5, N: 32, K: 17, FMT: FP16 } 88 | - { OP: gemm, M: 5, N: 32, K: 3, FMT: FP16 } 89 | - { OP: gemm, M: 36, N: 31, K: 32, FMT: FP16 } 90 | - { OP: gemm, M: 12, N: 31, K: 16, FMT: FP16 } 91 | - { OP: gemm, M: 23, N: 31, K: 31, FMT: FP16 } 92 | - { OP: gemm, M: 24, N: 17, K: 32, FMT: FP16 } 93 | - { OP: gemm, M: 24, N: 20, K: 32, FMT: FP16 } 94 | 95 | complex-test: 96 | extends: .redmule-vsim-tpl 97 | dependencies: 98 | - redmule-build-complex 99 | variables: 100 | SETUP_CONFIG: "setup-complex" 101 | parallel: 102 | matrix: 103 | - { OP: gemm, M: 96, N: 96, K: 96, FMT: FP16 } 104 | - { OP: gemm, M: 128, N: 128, K: 128, FMT: FP16 } 105 | - { OP: gemm, M: 12, N: 16, K: 16, FMT: FP16 } 106 | - { OP: gemm, M: 24, N: 16, K: 16, FMT: FP16 } 107 | - { OP: gemm, M: 48, N: 32, K: 32, FMT: FP16 } 108 | - { OP: gemm, M: 30, N: 32, K: 17, FMT: FP16 } 109 | - { OP: gemm, M: 24, N: 32, K: 1, FMT: FP16 } 110 | - { OP: gemm, M: 31, N: 32, K: 16, FMT: FP16 } 111 | - { OP: gemm, M: 17, N: 32, K: 16, FMT: FP16 } 112 | - { OP: gemm, M: 31, N: 32, K: 31, FMT: FP16 } 113 | - { OP: gemm, M: 17, N: 32, K: 3, FMT: FP16 } 114 | - { OP: gemm, M: 5, N: 32, K: 17, FMT: FP16 } 115 | - { OP: gemm, M: 5, N: 32, K: 3, FMT: FP16 } 
116 | - { OP: gemm, M: 36, N: 31, K: 32, FMT: FP16 } 117 | - { OP: gemm, M: 12, N: 31, K: 16, FMT: FP16 } 118 | - { OP: gemm, M: 23, N: 31, K: 31, FMT: FP16 } 119 | - { OP: gemm, M: 24, N: 17, K: 32, FMT: FP16 } 120 | - { OP: gemm, M: 24, N: 20, K: 32, FMT: FP16 } 121 | -------------------------------------------------------------------------------- /Bender.local: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | overrides: 9 | fpnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: "pulp-v0.1.3" } -------------------------------------------------------------------------------- /Bender.lock: -------------------------------------------------------------------------------- 1 | packages: 2 | cluster_interconnect: 3 | revision: 1284def6c0b7f7e9355eb093d00883ad9dead1b7 4 | version: null 5 | source: 6 | Git: https://github.com/pulp-platform/cluster_interconnect.git 7 | dependencies: 8 | - common_cells 9 | common_cells: 10 | revision: 9afda9abb565971649c2aa0985639c096f351171 11 | version: 1.38.0 12 | source: 13 | Git: https://github.com/pulp-platform/common_cells.git 14 | dependencies: 15 | - common_verification 16 | - tech_cells_generic 17 | common_verification: 18 | revision: fb1885f48ea46164a10568aeff51884389f67ae3 19 | version: 0.2.5 20 | source: 21 | Git: https://github.com/pulp-platform/common_verification.git 22 | dependencies: [] 23 | cv32e40p: 24 | revision: 800a09d97a1e9418e127e8bbf1763c1d1097c92f 25 | version: null 26 | source: 27 | Git: https://github.com/pulp-platform/cv32e40p.git 28 | dependencies: 29 | - common_cells 30 | - fpnew 31 | - tech_cells_generic 32 | cv32e40x: 33 | revision: 96b933ac2f723351872da55e7d2e9a82abd5df34 34 | version: null 35 | source: 36 | Git: 
https://github.com/pulp-platform/cv32e40x.git 37 | dependencies: [] 38 | fpnew: 39 | revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 40 | version: null 41 | source: 42 | Git: https://github.com/pulp-platform/cvfpu.git 43 | dependencies: 44 | - common_cells 45 | - fpu_div_sqrt_mvp 46 | fpu_div_sqrt_mvp: 47 | revision: 86e1f558b3c95e91577c41b2fc452c86b04e85ac 48 | version: 1.0.4 49 | source: 50 | Git: https://github.com/pulp-platform/fpu_div_sqrt_mvp.git 51 | dependencies: 52 | - common_cells 53 | hci: 54 | revision: 06fcba671e060f2e1b03b7ebe2d3e719f1557099 55 | version: null 56 | source: 57 | Git: https://github.com/pulp-platform/hci.git 58 | dependencies: 59 | - cluster_interconnect 60 | - hwpe-stream 61 | - l2_tcdm_hybrid_interco 62 | hwpe-ctrl: 63 | revision: 3690a3c648f120546d8de2bc583d5170c36d2f20 64 | version: null 65 | source: 66 | Git: https://github.com/pulp-platform/hwpe-ctrl.git 67 | dependencies: 68 | - tech_cells_generic 69 | hwpe-stream: 70 | revision: b3d33afdd27e79bcda1348d0ab5f4afd52c03106 71 | version: 1.9.0 72 | source: 73 | Git: https://github.com/pulp-platform/hwpe-stream.git 74 | dependencies: 75 | - tech_cells_generic 76 | ibex: 77 | revision: b18f7ef178ed07f5085051f96042c670a919fd5c 78 | version: null 79 | source: 80 | Git: https://github.com/pulp-platform/ibex.git 81 | dependencies: 82 | - tech_cells_generic 83 | l2_tcdm_hybrid_interco: 84 | revision: fa55e72859dcfb117a2788a77352193bef94ff2b 85 | version: 1.0.0 86 | source: 87 | Git: https://github.com/pulp-platform/L2_tcdm_hybrid_interco.git 88 | dependencies: [] 89 | obi: 90 | revision: 8097928cf1b43712f93d5356f336397879b4ad2c 91 | version: 0.1.6 92 | source: 93 | Git: https://github.com/pulp-platform/obi.git 94 | dependencies: 95 | - common_cells 96 | - common_verification 97 | tech_cells_generic: 98 | revision: 7968dd6e6180df2c644636bc6d2908a49f2190cf 99 | version: 0.2.13 100 | source: 101 | Git: https://github.com/pulp-platform/tech_cells_generic.git 102 | dependencies: 103 | - 
common_verification 104 | -------------------------------------------------------------------------------- /Bender.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | # Bender manifest 8 | 9 | package: 10 | name: redmule 11 | authors: 12 | - "Yvan Tortorella (yvan.tortorella@unibo.it)" 13 | 14 | dependencies: 15 | cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: "pulpissimo-v4.1.0" } 16 | cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: "redmule-v1.1" } 17 | ibex : { git: "https://github.com/pulp-platform/ibex.git" , rev: pulpissimo-v6.1.2 } 18 | hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , version: =1.9.0 } 19 | hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , rev: "3690a3c" } # master 20 | hci : { git: "https://github.com/pulp-platform/hci.git" , rev: "06fcba6" } # main 21 | fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: "pulp-v0.1.3" } 22 | common_cells : { git: "https://github.com/pulp-platform/common_cells.git" , version: =1.38.0 } 23 | obi : { git: "https://github.com/pulp-platform/obi.git" , version: =0.1.6 } 24 | tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: =0.2.13 } 25 | 26 | sources: 27 | files: 28 | # RedMulE 29 | - rtl/redmule_pkg.sv 30 | - rtl/redmule_tiler.sv 31 | - rtl/redmule_ctrl.sv 32 | - rtl/redmule_scheduler.sv 33 | - rtl/redmule_castin.sv 34 | - rtl/redmule_castout.sv 35 | - rtl/redmule_streamer.sv 36 | - rtl/x_buffer/redmule_x_buffer.sv 37 | - rtl/x_buffer/redmule_x_pad_scm.sv 38 | - rtl/x_buffer/redmule_x_buffer_scm.sv 39 | - rtl/w_buffer/redmule_w_buffer.sv 40 | - rtl/w_buffer/redmule_w_buffer_scm.sv 41 | - 
rtl/z_buffer/redmule_z_buffer.sv 42 | - rtl/z_buffer/redmule_z_buffer_scm.sv 43 | - rtl/redmule_fma.sv 44 | - rtl/redmule_noncomp.sv 45 | - rtl/redmule_ce.sv 46 | - rtl/redmule_row.sv 47 | - rtl/redmule_engine.sv 48 | - rtl/redmule_inst_decoder.sv 49 | - rtl/redmule_top.sv 50 | - rtl/redmule_memory_scheduler.sv 51 | - rtl/redmule_complex.sv 52 | 53 | - target: redmule_deprecated # legacy wrappers are stored under rtl/deprecated/ 54 | files: 55 | - rtl/deprecated/redmule_wrap.sv 56 | - rtl/deprecated/redmule_complex_wrap.sv 57 | 58 | - target: redmule_test 59 | files: 60 | - target/sim/src/tb_dummy_memory.sv 61 | - target/sim/src/redmule_tb.sv 62 | 63 | - target: vsim 64 | files: 65 | - target/sim/src/redmule_tb_wrap.sv 66 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | # Top-level Makefile 8 | 9 | # Paths to folders 10 | RootDir := $(dir $(abspath $(firstword $(MAKEFILE_LIST)))) 11 | TargetDir := $(RootDir)target 12 | SimDir := $(TargetDir)/sim 13 | ScriptsDir := $(RootDir)scripts 14 | VerilatorPath := target/sim/verilator 15 | VsimPath := target/sim/vsim 16 | SW ?= $(RootDir)sw 17 | BUILD_DIR ?= $(SW)/build 18 | SIM_DIR ?= $(RootDir)vsim 19 | Bender ?= $(CargoInstallDir)/bin/bender 20 | Gcc ?= $(GccInstallDir)/bin/ 21 | ISA ?= riscv 22 | ARCH ?= rv 23 | XLEN ?= 32 24 | XTEN ?= imc_zicsr 25 | PYTHON ?= python3 26 | 27 | target ?= verilator 28 | TargetPath := $(SimDir)/$(target) 29 | 30 | # Useful Parameters 31 | gui ?= 0 32 | ipstools ?= 0 33 | P_STALL ?= 0.0 34 | UseXif ?= 0 35 | 36 | # Included makefrags 37 | include bender_common.mk 38 | include bender_sim.mk 39 | include bender_synth.mk 40 | include $(TargetPath)/$(target).mk 41 | 42 | compile_script_synth ?= $(RootDir)scripts/synth_compile.tcl 
43 | 44 | INI_PATH = $(RootDir)modelsim.ini 45 | WORK_PATH = $(SIM_DIR)/work 46 | 47 | TEST_SRCS := $(SW)/redmule.c 48 | 49 | ifeq ($(UseXif),1) 50 | FLAGS += -DCOMPLEX_OFFLOADER 51 | endif 52 | 53 | ifeq ($(verbose),1) 54 | FLAGS += -DVERBOSE 55 | endif 56 | 57 | ifeq ($(debug),1) 58 | FLAGS += -DDEBUG 59 | endif 60 | 61 | # Include directories 62 | INC += -I$(SW) 63 | INC += -I$(SW)/inc 64 | INC += -I$(SW)/utils 65 | 66 | BOOTSCRIPT := $(SW)/kernel/crt0.S 67 | LINKSCRIPT := $(SW)/kernel/link.ld 68 | 69 | CC=$(Gcc)$(ISA)$(XLEN)-unknown-elf-gcc 70 | LD=$(CC) 71 | OBJDUMP=$(Gcc)$(ISA)$(XLEN)-unknown-elf-objdump 72 | CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=ilp32 -D__$(ISA)__ -O2 -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP 73 | LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=ilp32 -D__$(ISA)__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections 74 | 75 | # Setup build object dirs 76 | CRT=$(BUILD_DIR)/crt0.o 77 | OBJ=$(BUILD_DIR)/verif.o 78 | BIN=$(BUILD_DIR)/verif 79 | DUMP=$(BUILD_DIR)/verif.dump 80 | STIM_INSTR=$(BUILD_DIR)/stim_instr.txt 81 | STIM_DATA=$(BUILD_DIR)/stim_data.txt 82 | STACK_INIT=$(BUILD_DIR)/stack_init.txt 83 | 84 | # Build implicit rules 85 | $(STIM_INSTR) $(STIM_DATA) $(STACK_INIT): $(BIN) 86 | objcopy --srec-len 1 --output-target=srec $(BIN) $(BIN).s19 87 | $(PYTHON) scripts/parse_s19.py < $(BIN).s19 > $(BIN).txt 88 | $(PYTHON) scripts/s19tomem.py $(BIN).txt $(STIM_INSTR) $(STIM_DATA) 89 | $(PYTHON) scripts/stack_init.py $(STACK_INIT) 90 | 91 | $(BIN): $(CRT) $(OBJ) 92 | $(LD) $(LD_OPTS) -o $(BIN) $(CRT) $(OBJ) -T$(LINKSCRIPT) 93 | 94 | $(CRT): $(BOOTSCRIPT) | $(BUILD_DIR) # rebuild when crt0.S changes; build dir is order-only so its mtime never forces a rebuild 95 | $(CC) $(CC_OPTS) -c $(BOOTSCRIPT) -o $(CRT) 96 | 97 | $(OBJ): $(TEST_SRCS) | $(BUILD_DIR) # order-only: the output directory must exist before compiling, also under make -j 98 | $(CC) $(CC_OPTS) -c $(TEST_SRCS) $(FLAGS) $(INC) -o $(OBJ) 99 | 100 | $(BUILD_DIR): 101 | mkdir -p $(BUILD_DIR) 102 | 103 | SHELL := /bin/bash 104 | 105 | # Generate instructions and data stimuli 106 | sw-build: 
$(STIM_INSTR) $(STIM_DATA) $(STACK_INIT) dis 107 | 108 | $(SIM_DIR): 109 | mkdir -p $(SIM_DIR) 110 | 111 | synth-ips: 112 | $(Bender) update 113 | $(Bender) script synopsys \ 114 | $(common_targs) $(common_defs) \ 115 | $(synth_targs) $(synth_defs) \ 116 | > ${compile_script_synth} 117 | 118 | sw-clean: 119 | rm -rf $(BUILD_DIR) 120 | 121 | dis: 122 | $(OBJDUMP) -d $(BIN) > $(DUMP) 123 | 124 | OP ?= gemm 125 | fp_fmt ?= FP16 126 | M ?= 12 127 | N ?= 16 128 | K ?= 16 129 | 130 | golden: golden-clean 131 | $(MAKE) -C golden-model $(OP) SW=$(SW)/inc M=$(M) N=$(N) K=$(K) fp_fmt=$(fp_fmt) 132 | 133 | golden-clean: 134 | $(MAKE) -C golden-model golden-clean 135 | 136 | clean-all: sw-clean 137 | rm -rf $(RootDir).bender 138 | rm -rf $(compile_script) 139 | 140 | sw-all: sw-clean sw-build 141 | 142 | # Install tools 143 | CXX ?= g++ 144 | NumCores := $(shell nproc) 145 | NumCoresHalf := $(shell echo "$$(($(NumCores) / 2))") 146 | VendorDir ?= $(RootDir)vendor 147 | InstallDir ?= $(VendorDir)/install 148 | # Verilator 149 | VerilatorVersion ?= v5.028 150 | VerilatorInstallDir := $(InstallDir)/verilator 151 | # GCC 152 | GccInstallDir := $(InstallDir)/riscv 153 | RiscvTarDir := riscv.tar.gz 154 | GccUrl := https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2024.08.28/riscv32-elf-ubuntu-20.04-gcc-nightly-2024.08.28-nightly.tar.gz 155 | # Bender 156 | RustupInit := $(ScriptsDir)/rustup-init.sh 157 | CargoInstallDir := $(InstallDir)/cargo 158 | RustupInstallDir := $(InstallDir)/rustup 159 | Cargo := $(CargoInstallDir)/bin/cargo 160 | 161 | verilator: $(VerilatorInstallDir)/bin/verilator # target must match the --prefix given to ./configure, otherwise it is rebuilt on every call 162 | 163 | $(VerilatorInstallDir)/bin/verilator: 164 | rm -rf $(VendorDir)/verilator 165 | mkdir -p $(VendorDir) && cd $(VendorDir) && git clone https://github.com/verilator/verilator.git 166 | # Checkout the right version 167 | cd $(VendorDir)/verilator && git reset --hard && git fetch && git checkout $(VerilatorVersion) 168 | # Compile verilator 169 | sudo apt install libfl-dev help2man 170 
| mkdir -p $(VerilatorInstallDir) && cd $(VendorDir)/verilator && git clean -xfdf && autoconf && \ 171 | ./configure --prefix=$(VerilatorInstallDir) CXX=$(CXX) && make -j$(NumCoresHalf) && make install 172 | 173 | riscv32-gcc: $(GccInstallDir) 174 | 175 | $(GccInstallDir): 176 | rm -rf $(GccInstallDir) $(VendorDir)/$(RiscvTarDir) 177 | mkdir -p $(InstallDir) 178 | cd $(VendorDir) && \ 179 | wget $(GccUrl) -O $(RiscvTarDir) && \ 180 | tar -xzvf $(RiscvTarDir) -C $(InstallDir) riscv 181 | 182 | bender: $(CargoInstallDir)/bin/bender 183 | 184 | $(CargoInstallDir)/bin/bender: 185 | curl --proto '=https' --tlsv1.2 https://sh.rustup.rs -sSf > $(RustupInit) 186 | mkdir -p $(InstallDir) 187 | export CARGO_HOME=$(CargoInstallDir) && export RUSTUP_HOME=$(RustupInstallDir) && \ 188 | chmod +x $(RustupInit); source $(RustupInit) -y && \ 189 | $(Cargo) install bender 190 | rm -rf $(RustupInit) 191 | 192 | tools: bender riscv32-gcc 193 | -------------------------------------------------------------------------------- /bender_common.mk: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | common_targs += -t rtl 9 | common_defs += -D COREV_ASSERT_OFF 10 | -------------------------------------------------------------------------------- /bender_sim.mk: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | sim_targs += -t redmule_test 9 | sim_defs += -D COREV_ASSERT_OFF 10 | 11 | ifneq ($(target),verilator) 12 | ifeq ($(UseXif),1) 13 | sim_targs += -t cv32e40x_bhv 14 | sim_defs += -D CV32E40X_TRACE_EXECUTION 15 | else 16 | sim_targs += -t cv32e40p_include_tracer 17 | sim_defs += -D CV32E40P_TRACE_EXECUTION 18 | endif 19 | endif 20 | -------------------------------------------------------------------------------- /bender_synth.mk: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | synth_targs += 9 | 10 | ifeq ($(REDMULE_COMPLEX),1) 11 | synth_defs += -D REDMULE_COMPLEX_SYNTH 12 | else 13 | synth_defs += -D REDMULE_HWPE_SYNTH 14 | endif 15 | -------------------------------------------------------------------------------- /doc/RedmuleSubsystem-CoreComplex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pulp-platform/redmule/10eeb391c36158037e9b48207923b568e1ee78af/doc/RedmuleSubsystem-CoreComplex.png -------------------------------------------------------------------------------- /doc/redmule_complex_testbench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pulp-platform/redmule/10eeb391c36158037e9b48207923b568e1ee78af/doc/redmule_complex_testbench.png -------------------------------------------------------------------------------- /doc/redmule_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pulp-platform/redmule/10eeb391c36158037e9b48207923b568e1ee78af/doc/redmule_overview.png 
-------------------------------------------------------------------------------- /doc/redmule_testbench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pulp-platform/redmule/10eeb391c36158037e9b48207923b568e1ee78af/doc/redmule_testbench.png -------------------------------------------------------------------------------- /golden-model/FP16/scripts/addmax.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | import argparse 14 | import dump_utils as dump 15 | import os 16 | 17 | # COMPUTE: 18 | # Z[m_size, k_size] = max (( X[m_size, n_size] + W[n_size, k_size] ), Y[m_size, k_size]) 19 | 20 | #Visualize data with more precision 21 | torch.set_printoptions(precision=10, sci_mode=False) 22 | 23 | parser = argparse.ArgumentParser("AddMax Operation Test") 24 | parser.add_argument( '--m_size', type=int, default=3 ) 25 | parser.add_argument( '--n_size', type=int, default=3 ) 26 | parser.add_argument( '--k_size', type=int, default=3 ) 27 | parser.add_argument( '--file_name', type=str, default='net_parameters.h') 28 | parser.add_argument( '--inc_dir', type=str) 29 | parser.add_argument( '--txt_dir', type=str) 30 | args = parser.parse_args() 31 | 32 | # Network parameters 33 | m_size = args.m_size 34 | n_size = args.n_size 35 | k_size = args.k_size 36 | 37 | f = open(args.file_name, "w") 38 | 39 | # Test Matrices 40 | X = torch.rand (m_size, n_size).half() 41 | W = torch.rand (n_size, k_size).half() 42 | Y = torch.rand (m_size, k_size).half() 43 | Z = torch.zeros(m_size, k_size).half() 44 | 45 | print("\nInput Data: ") 46 
| print("\nX is: ", X, X.shape, X.dtype) 47 | f.write('fp16 X[IN_CH*MID_CH] = {'+dump.tensor_to_string(X)+'};\n') 48 | 49 | print("\nW is: ", W, W.shape, W.dtype) 50 | f.write('fp16 W[MID_CH*OUT_CH] = {'+dump.tensor_to_string(W)+'};\n') 51 | 52 | print("\nY is: ", Y, Y.shape, Y.dtype) 53 | f.write('fp16 Y[MID_CH*OUT_CH] = {'+dump.tensor_to_string(Y)+'};\n') 54 | 55 | print("\nComputing add-max..") 56 | for m in range(m_size): 57 | for k in range(k_size): 58 | Z[m][k] = Y[m][k] 59 | for n in range(n_size): 60 | Z[m][k] = torch.max(Z[m][k], torch.add(input = X[m][n], other = W[n][k])) 61 | 62 | print("\nZ is: ", Z, Z.shape, Z.dtype) 63 | f.write('fp16 Z[IN_CH*OUT_CH] = {'+dump.tensor_to_string(Z)+'};\n') 64 | 65 | print("\n\n") 66 | 67 | f.close() 68 | 69 | # Matrices conversion to hexadecimal and txt files generation 70 | txt_path = args.txt_dir 71 | for f in os.listdir(txt_path): 72 | os.remove(os.path.join(txt_path, f)) 73 | f_x = open(''+txt_path+'/x_input.txt', "w") 74 | for i in range(m_size): 75 | for j in range (n_size): 76 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 77 | x_hex = hex(int(x_bin, 2))[2:] 78 | f_x.write(x_hex) 79 | f_x.write(' ') 80 | f_x.write("\n") 81 | f_x.close() 82 | 83 | f_w = open(''+txt_path+'/w_input.txt', "w") 84 | for i in range(n_size): 85 | for j in range (k_size): 86 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 87 | w_hex = hex(int(w_bin, 2))[2:] 88 | f_w.write(w_hex) 89 | f_w.write(' ') 90 | f_w.write("\n") 91 | f_w.close() 92 | 93 | f_y = open(''+txt_path+'/y_input.txt', "w") 94 | for i in range(m_size): 95 | for j in range (k_size): 96 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 97 | y_hex = hex(int(y_bin, 2))[2:] 98 | f_y.write(y_hex) 99 | f_y.write(' ') 100 | f_y.write("\n") 101 | f_y.close() 102 | 103 | f_z = open(''+txt_path+'/z_output.txt', "w") 104 | for i in range(m_size): 105 | for j in range (k_size): 106 | z_bin = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 107 | z_hex = 
hex(int(z_bin, 2))[2:] 108 | f_z.write(z_hex) 109 | f_z.write(' ') 110 | f_z.write("\n") 111 | f_z.close() 112 | 113 | in_rows = str(m_size) 114 | in_cols = str(n_size) 115 | out_cols = str(k_size) 116 | x_dim = str(m_size*n_size) 117 | w_dim = str(n_size*k_size) 118 | y_dim = str(m_size*k_size) 119 | z_dim = str(m_size*k_size) 120 | out_int = str(int(m_size*k_size/2)) 121 | header = ' /* Header file generated by RedMulE Golden Model */\n' 122 | 123 | # ------------------------------------------------------------------------------------# 124 | # Header files generation # 125 | # ------------------------------------------------------------------------------------# 126 | 127 | # Path to the genereted files 128 | inc_path = args.inc_dir 129 | for f in os.listdir(inc_path): 130 | os.remove(os.path.join(inc_path, f)) 131 | 132 | f_x = open(''+inc_path+'/x_input.h', "w") 133 | f_x.write(''+header+'') 134 | f_x.write('uint16_t x_inp ['+x_dim+'] = {\n') 135 | for i in range(m_size): 136 | for j in range (n_size): 137 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 138 | x_hex = hex(int(x_bin, 2))[2:] 139 | if (i == m_size - 1 and j == n_size - 1): 140 | f_x.write('0x'+x_hex+' ') 141 | else: 142 | f_x.write('0x'+x_hex+', ') 143 | f_x.write("\n") 144 | f_x.write("};") 145 | f_x.close() 146 | 147 | f_x = open(''+inc_path+'/x_2D.h', "w") 148 | f_x.write(''+header+'') 149 | f_x.write('uint16_t x_inp_2D ['+in_rows+']['+in_cols+'] = {\n') 150 | for i in range(m_size): 151 | for j in range (n_size): 152 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 153 | x_hex = hex(int(x_bin, 2))[2:] 154 | if (i == m_size - 1 and j == n_size - 1): 155 | f_x.write('0x'+x_hex+' ') 156 | else: 157 | f_x.write('0x'+x_hex+', ') 158 | f_x.write("\n") 159 | f_x.write("};") 160 | f_x.close() 161 | 162 | f_w = open(''+inc_path+'/w_input.h', "w") 163 | f_w.write(''+header+'') 164 | f_w.write('uint16_t w_inp ['+w_dim+'] = {\n') 165 | for i in range(n_size): 166 | for j in range 
(k_size): 167 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 168 | w_hex = hex(int(w_bin, 2))[2:] 169 | if (i == n_size - 1 and j == k_size - 1): 170 | f_w.write('0x'+w_hex+' ') 171 | else: 172 | f_w.write('0x'+w_hex+', ') 173 | f_w.write("\n") 174 | f_w.write("};") 175 | f_w.close() 176 | 177 | f_w = open(''+inc_path+'/w_2D.h', "w") 178 | f_w.write(''+header+'') 179 | f_w.write('uint16_t w_inp_2D ['+in_cols+']['+out_cols+'] = {\n') 180 | for i in range(n_size): 181 | for j in range (k_size): 182 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 183 | w_hex = hex(int(w_bin, 2))[2:] 184 | if (i == n_size - 1 and j == k_size - 1): 185 | f_w.write('0x'+w_hex+' ') 186 | else: 187 | f_w.write('0x'+w_hex+', ') 188 | f_w.write("\n") 189 | f_w.write("};") 190 | f_w.close() 191 | 192 | f_y = open(''+inc_path+'/y_input.h', "w") 193 | f_y.write(''+header+'') 194 | f_y.write('uint16_t y_inp ['+y_dim+'] = {\n') 195 | for i in range(m_size): 196 | for j in range (k_size): 197 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 198 | y_hex = hex(int(y_bin, 2))[2:] 199 | if (i == m_size - 1 and j == k_size - 1): 200 | f_y.write('0x'+y_hex+' ') 201 | else: 202 | f_y.write('0x'+y_hex+', ') 203 | f_y.write("\n") 204 | f_y.write("};") 205 | f_y.close() 206 | 207 | f_y = open(''+inc_path+'/y_2D.h', "w") 208 | f_y.write(''+header+'') 209 | f_y.write('uint16_t y_inp_2D ['+in_cols+']['+out_cols+'] = {\n') 210 | for i in range(m_size): 211 | for j in range (k_size): 212 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 213 | y_hex = hex(int(y_bin, 2))[2:] 214 | if (i == m_size - 1 and j == k_size - 1): 215 | f_y.write('0x'+y_hex+' ') 216 | else: 217 | f_y.write('0x'+y_hex+', ') 218 | f_y.write("\n") 219 | f_y.write("};") 220 | f_y.close() 221 | 222 | f_z = open(''+inc_path+'/z_output.h', "w") 223 | f_z.write(''+header+'') 224 | f_z.write('uint16_t z_oup ['+z_dim+'] = {\n') 225 | for i in range(m_size): 226 | for j in range (k_size): 227 | z_bin = 
bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 228 | z_hex = hex(int(z_bin, 2))[2:] 229 | if (i == m_size - 1 and j == k_size - 1): 230 | f_z.write('0x'+z_hex+' ') 231 | else: 232 | f_z.write('0x'+z_hex+', ') 233 | f_z.write("\n") 234 | f_z.write("};") 235 | f_z.close() 236 | 237 | f_z = open(''+inc_path+'/z_2D.h', "w") 238 | f_z.write(''+header+'') 239 | f_z.write('uint16_t z_oup_2D ['+in_rows+']['+out_cols+'] = {\n') 240 | for i in range(m_size): 241 | for j in range (k_size): 242 | z_bin = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 243 | z_hex = hex(int(z_bin, 2))[2:] 244 | if (i == m_size - 1 and j == k_size - 1): 245 | f_z.write('0x'+z_hex+' ') 246 | else: 247 | f_z.write('0x'+z_hex+', ') 248 | f_z.write("\n") 249 | f_z.write("};") 250 | f_z.close() 251 | 252 | # Writing tensors' dimensions 253 | f_d = open(''+inc_path+'/tensor_dim.h', "w") 254 | f_d.write(''+header+'') 255 | f_d.write('#ifndef __TENSOR_DIM__\n' ) 256 | f_d.write('#define __TENSOR_DIM__\n\n' ) 257 | f_d.write('#define M_SIZE '+in_rows+' \n' ) 258 | f_d.write('#define N_SIZE '+in_cols+' \n' ) 259 | f_d.write('#define K_SIZE '+out_cols+'\n' ) 260 | f_d.write('#define SRC_FMT FP16\n' ) 261 | f_d.write('#define DST_FMT FP16\n' ) 262 | f_d.write('#define FPFORMAT 16\n' ) 263 | f_d.write('uint8_t gemm_ops = ADDMAX; \n' ) 264 | f_d.write('\n#endif\n' ) 265 | f_d.close() 266 | 267 | #------------------------------------------------------------------------------------------# 268 | # 32-bits parser # 269 | #------------------------------------------------------------------------------------------# 270 | 271 | f_c = open(''+inc_path+'/golden.h', "w") 272 | f_c.write(''+header+'') 273 | f_c.write('uint32_t golden ['+out_int+'] = {\n') 274 | for i in range(m_size): 275 | j = 0 276 | while j < k_size - 1: 277 | c_bin_0 = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 278 | c_bin_1 = bin(np.float16(Z[i][j+1]).view('H'))[2:].zfill(16) 279 | c_hex_0 = hex(int(c_bin_0, 2))[2:] 280 | c_hex_1 = 
hex(int(c_bin_1, 2))[2:] 281 | c_hex = c_hex_1+c_hex_0 282 | f_c.write('0x'+c_hex+',\n') 283 | j += 2 284 | f_c.write("};") 285 | f_c.close() 286 | -------------------------------------------------------------------------------- /golden-model/FP16/scripts/addmin.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | import argparse 14 | import dump_utils as dump 15 | import os 16 | 17 | # COMPUTE: 18 | # Z[m_size, k_size] = min (( X[m_size, n_size] + W[n_size, k_size] ), Y[m_size, k_size]) 19 | 20 | #Visualize data with more precision 21 | torch.set_printoptions(precision=10, sci_mode=False) 22 | 23 | parser = argparse.ArgumentParser("GEMM-Ops Operation Test") 24 | parser.add_argument( '--m_size', type=int, default=3 ) 25 | parser.add_argument( '--n_size', type=int, default=3 ) 26 | parser.add_argument( '--k_size', type=int, default=3 ) 27 | parser.add_argument( '--file_name', type=str, default='net_parameters.h') 28 | parser.add_argument( '--inc_dir', type=str) 29 | parser.add_argument( '--txt_dir', type=str) 30 | args = parser.parse_args() 31 | 32 | # Network parameters 33 | m_size = args.m_size 34 | n_size = args.n_size 35 | k_size = args.k_size 36 | 37 | f = open(args.file_name, "w") 38 | 39 | # Test Matrices 40 | X = torch.rand (m_size, n_size).half() 41 | W = torch.rand (n_size, k_size).half() 42 | Y = torch.rand (m_size, k_size).half() 43 | Z = torch.zeros(m_size, k_size).half() 44 | 45 | print("\nInput Data: ") 46 | print("\nX is: ", X, X.shape, X.dtype) 47 | f.write('fp16 X[IN_CH*MID_CH] = {'+dump.tensor_to_string(X)+'};\n') 48 | 49 | print("\nW is: ", W, W.shape, W.dtype) 
50 | f.write('fp16 W[MID_CH*OUT_CH] = {'+dump.tensor_to_string(W)+'};\n') 51 | 52 | print("\nY is: ", Y, Y.shape, Y.dtype) 53 | f.write('fp16 Y[MID_CH*OUT_CH] = {'+dump.tensor_to_string(Y)+'};\n') 54 | 55 | print("\nComputing add-min..") 56 | for m in range(m_size): 57 | for k in range(k_size): 58 | Z[m][k] = Y[m][k] 59 | for n in range(n_size): 60 | Z[m][k] = torch.min(Z[m][k], torch.add(input = X[m][n], other = W[n][k])) 61 | 62 | print("\nZ is: ", Z, Z.shape, Z.dtype) 63 | f.write('fp16 Z[IN_CH*OUT_CH] = {'+dump.tensor_to_string(Z)+'};\n') 64 | 65 | print("\n\n") 66 | 67 | f.close() 68 | 69 | # Matrices conversion to hexadecimal and txt files generation 70 | txt_path = args.txt_dir 71 | for f in os.listdir(txt_path): 72 | os.remove(os.path.join(txt_path, f)) 73 | f_x = open(''+txt_path+'/x_input.txt', "w") 74 | for i in range(m_size): 75 | for j in range (n_size): 76 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 77 | x_hex = hex(int(x_bin, 2))[2:] 78 | f_x.write(x_hex) 79 | f_x.write(' ') 80 | f_x.write("\n") 81 | f_x.close() 82 | 83 | f_w = open(''+txt_path+'/w_input.txt', "w") 84 | for i in range(n_size): 85 | for j in range (k_size): 86 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 87 | w_hex = hex(int(w_bin, 2))[2:] 88 | f_w.write(w_hex) 89 | f_w.write(' ') 90 | f_w.write("\n") 91 | f_w.close() 92 | 93 | f_y = open(''+txt_path+'/y_input.txt', "w") 94 | for i in range(m_size): 95 | for j in range (k_size): 96 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 97 | y_hex = hex(int(y_bin, 2))[2:] 98 | f_y.write(y_hex) 99 | f_y.write(' ') 100 | f_y.write("\n") 101 | f_y.close() 102 | 103 | f_z = open(''+txt_path+'/z_output.txt', "w") 104 | for i in range(m_size): 105 | for j in range (k_size): 106 | z_bin = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 107 | z_hex = hex(int(z_bin, 2))[2:] 108 | f_z.write(z_hex) 109 | f_z.write(' ') 110 | f_z.write("\n") 111 | f_z.close() 112 | 113 | in_rows = str(m_size) 114 | in_cols = 
str(n_size) 115 | out_cols = str(k_size) 116 | x_dim = str(m_size*n_size) 117 | w_dim = str(n_size*k_size) 118 | y_dim = str(m_size*k_size) 119 | z_dim = str(m_size*k_size) 120 | out_int = str(int(m_size*k_size/2)) 121 | header = ' /* Header file generated by RedMulE Golden Model */\n' 122 | 123 | # ------------------------------------------------------------------------------------# 124 | # Header files generation # 125 | # ------------------------------------------------------------------------------------# 126 | 127 | # Path to the genereted files 128 | inc_path = args.inc_dir 129 | for f in os.listdir(inc_path): 130 | os.remove(os.path.join(inc_path, f)) 131 | 132 | f_x = open(''+inc_path+'/x_input.h', "w") 133 | f_x.write(''+header+'') 134 | f_x.write('uint16_t x_inp ['+x_dim+'] = {\n') 135 | for i in range(m_size): 136 | for j in range (n_size): 137 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 138 | x_hex = hex(int(x_bin, 2))[2:] 139 | if (i == m_size - 1 and j == n_size - 1): 140 | f_x.write('0x'+x_hex+' ') 141 | else: 142 | f_x.write('0x'+x_hex+', ') 143 | f_x.write("\n") 144 | f_x.write("};") 145 | f_x.close() 146 | 147 | f_x = open(''+inc_path+'/x_2D.h', "w") 148 | f_x.write(''+header+'') 149 | f_x.write('uint16_t x_inp_2D ['+in_rows+']['+in_cols+'] = {\n') 150 | for i in range(m_size): 151 | for j in range (n_size): 152 | x_bin = bin(np.float16(X[i][j]).view('H'))[2:].zfill(16) 153 | x_hex = hex(int(x_bin, 2))[2:] 154 | if (i == m_size - 1 and j == n_size - 1): 155 | f_x.write('0x'+x_hex+' ') 156 | else: 157 | f_x.write('0x'+x_hex+', ') 158 | f_x.write("\n") 159 | f_x.write("};") 160 | f_x.close() 161 | 162 | f_w = open(''+inc_path+'/w_input.h', "w") 163 | f_w.write(''+header+'') 164 | f_w.write('uint16_t w_inp ['+w_dim+'] = {\n') 165 | for i in range(n_size): 166 | for j in range (k_size): 167 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 168 | w_hex = hex(int(w_bin, 2))[2:] 169 | if (i == n_size - 1 and j == k_size - 1): 170 
| f_w.write('0x'+w_hex+' ') 171 | else: 172 | f_w.write('0x'+w_hex+', ') 173 | f_w.write("\n") 174 | f_w.write("};") 175 | f_w.close() 176 | 177 | f_w = open(''+inc_path+'/w_2D.h', "w") 178 | f_w.write(''+header+'') 179 | f_w.write('uint16_t w_inp_2D ['+in_cols+']['+out_cols+'] = {\n') 180 | for i in range(n_size): 181 | for j in range (k_size): 182 | w_bin = bin(np.float16(W[i][j]).view('H'))[2:].zfill(16) 183 | w_hex = hex(int(w_bin, 2))[2:] 184 | if (i == n_size - 1 and j == k_size - 1): 185 | f_w.write('0x'+w_hex+' ') 186 | else: 187 | f_w.write('0x'+w_hex+', ') 188 | f_w.write("\n") 189 | f_w.write("};") 190 | f_w.close() 191 | 192 | f_y = open(''+inc_path+'/y_input.h', "w") 193 | f_y.write(''+header+'') 194 | f_y.write('uint16_t y_inp ['+y_dim+'] = {\n') 195 | for i in range(m_size): 196 | for j in range (k_size): 197 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 198 | y_hex = hex(int(y_bin, 2))[2:] 199 | if (i == m_size - 1 and j == k_size - 1): 200 | f_y.write('0x'+y_hex+' ') 201 | else: 202 | f_y.write('0x'+y_hex+', ') 203 | f_y.write("\n") 204 | f_y.write("};") 205 | f_y.close() 206 | 207 | f_y = open(''+inc_path+'/y_2D.h', "w") 208 | f_y.write(''+header+'') 209 | f_y.write('uint16_t y_inp_2D ['+in_cols+']['+out_cols+'] = {\n') 210 | for i in range(m_size): 211 | for j in range (k_size): 212 | y_bin = bin(np.float16(Y[i][j]).view('H'))[2:].zfill(16) 213 | y_hex = hex(int(y_bin, 2))[2:] 214 | if (i == m_size - 1 and j == k_size - 1): 215 | f_y.write('0x'+y_hex+' ') 216 | else: 217 | f_y.write('0x'+y_hex+', ') 218 | f_y.write("\n") 219 | f_y.write("};") 220 | f_y.close() 221 | 222 | f_z = open(''+inc_path+'/z_output.h', "w") 223 | f_z.write(''+header+'') 224 | f_z.write('uint16_t z_oup ['+z_dim+'] = {\n') 225 | for i in range(m_size): 226 | for j in range (k_size): 227 | z_bin = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 228 | z_hex = hex(int(z_bin, 2))[2:] 229 | if (i == m_size - 1 and j == k_size - 1): 230 | f_z.write('0x'+z_hex+' ') 
231 | else: 232 | f_z.write('0x'+z_hex+', ') 233 | f_z.write("\n") 234 | f_z.write("};") 235 | f_z.close() 236 | 237 | f_z = open(''+inc_path+'/z_2D.h', "w") 238 | f_z.write(''+header+'') 239 | f_z.write('uint16_t z_oup_2D ['+in_rows+']['+out_cols+'] = {\n') 240 | for i in range(m_size): 241 | for j in range (k_size): 242 | z_bin = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 243 | z_hex = hex(int(z_bin, 2))[2:] 244 | if (i == m_size - 1 and j == k_size - 1): 245 | f_z.write('0x'+z_hex+' ') 246 | else: 247 | f_z.write('0x'+z_hex+', ') 248 | f_z.write("\n") 249 | f_z.write("};") 250 | f_z.close() 251 | 252 | # Writing tensors' dimensions 253 | f_d = open(''+inc_path+'/tensor_dim.h', "w") 254 | f_d.write(''+header+'') 255 | f_d.write('#ifndef __TENSOR_DIM__\n' ) 256 | f_d.write('#define __TENSOR_DIM__\n\n' ) 257 | f_d.write('#define M_SIZE '+in_rows+' \n' ) 258 | f_d.write('#define N_SIZE '+in_cols+' \n' ) 259 | f_d.write('#define K_SIZE '+out_cols+'\n' ) 260 | f_d.write('#define SRC_FMT FP16\n' ) 261 | f_d.write('#define DST_FMT FP16\n' ) 262 | f_d.write('#define FPFORMAT 16\n' ) 263 | f_d.write('uint8_t gemm_ops = ADDMIN; \n' ) 264 | f_d.write('\n#endif\n' ) 265 | f_d.close() 266 | 267 | #------------------------------------------------------------------------------------------# 268 | # 32-bits parser # 269 | #------------------------------------------------------------------------------------------# 270 | 271 | f_c = open(''+inc_path+'/golden.h', "w") 272 | f_c.write(''+header+'') 273 | f_c.write('uint32_t golden ['+out_int+'] = {\n') 274 | for i in range(m_size): 275 | j = 0 276 | while j < k_size - 1: 277 | c_bin_0 = bin(np.float16(Z[i][j]).view('H'))[2:].zfill(16) 278 | c_bin_1 = bin(np.float16(Z[i][j+1]).view('H'))[2:].zfill(16) 279 | c_hex_0 = hex(int(c_bin_0, 2))[2:] 280 | c_hex_1 = hex(int(c_bin_1, 2))[2:] 281 | c_hex = c_hex_1+c_hex_0 282 | f_c.write('0x'+c_hex+',\n') 283 | j += 2 284 | f_c.write("};") 285 | f_c.close() 286 | 
# --------------------------------------------------------------------------------
# /golden-model/FP16/scripts/dump_utils.py:
# --------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#

import torch


def tensor_to_string(tensor):
    """Render a tensor as the body of a C float initializer list.

    Elements are emitted in row-major order, each followed by an ``f``
    suffix and separated by ``', '`` (e.g. ``'1.0f, 2.5f'``).

    Args:
        tensor: a torch tensor of any rank. 1-D and 2-D tensors produce
            exactly the same string as before; 0-D and >2-D tensors, which
            used to yield an empty string, are now flattened and serialized
            as well.

    Returns:
        The comma-separated literal list, or ``''`` for an empty tensor.
    """
    if len(tensor.size()) == 2:
        # Kept for backward compatibility: callers' logs rely on this trace.
        print('Sizes: ', tensor.size()[0], tensor.size()[1])

    # reshape(-1) walks the tensor in row-major order; tolist() yields the
    # same Python scalars that .item() returned element by element.
    values = tensor.reshape(-1).tolist()
    return ', '.join(str(value) + 'f' for value in values)


def main():
    """Small smoke test: print a random 1-D and 2-D tensor and their strings."""
    import argparse
    parser = argparse.ArgumentParser("FCN Layer Test")
    parser.add_argument( '--in_size', type=int, default=2,
                         help="Length of the 1-D tensor / number of columns of the 2-D tensor." )
    parser.add_argument( '--out_size', type=int, default=2,
                         help="Number of rows of the 2-D tensor." )
    args = parser.parse_args()

    dim0_sz = args.in_size
    dim1_sz = args.out_size
    t = torch.rand(dim0_sz)
    print(t)
    print(tensor_to_string(t))

    t = torch.rand(dim1_sz, dim0_sz)
    print(t)
    print(tensor_to_string(t))


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /golden-model/FP16/scripts/maxmin.py:
# --------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import argparse
import dump_utils as dump
import os

# COMPUTE:
# Z[m_size, k_size] = min(max ( X[m_size, n_size], W[n_size, k_size] ), Y[m_size, k_size])

#Visualize data with more precision
torch.set_printoptions(precision=10, sci_mode=False)


def fp16_hex(value, width=0):
  """Return the IEEE-754 half-precision bit pattern of `value` as lowercase hex.

  `width` left-pads the result with zeros; the default (0) keeps the
  shortest representation, which is what the txt/uint16_t outputs use.
  """
  bits = int(np.float16(value).view('H'))
  return format(bits, 'x').zfill(width)


def clear_dir(path):
  """Remove every entry of `path` so stale generated files cannot survive."""
  for name in os.listdir(path):
    os.remove(os.path.join(path, name))


def write_txt(path, matrix):
  """Dump `matrix` as space-separated hex words, one matrix row per line."""
  rows, cols = matrix.shape
  with open(path, "w") as out:
    for i in range(rows):
      for j in range(cols):
        out.write(fp16_hex(matrix[i][j]))
        out.write(' ')
      out.write("\n")


def write_c_array(path, declaration, matrix, banner):
  """Write `matrix` as a C uint16_t initializer using `declaration` as its head."""
  rows, cols = matrix.shape
  with open(path, "w") as out:
    out.write(banner)
    out.write(declaration+' = {\n')
    for i in range(rows):
      for j in range(cols):
        is_last = (i == rows - 1 and j == cols - 1)
        # The very last element is followed by a blank instead of a comma.
        out.write('0x'+fp16_hex(matrix[i][j])+(' ' if is_last else ', '))
      out.write("\n")
    out.write("};")


# The title used to read "AddMax Operation Test" (copy-paste from addmax.py).
parser = argparse.ArgumentParser("GEMM-Ops Operation Test")
parser.add_argument( '--m_size', type=int, default=3 )
parser.add_argument( '--n_size', type=int, default=3 )
parser.add_argument( '--k_size', type=int, default=3 )
parser.add_argument( '--file_name', type=str, default='net_parameters.h')
parser.add_argument( '--inc_dir', type=str)
parser.add_argument( '--txt_dir', type=str)
args = parser.parse_args()

# Network parameters
m_size = args.m_size
n_size = args.n_size
k_size = args.k_size

# Test Matrices
X = torch.rand (m_size, n_size).half()
W = torch.rand (n_size, k_size).half()
Y = torch.rand (m_size, k_size).half()
Z = torch.zeros(m_size, k_size).half()

# `with` guarantees the parameter file is flushed/closed even on failure.
with open(args.file_name, "w") as f:
  print("\nInput Data: ")
  print("\nX is: ", X, X.shape, X.dtype)
  f.write('fp16 X[IN_CH*MID_CH] = {'+dump.tensor_to_string(X)+'};\n')

  print("\nW is: ", W, W.shape, W.dtype)
  f.write('fp16 W[MID_CH*OUT_CH] = {'+dump.tensor_to_string(W)+'};\n')

  print("\nY is: ", Y, Y.shape, Y.dtype)
  f.write('fp16 Y[MID_CH*OUT_CH] = {'+dump.tensor_to_string(Y)+'};\n')

  print("\nComputing max-min..")
  for m in range(m_size):
    for k in range(k_size):
      # Y seeds the reduction, then every (X, W) pair is folded in.
      Z[m][k] = Y[m][k]
      for n in range(n_size):
        Z[m][k] = torch.min(Z[m][k], torch.max(X[m][n], W[n][k]))

  print("\nZ is: ", Z, Z.shape, Z.dtype)
  f.write('fp16 Z[IN_CH*OUT_CH] = {'+dump.tensor_to_string(Z)+'};\n')

  print("\n\n")

# Matrices conversion to hexadecimal and txt files generation
txt_path = args.txt_dir
clear_dir(txt_path)
write_txt(''+txt_path+'/x_input.txt', X)
write_txt(''+txt_path+'/w_input.txt', W)
write_txt(''+txt_path+'/y_input.txt', Y)
write_txt(''+txt_path+'/z_output.txt', Z)

in_rows  = str(m_size)
in_cols  = str(n_size)
out_cols = str(k_size)
x_dim    = str(m_size*n_size)
w_dim    = str(n_size*k_size)
y_dim    = str(m_size*k_size)
z_dim    = str(m_size*k_size)
out_int  = str(int(m_size*k_size/2))
header   = ' /* Header file generated by RedMulE Golden Model */\n'

# ------------------------------------------------------------------------------------#
#                               Header files generation                               #
# ------------------------------------------------------------------------------------#

# Path to the generated files
inc_path = args.inc_dir
clear_dir(inc_path)

write_c_array(''+inc_path+'/x_input.h', 'uint16_t x_inp ['+x_dim+']', X, header)
write_c_array(''+inc_path+'/x_2D.h', 'uint16_t x_inp_2D ['+in_rows+']['+in_cols+']', X, header)
write_c_array(''+inc_path+'/w_input.h', 'uint16_t w_inp ['+w_dim+']', W, header)
write_c_array(''+inc_path+'/w_2D.h', 'uint16_t w_inp_2D ['+in_cols+']['+out_cols+']', W, header)
write_c_array(''+inc_path+'/y_input.h', 'uint16_t y_inp ['+y_dim+']', Y, header)
# Y is m_size x k_size: the first dimension was wrongly emitted as n_size,
# which breaks the C initializer whenever m_size != n_size.
write_c_array(''+inc_path+'/y_2D.h', 'uint16_t y_inp_2D ['+in_rows+']['+out_cols+']', Y, header)
write_c_array(''+inc_path+'/z_output.h', 'uint16_t z_oup ['+z_dim+']', Z, header)
write_c_array(''+inc_path+'/z_2D.h', 'uint16_t z_oup_2D ['+in_rows+']['+out_cols+']', Z, header)

# Writing tensors' dimensions
with open(''+inc_path+'/tensor_dim.h', "w") as f_d:
  f_d.write(''+header+'')
  f_d.write('#ifndef __TENSOR_DIM__\n' )
  f_d.write('#define __TENSOR_DIM__\n\n' )
  f_d.write('#define M_SIZE '+in_rows+' \n' )
  f_d.write('#define N_SIZE '+in_cols+' \n' )
  f_d.write('#define K_SIZE '+out_cols+'\n' )
  f_d.write('#define SRC_FMT FP16\n' )
  f_d.write('#define DST_FMT FP16\n' )
  f_d.write('#define FPFORMAT 16\n' )
  f_d.write('uint8_t gemm_ops = MAXMIN; \n' )
  f_d.write('\n#endif\n' )

#------------------------------------------------------------------------------------------#
#                                      32-bits parser                                      #
#------------------------------------------------------------------------------------------#

# Two consecutive fp16 results of a row are packed into one 32-bit word
# (element j in the low half, element j+1 in the high half).
# NOTE(review): with an odd k_size the last column of every row is not
# emitted and the declared size (m*k/2) no longer matches the number of
# words written -- confirm the intended packing before changing it.
with open(''+inc_path+'/golden.h', "w") as f_c:
  f_c.write(''+header+'')
  f_c.write('uint32_t golden ['+out_int+'] = {\n')
  for i in range(m_size):
    for j in range(0, k_size - 1, 2):
      # The low half-word must be padded to 4 hex digits; otherwise a value
      # below 0x1000 shifts the high half-word and corrupts the constant.
      low_half  = fp16_hex(Z[i][j], 4)
      high_half = fp16_hex(Z[i][j+1])
      f_c.write('0x'+high_half+low_half+',\n')
  f_c.write("};")

# --------------------------------------------------------------------------------
# /golden-model/FP16/scripts/mulmin.py:
# --------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import argparse
import dump_utils as dump
import os

# COMPUTE:
# Z[m_size, k_size] = min(( X[m_size, n_size] x W[n_size, k_size] ), Y[m_size, k_size])

#Visualize data with more precision
torch.set_printoptions(precision=10, sci_mode=False)


def fp16_hex(value, width=0):
  """Return the IEEE-754 half-precision bit pattern of `value` as lowercase hex.

  `width` left-pads the result with zeros; the default (0) keeps the
  shortest representation, which is what the txt/uint16_t outputs use.
  """
  bits = int(np.float16(value).view('H'))
  return format(bits, 'x').zfill(width)


def clear_dir(path):
  """Remove every entry of `path` so stale generated files cannot survive."""
  for name in os.listdir(path):
    os.remove(os.path.join(path, name))


def write_txt(path, matrix):
  """Dump `matrix` as space-separated hex words, one matrix row per line."""
  rows, cols = matrix.shape
  with open(path, "w") as out:
    for i in range(rows):
      for j in range(cols):
        out.write(fp16_hex(matrix[i][j]))
        out.write(' ')
      out.write("\n")


def write_c_array(path, declaration, matrix, banner):
  """Write `matrix` as a C uint16_t initializer using `declaration` as its head."""
  rows, cols = matrix.shape
  with open(path, "w") as out:
    out.write(banner)
    out.write(declaration+' = {\n')
    for i in range(rows):
      for j in range(cols):
        is_last = (i == rows - 1 and j == cols - 1)
        # The very last element is followed by a blank instead of a comma.
        out.write('0x'+fp16_hex(matrix[i][j])+(' ' if is_last else ', '))
      out.write("\n")
    out.write("};")


parser = argparse.ArgumentParser("GEMM-Ops Operation Test")
parser.add_argument( '--m_size', type=int, default=3 )
parser.add_argument( '--n_size', type=int, default=3 )
parser.add_argument( '--k_size', type=int, default=3 )
parser.add_argument( '--file_name', type=str, default='net_parameters.h')
parser.add_argument( '--inc_dir', type=str)
parser.add_argument( '--txt_dir', type=str)
args = parser.parse_args()

# Network parameters
m_size = args.m_size
n_size = args.n_size
k_size = args.k_size

# Test Matrices
X = torch.rand (m_size, n_size).half()
W = torch.rand (n_size, k_size).half()
Y = torch.rand (m_size, k_size).half()
Z = torch.zeros(m_size, k_size).half()

# `with` guarantees the parameter file is flushed/closed even on failure.
with open(args.file_name, "w") as f:
  print("\nInput Data: ")
  print("\nX is: ", X, X.shape, X.dtype)
  f.write('fp16 X[IN_CH*MID_CH] = {'+dump.tensor_to_string(X)+'};\n')

  print("\nW is: ", W, W.shape, W.dtype)
  f.write('fp16 W[MID_CH*OUT_CH] = {'+dump.tensor_to_string(W)+'};\n')

  print("\nY is: ", Y, Y.shape, Y.dtype)
  f.write('fp16 Y[MID_CH*OUT_CH] = {'+dump.tensor_to_string(Y)+'};\n')

  print("\nComputing mul-min..")
  for m in range(m_size):
    for k in range(k_size):
      # Y seeds the reduction, then every X*W product is folded in.
      Z[m][k] = Y[m][k]
      for n in range(n_size):
        Z[m][k] = torch.min(Z[m][k], torch.mul(X[m][n], W[n][k]))

  print("\nZ is: ", Z, Z.shape, Z.dtype)
  f.write('fp16 Z[IN_CH*OUT_CH] = {'+dump.tensor_to_string(Z)+'};\n')

  print("\n\n")

# Matrices conversion to hexadecimal and txt files generation
txt_path = args.txt_dir
clear_dir(txt_path)
write_txt(''+txt_path+'/x_input.txt', X)
write_txt(''+txt_path+'/w_input.txt', W)
write_txt(''+txt_path+'/y_input.txt', Y)
write_txt(''+txt_path+'/z_output.txt', Z)

in_rows  = str(m_size)
in_cols  = str(n_size)
out_cols = str(k_size)
x_dim    = str(m_size*n_size)
w_dim    = str(n_size*k_size)
y_dim    = str(m_size*k_size)
z_dim    = str(m_size*k_size)
out_int  = str(int(m_size*k_size/2))
header   = ' /* Header file generated by RedMulE Golden Model */\n'

# ------------------------------------------------------------------------------------#
#                               Header files generation                               #
# ------------------------------------------------------------------------------------#

# Path to the generated files
inc_path = args.inc_dir
clear_dir(inc_path)

write_c_array(''+inc_path+'/x_input.h', 'uint16_t x_inp ['+x_dim+']', X, header)
write_c_array(''+inc_path+'/x_2D.h', 'uint16_t x_inp_2D ['+in_rows+']['+in_cols+']', X, header)
write_c_array(''+inc_path+'/w_input.h', 'uint16_t w_inp ['+w_dim+']', W, header)
write_c_array(''+inc_path+'/w_2D.h', 'uint16_t w_inp_2D ['+in_cols+']['+out_cols+']', W, header)
write_c_array(''+inc_path+'/y_input.h', 'uint16_t y_inp ['+y_dim+']', Y, header)
# Y is m_size x k_size: the first dimension was wrongly emitted as n_size,
# which breaks the C initializer whenever m_size != n_size.
write_c_array(''+inc_path+'/y_2D.h', 'uint16_t y_inp_2D ['+in_rows+']['+out_cols+']', Y, header)
write_c_array(''+inc_path+'/z_output.h', 'uint16_t z_oup ['+z_dim+']', Z, header)
write_c_array(''+inc_path+'/z_2D.h', 'uint16_t z_oup_2D ['+in_rows+']['+out_cols+']', Z, header)

# Writing tensors' dimensions
with open(''+inc_path+'/tensor_dim.h', "w") as f_d:
  f_d.write(''+header+'')
  f_d.write('#ifndef __TENSOR_DIM__\n' )
  f_d.write('#define __TENSOR_DIM__\n\n' )
  f_d.write('#define M_SIZE '+in_rows+' \n' )
  f_d.write('#define N_SIZE '+in_cols+' \n' )
  f_d.write('#define K_SIZE '+out_cols+'\n' )
  f_d.write('#define SRC_FMT FP16\n' )
  f_d.write('#define DST_FMT FP16\n' )
  f_d.write('#define FPFORMAT 16\n' )
  f_d.write('uint8_t gemm_ops = MULMIN; \n' )
  f_d.write('\n#endif\n' )

#------------------------------------------------------------------------------------------#
#                                      32-bits parser                                      #
#------------------------------------------------------------------------------------------#

# Two consecutive fp16 results of a row are packed into one 32-bit word
# (element j in the low half, element j+1 in the high half).
# NOTE(review): with an odd k_size the last column of every row is not
# emitted and the declared size (m*k/2) no longer matches the number of
# words written -- confirm the intended packing before changing it.
with open(''+inc_path+'/golden.h', "w") as f_c:
  f_c.write(''+header+'')
  f_c.write('uint32_t golden ['+out_int+'] = {\n')
  for i in range(m_size):
    for j in range(0, k_size - 1, 2):
      # The low half-word must be padded to 4 hex digits; otherwise a value
      # below 0x1000 (likely here, since products of values in [0, 1) get
      # small) shifts the high half-word and corrupts the constant.
      low_half  = fp16_hex(Z[i][j], 4)
      high_half = fp16_hex(Z[i][j+1])
      f_c.write('0x'+high_half+low_half+',\n')
  f_c.write("};")

# --------------------------------------------------------------------------------
# /golden-model/FP8/scripts/dump_utils.py:
# --------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | import torch 9 | 10 | def tensor_to_string(tensor): 11 | tensor_string = '' 12 | ndim = len(tensor.size()) 13 | if ndim == 1: 14 | sz0 = tensor.size()[0] 15 | for i in range(sz0): 16 | tensor_string += str(tensor[i].item()) 17 | tensor_string += 'f, ' if i < sz0-1 else 'f' 18 | 19 | elif ndim == 2: 20 | sz0 = tensor.size()[0] 21 | sz1 = tensor.size()[1] 22 | print('Sizes: ',sz0,sz1) 23 | for i in range(sz0): 24 | for j in range(sz1): 25 | tensor_string += str(tensor[i][j].item()) 26 | tensor_string += 'f, ' if (i*sz1+j) < (sz0*sz1-1) else 'f' 27 | 28 | else: 29 | 30 | pass # FIXME to be implemented 31 | 32 | 33 | return tensor_string 34 | 35 | 36 | 37 | def main(): 38 | import argparse 39 | parser = argparse.ArgumentParser("FCN Layer Test") 40 | parser.add_argument( '--in_size', type=int, default=2, 41 | help="An integer will be increased by 1 and printed." ) 42 | parser.add_argument( '--out_size', type=int, default=2, 43 | help="An integer will be increased by 1 and printed." ) 44 | args = parser.parse_args() 45 | 46 | dim0_sz = args.in_size 47 | dim1_sz = args.out_size 48 | t = torch.rand(dim0_sz) 49 | print(t) 50 | print(tensor_to_string(t)) 51 | 52 | t = torch.rand(dim1_sz, dim0_sz) 53 | print(t) 54 | print(tensor_to_string(t)) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /golden-model/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#
# RedMulE Golden Model Makefile

SHELL   := /bin/bash
CUR_DIR := $(shell pwd)
PENV    := $(CUR_DIR)/venv
# Directory receiving the generated C headers. Must be provided by the caller.
SW      ?=

## Perform Matrix-Matrix Operations of the kind _Z = Y (op2) (X (op1) W)_
## We assume X is _MxN_, W is _NxK_, while Y and Z are _MxK_

# Tensors Dimensions
M ?= 3
N ?= 3
K ?= 3

# FP format
fp_fmt ?= FP16

# Supported GEMM-Ops. Each name is both a make target and the name of the
# generator script $(fp_fmt)/scripts/<name>.py.
OPS := addmax addmin gemm maxmin minmax mulmax mulmin

# The recipes create directories named after the operations, so these targets
# must never be treated as files.
.PHONY: check_sw check_penv golden-clean $(OPS)

check_sw:
	@if [ -z "$(SW)" ]; then \
	  echo "SW is not set: pass SW=<directory for the generated headers>" >&2; \
	  exit 1; \
	fi
	mkdir -p "$(SW)"

# Fail early with a clear message when the Python virtual environment is missing,
# instead of letting the generator run with whatever python3 is on the PATH.
check_penv: check_sw
	@if [ ! -f "$(PENV)/bin/activate" ]; then \
	  echo "Python venv not found in $(PENV)." >&2; \
	  echo "Create it with: cd $(CUR_DIR) && source setup-py.sh" >&2; \
	  exit 1; \
	fi

# The current Makefile can be used as follows:
# make $(operation) -> execute any of the supported GEMM-Ops operation (Z = (X op1 W) op2 Y)
# Set the tensor dimensions while launching the Makefile. For example, in order to run a GEMM
# between three 64x64 matrices, run:
# `make golden-clean gemm M=64 N=64 K=64`

# One rule for all operations. The steps are chained with `&&` so that a failure of
# the generator script makes the target fail (with `;` the exit status of the last
# command hid it). The venv is only active inside this recipe's shell, so no
# explicit `deactivate` is needed.
$(OPS): check_penv
	mkdir -p $@/txt && \
	source $(PENV)/bin/activate && \
	cd $(CUR_DIR)/$(fp_fmt) && \
	python3 ./scripts/$@.py \
	  --m_size $(M) --n_size $(N) --k_size $(K) \
	  --inc_dir $(SW) \
	  --txt_dir $(CUR_DIR)/$@/txt

golden-clean:
	rm -rf FP16/net_parameters.h
	rm -rf FP8/net_parameters.h
	rm -rf FP16/scripts/__pycache__
	rm -rf FP8/scripts/__pycache__
	rm -rf $(OPS)

--------------------------------------------------------------------------------
/golden-model/setup-py.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2023 ETH Zurich and University of Bologna.
2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#

# This script is meant to be sourced (it exports PYTHON and PENV to the caller),
# so it must not call `exit` or enable `set -e`.
export PYTHON=python3
export PENV="$(pwd)/venv"

# Install packages only if the venv was created AND activated; otherwise pip3
# would install into the caller's global environment.
if "$PYTHON" -m venv "$PENV" && source "$PENV/bin/activate"; then
  pip3 install --upgrade pip
  pip3 install numpy
  pip3 install torch
  deactivate
else
  echo "setup-py.sh: could not create or activate the virtual environment in $PENV" >&2
  # Leave a non-zero status for the sourcing shell without terminating it.
  false
fi

--------------------------------------------------------------------------------
/rtl/deprecated/redmule_complex_wrap.sv:
--------------------------------------------------------------------------------
// Copyright 2023 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51
//
// Yvan Tortorella
//

// Wrapper that places one register stage on every input and output of
// redmule_complex. The TCDM side is exposed as request/response structs and is
// converted to/from the hci_core_intf instance connected to the complex.
//
// NOTE(review): NumIrqs is a localparam fixed to 0, so `[NumIrqs-1:0]` and
// `[$clog2(NumIrqs)-1:0]` do not describe meaningful vectors. Left untouched
// because the expected widths depend on redmule_complex -- confirm before reuse.
module redmule_complex_wrap
  import redmule_pkg::*;
#(
  localparam int unsigned AddrWidth = 32,
  localparam int unsigned NumIrqs   = 0
)(
  input  logic                       clk_i             ,
  input  logic                       rst_ni            ,
  input  logic                       test_mode_i       ,
  input  logic                       fetch_enable_i    ,
  input  logic [      AddrWidth-1:0] boot_addr_i       ,
  input  logic [        NumIrqs-1:0] irq_i             ,
  output logic [$clog2(NumIrqs)-1:0] irq_id_o          ,
  output logic                       irq_ack_o         ,
  output logic                       core_sleep_o      ,
  input  core_default_inst_rsp_t     core_inst_rsp_i   ,
  output core_default_inst_req_t     core_inst_req_o   ,
  input  core_default_data_rsp_t     core_data_rsp_i   ,
  output core_default_data_req_t     core_data_req_o   ,
  input  redmule_default_data_rsp_t  redmule_data_rsp_i,
  output redmule_default_data_req_t  redmule_data_req_o
);
localparam int unsigned DW = redmule_pkg::DATA_W;
localparam int unsigned NC = 1;

// Registered copies of the inputs and un-registered outputs of the complex
logic                       test_mode   ;
logic                       fetch_enable;
logic [      AddrWidth-1:0] boot_addr   ;
logic [        NumIrqs-1:0] irq         ;
logic [$clog2(NumIrqs)-1:0] irq_id      ;
logic                       irq_ack     ;
logic                       core_sleep  ;

core_default_inst_rsp_t core_inst_rsp;
core_default_inst_req_t core_inst_req;
core_default_data_rsp_t core_data_rsp;
core_default_data_req_t core_data_req;

hci_core_intf #(.DW(DW)) tcdm (.clk(clk_i));

// Single register stage on all inputs and outputs, cleared by the
// asynchronous active-low reset.
always_ff @(posedge clk_i, negedge rst_ni) begin
  if (~rst_ni) begin
    // Inputs
    test_mode          <= '0;
    fetch_enable       <= '0;
    boot_addr          <= '0;
    irq                <= '0;
    core_inst_rsp      <= '0;
    core_data_rsp      <= '0;
    tcdm.gnt           <= '0;
    tcdm.r_valid       <= '0;
    tcdm.r_data        <= '0;
    tcdm.r_opc         <= '0;
    tcdm.r_user        <= '0;
    // Outputs
    irq_id_o           <= '0;
    irq_ack_o          <= '0;
    core_sleep_o       <= '0;
    core_inst_req_o    <= '0;
    core_data_req_o    <= '0;
    redmule_data_req_o <= '0;
  end else begin
    // Inputs
    test_mode                <= test_mode_i               ;
    fetch_enable             <= fetch_enable_i            ;
    boot_addr                <= boot_addr_i               ;
    irq                      <= irq_i                     ;
    core_inst_rsp            <= core_inst_rsp_i           ;
    core_data_rsp            <= core_data_rsp_i           ;
    tcdm.gnt                 <= redmule_data_rsp_i.gnt    ;
    tcdm.r_valid             <= redmule_data_rsp_i.r_valid;
    tcdm.r_data              <= redmule_data_rsp_i.r_data ;
    tcdm.r_opc               <= redmule_data_rsp_i.r_opc  ;
    tcdm.r_user              <= redmule_data_rsp_i.r_user ;
    // Outputs
    irq_id_o                 <= irq_id       ;
    irq_ack_o                <= irq_ack      ;
    core_sleep_o             <= core_sleep   ;
    core_inst_req_o          <= core_inst_req;
    core_data_req_o          <= core_data_req;
    redmule_data_req_o.req   <= tcdm.req     ;
    redmule_data_req_o.wen   <= tcdm.wen     ;
    redmule_data_req_o.be    <= tcdm.be      ;
    redmule_data_req_o.boffs <= tcdm.boffs   ;
    redmule_data_req_o.add   <= tcdm.add     ;
    redmule_data_req_o.data  <= tcdm.data    ;
    redmule_data_req_o.lrdy  <= tcdm.lrdy    ;
    redmule_data_req_o.user  <= tcdm.user    ;
  end
end

redmule_complex #(
  .CoreType        ( redmule_pkg::CV32X      ), // CV32E40P, CV32E40X, IBEX, SNITCH, CVA6
  .ID_WIDTH        ( redmule_pkg::ID         ),
  .N_CORES         ( NC                      ),
  .DW              ( DW                      ), // TCDM port dimension (in bits)
  .MP              ( DW/32                   ),
  .NumIrqs         ( NumIrqs                 ),
  .AddrWidth       ( AddrWidth               ),
  .core_data_req_t ( core_default_data_req_t ),
  .core_data_rsp_t ( core_default_data_rsp_t ),
  .core_inst_req_t ( core_default_inst_req_t ),
  .core_inst_rsp_t ( core_default_inst_rsp_t )
) i_redmule_complex (
  .clk_i           ( clk_i         ),
  .rst_ni          ( rst_ni        ),
  .test_mode_i     ( test_mode     ),
  .fetch_enable_i  ( fetch_enable  ),
  .boot_addr_i     ( boot_addr     ),
  .irq_i           ( irq           ),
  .irq_id_o        ( irq_id        ),
  .irq_ack_o       ( irq_ack       ),
  .core_sleep_o    ( core_sleep    ),
  .core_inst_rsp_i ( core_inst_rsp ),
  .core_inst_req_o ( core_inst_req ),
  .core_data_rsp_i ( core_data_rsp ),
  .core_data_req_o ( core_data_req ),
  .tcdm            ( tcdm          )
);

endmodule : redmule_complex_wrap

--------------------------------------------------------------------------------
/rtl/deprecated/redmule_wrap.sv:
--------------------------------------------------------------------------------
1 | // Copyright 2023 ETH Zurich and University of Bologna.
2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51
//
// Yvan Tortorella
//

`include "hci_helpers.svh"

// Flat-port wrapper around redmule_top: the HCI TCDM interface is split into MP
// 32-bit master ports and the HWPE peripheral interface into plain signals.
// With REDMULE_HWPE_SYNTH defined every port is registered once; otherwise the
// ports are connected combinationally.
//
// NOTE(review): EW defaults to 0, so `[EW-1:0]` is not a meaningful vector in the
// default configuration -- confirm how integrators are expected to set it.
module redmule_wrap
  import fpnew_pkg::*;
  import hci_package::*;
  import redmule_pkg::*;
  import hwpe_ctrl_package::*;
  import hwpe_stream_package::*;
#(
  parameter  int unsigned  ID_WIDTH    = 8                    ,
  parameter  int unsigned  N_CORES     = 8                    ,
  parameter  int unsigned  DW          = DATA_W               , // TCDM port dimension (in bits)
  parameter  int unsigned  MP          = DW/redmule_pkg::MemDw,
  parameter  int unsigned  EW          = 0                    , // ECC signals width
  localparam fp_format_e   FpFormat    = FPFORMAT             , // Data format (default is FP16)
  localparam int unsigned  Height      = ARRAY_HEIGHT         , // Number of PEs within a row
  localparam int unsigned  Width       = ARRAY_WIDTH          , // Number of parallel rows
  localparam int unsigned  NumPipeRegs = PIPE_REGS            , // Number of pipeline registers within each PE
  localparam pipe_config_t PipeConfig  = DISTRIBUTED          ,
  localparam int unsigned  BITW        = fp_width(FpFormat)     // Number of bits for the given format
)(
  // global signals
  input  logic                    clk_i           ,
  input  logic                    rst_ni          ,
  input  logic                    test_mode_i     ,
  // events
  output logic [N_CORES-1:0][1:0] evt_o           ,
  output logic                    busy_o          ,
  // tcdm master ports
  output logic [      MP-1:0]       tcdm_req_o    ,
  input  logic [      MP-1:0]       tcdm_gnt_i    ,
  output logic [      MP-1:0][31:0] tcdm_add_o    ,
  output logic [      MP-1:0]       tcdm_wen_o    ,
  output logic [      MP-1:0][ 3:0] tcdm_be_o     ,
  output logic [      MP-1:0][31:0] tcdm_data_o   ,
  output logic [      EW-1:0]       tcdm_ecc_o    ,
  input  logic [      MP-1:0][31:0] tcdm_r_data_i ,
  input  logic [      MP-1:0]       tcdm_r_valid_i,
  input  logic                      tcdm_r_opc_i  ,
  input  logic                      tcdm_r_user_i ,
  input  logic [      EW-1:0]       tcdm_r_ecc_i  ,
  // periph slave port
  input  logic                periph_req_i    ,
  output logic                periph_gnt_o    ,
  input  logic [        31:0] periph_add_i    ,
  input  logic                periph_wen_i    ,
  input  logic [         3:0] periph_be_i     ,
  input  logic [        31:0] periph_data_i   ,
  input  logic [ID_WIDTH-1:0] periph_id_i     ,
  output logic [        31:0] periph_r_data_o ,
  output logic                periph_r_valid_o,
  output logic [ID_WIDTH-1:0] periph_r_id_o
);

localparam hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '{
  DW:  DW,
  AW:  DEFAULT_AW,
  BW:  DEFAULT_BW,
  UW:  DEFAULT_UW,
  IW:  DEFAULT_IW,
  EW:  EW,
  EHW: DEFAULT_EHW
};

hci_core_intf #(
`ifndef SYNTHESIS
  .WAIVE_RSP3_ASSERT ( 1'b1 ), // waive RSP-3 on memory-side of HCI FIFO
  .WAIVE_RSP5_ASSERT ( 1'b1 ), // waive RSP-5 on memory-side of HCI FIFO
`endif
  .DW ( DW ),
  .EW ( EW ) ) tcdm ( .clk ( clk_i ) );

hwpe_ctrl_intf_periph #(.ID_WIDTH(ID_WIDTH)) periph (.clk(clk_i));

// Outputs of redmule_top. They are the only drivers of busy/evt; the module
// outputs busy_o/evt_o are derived from them in exactly one place per branch
// below. Previously redmule_top drove busy_o/evt_o directly, which collided with
// the registered assignments of the REDMULE_HWPE_SYNTH branch.
logic                    busy;
logic [N_CORES-1:0][1:0] evt;

`ifdef REDMULE_HWPE_SYNTH
  always_ff @(posedge clk_i, negedge rst_ni) begin
    if (~rst_ni) begin
      // TCDM port
      for (int ii = 0; ii < MP; ii++) begin
        tcdm_req_o  [ii] <= '0;
        tcdm_add_o  [ii] <= '0;
        tcdm_wen_o  [ii] <= '0;
        tcdm_be_o   [ii] <= '0;
        tcdm_data_o [ii] <= '0;
      end
      tcdm_ecc_o       <= '0;
      tcdm.gnt         <= '0;
      tcdm.r_valid     <= '0;
      tcdm.r_data      <= '0;
      tcdm.r_opc       <= '0;
      tcdm.r_user      <= '0;
      tcdm.r_ecc       <= '0;
      tcdm.r_id        <= '0;
      tcdm.egnt        <= '0;
      tcdm.r_evalid    <= '0;
      // Control port
      periph.req       <= '0;
      periph.add       <= '0;
      periph.wen       <= '0;
      periph.be        <= '0;
      periph.data      <= '0;
      periph.id        <= '0;
      periph_gnt_o     <= '0;
      periph_r_data_o  <= '0;
      periph_r_valid_o <= '0;
      periph_r_id_o    <= '0;
      // Other
      busy_o           <= '0;
      evt_o            <= '0;
    end else begin
      // TCDM port: the wide request is replicated on all MP ports, each port
      // addressing its own 32-bit word (byte offset ii*4).
      for (int ii = 0; ii < MP; ii++) begin
        tcdm_req_o  [ii] <= tcdm.req;
        tcdm_add_o  [ii] <= tcdm.add + ii*4;
        tcdm_wen_o  [ii] <= tcdm.wen;
        tcdm_be_o   [ii] <= tcdm.be[ii*4+:4];
        tcdm_data_o [ii] <= tcdm.data[ii*32+:32];
      end
      tcdm_ecc_o       <= tcdm.ecc;
      // Grant / valid only when all ports agree
      tcdm.gnt         <= &(tcdm_gnt_i);
      tcdm.r_valid     <= &(tcdm_r_valid_i);
      tcdm.r_data      <= { >> {tcdm_r_data_i} };
      tcdm.r_opc       <= tcdm_r_opc_i;
      tcdm.r_user      <= tcdm_r_user_i;
      tcdm.r_ecc       <= tcdm_r_ecc_i;
      tcdm.r_id        <= '0;
      tcdm.egnt        <= '1;
      tcdm.r_evalid    <= '0;
      // Control port
      periph.req       <= periph_req_i;
      periph.add       <= periph_add_i;
      periph.wen       <= periph_wen_i;
      periph.be        <= periph_be_i;
      periph.data      <= periph_data_i;
      periph.id        <= periph_id_i;
      periph_gnt_o     <= periph.gnt;
      periph_r_data_o  <= periph.r_data;
      periph_r_valid_o <= periph.r_valid;
      periph_r_id_o    <= periph.r_id;
      // Other
      busy_o           <= busy;
      evt_o            <= evt;
    end
  end
`else
  // Combinational binding, mirroring the registered branch above.
  // NOTE(review): the first lines of this branch were garbled in the reviewed copy
  // of the file and have been restored from the registered branch; the generate
  // label is a guess -- confirm against the repository.
  for(genvar ii=0; ii<MP; ii++) begin: gen_tcdm_binding
    assign tcdm_req_o  [ii] = tcdm.req;
    assign tcdm_add_o  [ii] = tcdm.add + ii*4;
    assign tcdm_wen_o  [ii] = tcdm.wen;
    assign tcdm_be_o   [ii] = tcdm.be[ii*4+:4];
    assign tcdm_data_o [ii] = tcdm.data[ii*32+:32];
  end
  assign tcdm_ecc_o    = tcdm.ecc;
  assign tcdm.gnt      = &(tcdm_gnt_i);
  assign tcdm.r_valid  = &(tcdm_r_valid_i);
  assign tcdm.r_data   = { >> {tcdm_r_data_i} };
  assign tcdm.r_opc    = tcdm_r_opc_i;
  assign tcdm.r_user   = tcdm_r_user_i;
  assign tcdm.r_ecc    = tcdm_r_ecc_i;
  assign tcdm.r_id     = '0;
  assign tcdm.egnt     = '1;
  assign tcdm.r_evalid = '0;

  assign periph.req       = periph_req_i;
  assign periph.add       = periph_add_i;
  assign periph.wen       = periph_wen_i;
  assign periph.be        = periph_be_i;
  assign periph.data      = periph_data_i;
  assign periph.id        = periph_id_i;
  assign periph_gnt_o     = periph.gnt;
  assign periph_r_data_o  = periph.r_data;
  assign periph_r_valid_o = periph.r_valid;
  assign periph_r_id_o    = periph.r_id;

  assign busy_o = busy;
  assign evt_o  = evt;
`endif

redmule_top #(
  .ID_WIDTH              ( ID_WIDTH              ),
  .N_CORES               ( N_CORES               ),
  .DW                    ( DW                    ),
  .`HCI_SIZE_PARAM(tcdm) ( `HCI_SIZE_PARAM(tcdm) )
) i_redmule_top (
  .clk_i       ( clk_i       ),
  .rst_ni      ( rst_ni      ),
  .test_mode_i ( test_mode_i ),
  .evt_o       ( evt         ),
  .busy_o      ( busy        ),
  .tcdm        ( tcdm        ),
  .periph      ( periph      )
);

endmodule: redmule_wrap
--------------------------------------------------------------------------------
/rtl/redmule_castin.sv:
--------------------------------------------------------------------------------
// Copyright 2023 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51
//
// Yvan Tortorella
//

import fpnew_pkg::*;
import hci_package::*;
import redmule_pkg::*;

// Input cast stage.
//
// When cast_i is high, the lower (DATA_W - DW_CUT) bits of src_i are split into
// MIN_FMT-bit elements; each element is zero-extended and sent through its own
// fpnew_cast_multi instance (source format src_fmt_i, destination DstFormat), and
// the WIDTH-bit results are packed side by side into dst_o.
// When cast_i is low, src_i is forwarded to dst_o unchanged.
module redmule_castin #(
  parameter int unsigned            DATA_W       = redmule_pkg::DATA_W      ,
  // The defaults are qualified with the package name: the parameters have the same
  // names as the package constants, so an unqualified default reads as a
  // self-reference and depends on how the tool resolves the wildcard import.
  parameter fpnew_pkg::fmt_logic_t  FpFmtConfig  = redmule_pkg::FpFmtConfig ,
  parameter fpnew_pkg::ifmt_logic_t IntFmtConfig = redmule_pkg::IntFmtConfig,
  parameter fpnew_pkg::fp_format_e  DstFormat    = FPFORMAT                 ,
  parameter fpnew_pkg::operation_e  Operation    = CAST_OP                  ,
  parameter logic                   Pipe         = 1'b0                     ,
  localparam int unsigned           BW           = hci_package::DEFAULT_BW  ,
  localparam int unsigned           OW           = ADDR_W                   ,
  localparam int unsigned           UW           = hci_package::DEFAULT_UW  ,
  localparam int unsigned           WIDTH        = fpnew_pkg::maximum(fpnew_pkg::max_fp_width(FpFmtConfig),
                                                                      fpnew_pkg::max_int_width(IntFmtConfig))
)(
  input  logic                   clk_i    ,
  input  logic                   rst_ni   ,
  input  logic                   clear_i  ,
  input  logic                   cast_i   ,
  input  logic [DATA_W-1:0]      src_i    ,
  input  fpnew_pkg::fp_format_e  src_fmt_i,
  output logic [DATA_W-1:0]      dst_o
);

// One cast unit per output element of width BITW (BITW comes from the imported packages)
localparam int unsigned NUM_CAST = DATA_W/BITW;
localparam int unsigned NARRBITW = fpnew_pkg::fp_width(fpnew_pkg::FP8);
// localparam int unsigned ZEROBITS = WIDTH - NARRBITW;
// Number of zero bits prepended to each MIN_FMT-bit source element
localparam int unsigned ZEROBITS = MIN_FMT;
localparam fpnew_pkg::int_format_e INT_SRC = fpnew_pkg::INT8;

// Source word with its upper DW_CUT bits forced to zero
logic [DATA_W-1:0] src_int;

assign src_int[DATA_W-DW_CUT-1:0]      = src_i[DATA_W-DW_CUT-1:0];
assign src_int[DATA_W-1:DATA_W-DW_CUT] = '0;

logic [DATA_W-1:0]              dst_int;
logic [NUM_CAST-1:0][WIDTH-1:0] result ,
                                operand;

generate
  for (genvar i = 0; i < NUM_CAST; i++) begin : gen_cast_units

    // i-th narrow element, zero-extended on the MSB side
    assign operand [i] = {{ZEROBITS{1'b0}}, src_int[i*MIN_FMT+:MIN_FMT]};

    // Input is always flagged valid and the output is always accepted; the
    // handshake, status and tag outputs of the unit are left unconnected.
    fpnew_cast_multi #(
      .FpFmtConfig  ( FpFmtConfig  ),
      .IntFmtConfig ( IntFmtConfig )
    ) redmule_cast_i (
      .clk_i           ( clk_i          ),
      .rst_ni          ( rst_ni         ),
      .operands_i      ( operand [i]    ),
      .is_boxed_i      ( '1             ),
      .rnd_mode_i      ( fpnew_pkg::RNE ),
      .op_i            ( Operation      ),
      .op_mod_i        ( '0             ),
      .src_fmt_i       ( src_fmt_i      ),
      .dst_fmt_i       ( DstFormat      ),
      .int_fmt_i       ( INT_SRC        ),
      .tag_i           ( '0             ),
      .mask_i          ( '0             ),
      .aux_i           ( '0             ),
      .in_valid_i      ( '1             ),
      .in_ready_o      (                ),
      .flush_i         ( '0             ),
      .result_o        ( result [i]     ),
      .status_o        (                ),
      .extension_bit_o (                ),
      .tag_o           (                ),
      .mask_o          (                ),
      .aux_o           (                ),
      .out_valid_o     (                ),
      .out_ready_i     ( '1             ),
      .busy_o          (                )
    );

    // Pack the WIDTH-bit results contiguously
    assign dst_int [i*WIDTH+:WIDTH] = result[i];

  end

endgenerate

// Bypass the cast units when no cast is requested
assign dst_o = cast_i ? dst_int : src_i;

endmodule : redmule_castin

--------------------------------------------------------------------------------
/rtl/redmule_castout.sv:
--------------------------------------------------------------------------------
1 | // Copyright 2023 ETH Zurich and University of Bologna.
2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details.
3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | import fpnew_pkg::*; 9 | import hci_package::*; 10 | import redmule_pkg::*; 11 | 12 | module redmule_castout #( 13 | parameter int unsigned DATA_W = redmule_pkg::DATA_W, 14 | parameter fpnew_pkg::fmt_logic_t FpFmtConfig = FpFmtConfig, 15 | parameter fpnew_pkg::ifmt_logic_t IntFmtConfig = IntFmtConfig, 16 | parameter fpnew_pkg::fp_format_e SrcFormat = FPFORMAT, 17 | parameter fpnew_pkg::operation_e Operation = CAST_OP, 18 | parameter logic Pipe = 1'b0 , 19 | localparam int unsigned BW = hci_package::DEFAULT_BW , 20 | localparam int unsigned OW = ADDR_W , 21 | localparam int unsigned UW = hci_package::DEFAULT_UW , 22 | localparam int unsigned WIDTH = fpnew_pkg::maximum(fpnew_pkg::max_fp_width(FpFmtConfig), 23 | fpnew_pkg::max_int_width(IntFmtConfig)) 24 | )( 25 | input logic clk_i , 26 | input logic rst_ni , 27 | input logic clear_i , 28 | input logic cast_i , 29 | input logic [DATA_W-1:0] src_i , 30 | input fpnew_pkg::fp_format_e dst_fmt_i, 31 | output logic [DATA_W-1:0] dst_o 32 | ); 33 | 34 | localparam int unsigned NUM_CAST = DATA_W/BITW; 35 | localparam int unsigned NARRBITW = fpnew_pkg::fp_width(fpnew_pkg::FP8); 36 | // localparam int unsigned ZEROBITS = WIDTH - NARRBITW; 37 | localparam int unsigned ZEROBITS = MIN_FMT; 38 | localparam fpnew_pkg::int_format_e INT_SRC = fpnew_pkg::INT8; 39 | 40 | logic [DATA_W-1:0] dst_int, 41 | res; 42 | logic [NUM_CAST-1:0][WIDTH-1:0] result , 43 | operand; 44 | 45 | generate 46 | for (genvar i = 0; i < NUM_CAST; i++) begin : gen_cast_units 47 | 48 | assign operand [i] = src_i[i*WIDTH+:WIDTH]; 49 | 50 | fpnew_cast_multi #( 51 | .FpFmtConfig ( FpFmtConfig ), 52 | .IntFmtConfig ( IntFmtConfig ) 53 | ) redmule_cast_i ( 54 | .clk_i ( clk_i ), 55 | .rst_ni ( rst_ni ), 56 | .operands_i ( operand [i] ), 57 | .is_boxed_i ( '1 ), 58 | .rnd_mode_i ( fpnew_pkg::RNE ), 59 | .op_i ( Operation ), 60 | .op_mod_i ( '0 ), 61 | .src_fmt_i ( SrcFormat ), 62 | 
.dst_fmt_i ( dst_fmt_i ), 63 | .int_fmt_i ( INT_SRC ), 64 | .tag_i ( '0 ), 65 | .mask_i ( '0 ), 66 | .aux_i ( '0 ), 67 | .in_valid_i ( '1 ), 68 | .in_ready_o ( ), 69 | .flush_i ( '0 ), 70 | .result_o ( result [i] ), 71 | .status_o ( ), 72 | .extension_bit_o ( ), 73 | .tag_o ( ), 74 | .mask_o ( ), 75 | .aux_o ( ), 76 | .out_valid_o ( ), 77 | .out_ready_i ( '1 ), 78 | .busy_o ( ) 79 | ); 80 | 81 | assign res [i*MIN_FMT+:MIN_FMT] = result[i][WIDTH-MIN_FMT-1:0]; 82 | 83 | end 84 | 85 | endgenerate 86 | 87 | assign dst_int = {{DATA_W-DW_CUT{1'b0}}, res[DATA_W-DW_CUT-1:0]}; 88 | 89 | assign dst_o = cast_i ? dst_int : src_i; 90 | 91 | endmodule : redmule_castout 92 | -------------------------------------------------------------------------------- /rtl/redmule_ctrl.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Yvan Tortorella 6 | // Andrea Belano 7 | // 8 | 9 | import redmule_pkg::*; 10 | 11 | module redmule_ctrl 12 | import hwpe_ctrl_package::*; 13 | #( 14 | parameter int unsigned N_CORES = 8 , 15 | parameter int unsigned IO_REGS = REDMULE_REGS , 16 | parameter int unsigned ID_WIDTH = 8 , 17 | parameter int unsigned SysDataWidth = 32 , 18 | parameter int unsigned N_CONTEXT = 2 , 19 | parameter int unsigned Height = 4 , 20 | parameter int unsigned Width = 8 , 21 | parameter int unsigned NumPipeRegs = 3 , 22 | localparam int unsigned TILE = (NumPipeRegs +1)*Height 23 | )( 24 | input logic clk_i , 25 | input logic rst_ni , 26 | input logic test_mode_i , 27 | output logic busy_o , 28 | output logic clear_o , 29 | output logic [N_CORES-1:0][1:0] evt_o , 30 | output ctrl_regfile_t reg_file_o , 31 | input logic reg_enable_i , 32 | input logic start_cfg_i , 33 | input flgs_streamer_t flgs_streamer_i , 34 | output logic cfg_complete_o , 35 | // Flags coming 
from the state machine 36 | input logic w_loaded_i , 37 | // Control signals for the engine 38 | output logic flush_o , 39 | // Control signals for the state machine 40 | output cntrl_scheduler_t cntrl_scheduler_o , 41 | output cntrl_flags_t cntrl_flags_o, 42 | // Peripheral slave port 43 | hwpe_ctrl_intf_periph.slave periph 44 | ); 45 | 46 | logic clear, latch_clear; 47 | logic tiler_setback, tiler_valid; 48 | 49 | typedef enum logic [2:0] { 50 | REDMULE_LATCH_RST, 51 | REDMULE_IDLE, 52 | REDMULE_STARTING, 53 | REDMULE_COMPUTING, 54 | REDMULE_FINISHED 55 | } redmule_ctrl_state_e; 56 | 57 | redmule_ctrl_state_e current, next; 58 | 59 | hwpe_ctrl_package::ctrl_regfile_t reg_file_d, reg_file_q; 60 | hwpe_ctrl_package::ctrl_slave_t cntrl_slave; 61 | hwpe_ctrl_package::flags_slave_t flgs_slave; 62 | 63 | // Control slave interface 64 | hwpe_ctrl_slave #( 65 | .REGFILE_SCM ( 0 ), 66 | .N_CORES ( N_CORES ), 67 | .N_CONTEXT ( N_CONTEXT ), 68 | .N_IO_REGS ( REDMULE_REGS ), 69 | .N_GENERIC_REGS ( 6 ), 70 | .ID_WIDTH ( ID_WIDTH ) 71 | ) i_slave ( 72 | .clk_i ( clk_i ), 73 | .rst_ni ( rst_ni ), 74 | .clear_o ( clear ), 75 | .cfg ( periph ), 76 | .ctrl_i ( cntrl_slave ), 77 | .flags_o ( flgs_slave ), 78 | .reg_file ( reg_file_d ) 79 | ); 80 | 81 | redmule_tiler i_cfg_tiler ( 82 | .clk_i ( clk_i ), 83 | .rst_ni ( rst_ni ), 84 | .clear_i ( clear ), 85 | .setback_i ( tiler_setback ), 86 | .start_cfg_i ( start_cfg_i ), 87 | .reg_file_i ( reg_file_d ), 88 | .valid_o ( tiler_valid ), 89 | .reg_file_o ( reg_file_q ) 90 | ); 91 | 92 | assign cfg_complete_o = tiler_valid; 93 | /*---------------------------------------------------------------------------------------------*/ 94 | /* Register island */ 95 | /*---------------------------------------------------------------------------------------------*/ 96 | 97 | // State register 98 | always_ff @(posedge clk_i or negedge rst_ni) begin : state_register 99 | if(~rst_ni) begin 100 | current <= REDMULE_LATCH_RST; 101 | end else begin 102 | 
if (clear) 103 | current <= REDMULE_IDLE; 104 | else 105 | current <= next; 106 | end 107 | end 108 | 109 | logic slave_start; 110 | always_ff @(posedge clk_i, negedge rst_ni) begin 111 | if (~rst_ni) begin 112 | slave_start <= 1'b0; 113 | end else begin 114 | if (clear || tiler_setback) 115 | slave_start <= 1'b0; 116 | else if (flgs_slave.start) 117 | slave_start <= 1'b1; 118 | end 119 | end 120 | 121 | /*---------------------------------------------------------------------------------------------*/ 122 | /* Register file assignment */ 123 | /*---------------------------------------------------------------------------------------------*/ 124 | assign reg_file_o = reg_file_q; 125 | 126 | /*---------------------------------------------------------------------------------------------*/ 127 | /* Controller FSM */ 128 | /*---------------------------------------------------------------------------------------------*/ 129 | 130 | assign cntrl_scheduler_o.first_load = current == REDMULE_STARTING; 131 | assign tiler_setback = current == REDMULE_IDLE && next == REDMULE_STARTING; 132 | assign busy_o = current != REDMULE_LATCH_RST && current != REDMULE_IDLE && current != REDMULE_FINISHED; 133 | assign flush_o = current == REDMULE_FINISHED; 134 | assign cntrl_scheduler_o.rst = current == REDMULE_FINISHED; 135 | assign cntrl_scheduler_o.finished = current == REDMULE_FINISHED; 136 | assign latch_clear = current == REDMULE_LATCH_RST; 137 | 138 | always_comb begin : controller_fsm 139 | cntrl_flags_o.idle = 1'b0; 140 | cntrl_slave = '0; 141 | next = current; 142 | 143 | case (current) 144 | REDMULE_LATCH_RST: begin 145 | cntrl_flags_o.idle = 1'b1; 146 | next = REDMULE_IDLE; 147 | end 148 | 149 | REDMULE_IDLE: begin 150 | cntrl_flags_o.idle = 1'b1; 151 | if ((slave_start & tiler_valid) || test_mode_i) begin 152 | next = REDMULE_STARTING; 153 | end 154 | end 155 | 156 | REDMULE_STARTING: begin 157 | if (w_loaded_i) begin 158 | next = REDMULE_COMPUTING; 159 | end 160 | end 161 | 162 
// RedMulE computing array.
//
// Instantiates W rows (redmule_row), each fed by its own gated clock, and a
// per-row multiplexer that selects what enters the row as accumulation input:
// the external bias (y_bias_i) or the row's own result (partial-sum feedback).
//
// Parameters:
//   FpFormat    - floating-point format of the operands (element width = BITW)
//   Height (H)  - number of PEs per row
//   Width  (W)  - number of rows instantiated in parallel
//   NumPipeRegs - forwarded to every row
//   PipeConfig  - forwarded to every row
//   TagType     - type of tag_i / tag_o, forwarded to every row
//   AuxType     - type of aux_i / aux_o, forwarded to every row
module redmule_engine
  import fpnew_pkg::*;
  import redmule_pkg::*;
#(
  parameter  fp_format_e   FpFormat    = FP16                         ,
  parameter  int unsigned  Height      = 4                            , // Number of PEs per row
  parameter  int unsigned  Width       = 8                            , // Number of parallel rows
  parameter  int unsigned  NumPipeRegs = 3                            ,
  parameter  pipe_config_t PipeConfig  = DISTRIBUTED                  ,
  parameter  type          TagType     = logic                        ,
  parameter  type          AuxType     = logic                        ,
  localparam int unsigned  BITW        = fpnew_pkg::fp_width(FpFormat), // Number of bits for the given format
  localparam int unsigned  H           = Height                       ,
  localparam int unsigned  W           = Width                        ,
  localparam int unsigned  DELAY       = NumPipeRegs+1
)(
  input  logic                                     clk_i             ,
  input  logic                                     rst_ni            ,
  // Input Elements
  input  logic                  [W-1:0][H-1:0][BITW-1:0] x_input_i   , // One H-element operand vector per row
  input  logic                         [H-1:0][BITW-1:0] w_input_i   , // Weights, shared by all rows
  input  logic                  [W-1:0]       [BITW-1:0] y_bias_i    , // One bias element per row
  // Output Result
  output logic                  [W-1:0]       [BITW-1:0] z_output_o  , // One result element per row
  // fpnew_fma Input Signals
  input  logic                  [2:0]              fma_is_boxed_i    ,
  input  logic                  [1:0]              noncomp_is_boxed_i,
  input  fpnew_pkg::roundmode_e                    stage1_rnd_i      ,
  input  fpnew_pkg::roundmode_e                    stage2_rnd_i      ,
  input  fpnew_pkg::operation_e                    op1_i             ,
  input  fpnew_pkg::operation_e                    op2_i             ,
  input  logic                                     op_mod_i          ,
  input  TagType                                   tag_i             ,
  input  AuxType                                   aux_i             ,
  // fpnew_fma Input Handshake
  input  logic                                     in_valid_i        ,
  output logic                  [W-1:0][H-1:0]     in_ready_o        ,
  input  logic                                     reg_enable_i      ,
  input  logic                                     flush_i           ,
  // fpnew_fma Output signals
  output fpnew_pkg::status_t    [W-1:0][H-1:0]     status_o          ,
  output logic                  [W-1:0][H-1:0]     extension_bit_o   ,
  output fpnew_pkg::classmask_e [W-1:0][H-1:0]     class_mask_o      ,
  output logic                  [W-1:0][H-1:0]     is_class_o        ,
  output TagType                [W-1:0][H-1:0]     tag_o             ,
  output AuxType                [W-1:0][H-1:0]     aux_o             ,
  // fpnew_fma Output handshake
  output logic                  [W-1:0][H-1:0]     out_valid_o       ,
  input  logic                                     out_ready_i       ,
  // fpnew_fma Indication of valid data in flight
  output logic                  [W-1:0][H-1:0]     busy_o            ,
  // control bus from FSM
  input  cntrl_engine_t                            ctrl_engine_i
);

  logic [W-1:0]            row_clk;          // Gated clock of each row
  logic [W-1:0][BITW-1:0]  result, feedback; // Row outputs / row accumulation inputs

  generate
    for (genvar index = 0; index < W; index++) begin: gen_redmule_rows
      // Each row runs on its own gated clock, enabled by the control FSM.
      tc_clk_gating i_row_clk_gating (
        .clk_i     ( clk_i                                ),
        .en_i      ( ctrl_engine_i.row_clk_gate_en[index] ),
        .test_en_i ( '0                                   ),
        .clk_o     ( row_clk[index]                       )
      );

      redmule_row #(
        .FpFormat    ( FpFormat    ),
        .Height      ( H           ),
        .NumPipeRegs ( NumPipeRegs ),
        .PipeConfig  ( PipeConfig  ),
        // Forward the tag/aux types so the tag_*/aux_* port types of the row
        // match this module's ports when the defaults are overridden.
        .TagType     ( TagType     ),
        .AuxType     ( AuxType     )
      ) i_row (
        .clk_i              ( row_clk         [index] ),
        .rst_ni             ( rst_ni                  ),
        .x_input_i          ( x_input_i       [index] ),
        .w_input_i          ( w_input_i               ),
        .y_bias_i           ( feedback        [index] ),
        .z_output_o         ( result          [index] ),
        .fma_is_boxed_i     ( fma_is_boxed_i          ),
        .noncomp_is_boxed_i ( noncomp_is_boxed_i      ),
        .stage1_rnd_i       ( stage1_rnd_i            ),
        .stage2_rnd_i       ( stage2_rnd_i            ),
        .op1_i              ( op1_i                   ),
        .op2_i              ( op2_i                   ),
        .op_mod_i           ( op_mod_i                ),
        .tag_i              ( tag_i                   ),
        .aux_i              ( aux_i                   ),
        .in_valid_i         ( in_valid_i              ),
        .in_ready_o         ( in_ready_o      [index] ),
        .reg_enable_i       ( reg_enable_i            ),
        .flush_i            ( flush_i                 ),
        .status_o           ( status_o        [index] ),
        .extension_bit_o    ( extension_bit_o [index] ),
        .class_mask_o       ( class_mask_o    [index] ),
        .is_class_o         ( is_class_o      [index] ),
        .tag_o              ( tag_o           [index] ),
        .aux_o              ( aux_o           [index] ),
        .out_valid_o        ( out_valid_o     [index] ),
        .out_ready_i        ( out_ready_i             ),
        .busy_o             ( busy_o          [index] )
      );

      // In case the input matrix is bigger than the array, the partial result
      // of the row is fed back as accumulation input to continue the
      // computation; otherwise the external bias is used.
      always_comb begin : partial_product_feedback
        if (ctrl_engine_i.accumulate)
          feedback[index] = result[index];
        else
          feedback[index] = y_bias_i[index];
      end
    end
  endgenerate

  assign z_output_o = result;

endmodule : redmule_engine
// Decodes instructions offloaded through the CORE-V eXtension interface (XIF)
// and turns them into accesses on the HWPE peripheral control port.
//
// Operation, as visible in this module:
//  - MCNFIG and MARITH instructions are accepted and their operands are latched
//    into cfg_reg_q (NumCfgRegs words) on a clock that is gated by clk_en.
//  - An MARITH with valid source registers raises cfg_ready, which starts the
//    cfg_fsm: it issues one periph request per configuration word at address
//    'h40 + 4*reg_offs, then pulses start_cfg_o, then (once cfg_complete_i is
//    high) issues a final request at address 0 with data 0.
//  - RVCSR instructions are always acknowledged (issue_ready) but only
//    accepted when the CSR address equals CSR_REDMULE_MACFG.
//  - The compressed and memory XIF interfaces are tied off.
module redmule_inst_decoder
  import redmule_pkg::*;
  import cv32e40x_pkg::*;
#(
  parameter  int unsigned SysInstWidth  = 32            ,
  parameter  int unsigned SysDataWidth  = 32            ,
  parameter  int unsigned NumRfReadPrts = 2             ,
  parameter  int unsigned OpWidth       = 3             ,
  parameter  int unsigned FormatWidth   = 3             ,
  parameter  int unsigned OpCodeWidth   = 7             ,
  parameter  int unsigned NumCfgRegs    = 6             ,
  localparam int unsigned SizeLarge     = SysDataWidth/2,
  localparam int unsigned SizeSmall     = SysDataWidth/4
)(
  input  logic                      clk_i              ,
  input  logic                      rst_ni             ,
  input  logic                      clear_i            ,
  cv32e40x_if_xif.coproc_issue      xif_issue_if_i     ,
  cv32e40x_if_xif.coproc_result     xif_result_if_o    ,
  cv32e40x_if_xif.coproc_compressed xif_compressed_if_i,
  cv32e40x_if_xif.coproc_mem        xif_mem_if_o       ,
  hwpe_ctrl_intf_periph.master      periph             ,
  input  logic                      cfg_complete_i     ,
  output logic                      start_cfg_o
);

  logic [OpCodeWidth-1:0] op_code;
  logic [    OpWidth-1:0] operation_d;
  logic [FormatWidth-1:0] format_d;
  // Configuration words captured from the offloaded instructions:
  //   [0] Rs1 of MARITH, [1] Rs2 of MARITH, [2] Rs3 of MARITH,
  //   [3] Rs1 of MCNFIG, [4] Rs2 of MCNFIG, [5] the MARITH instruction word.
  logic [NumCfgRegs-1:0][SysDataWidth-1:0] cfg_reg_d, cfg_reg_q;
  logic is_gemm_d,
        widening_d,
        custom_fmt_d;
  logic clk_en, clk_int;
  logic cfg_ready;
  logic count_rst, count_update;
  // Index of the configuration word currently written by the FSM.
  logic [NumCfgRegs-1:0] reg_offs;

  typedef enum logic [1:0] {
    Idle,
    WriteCfg,
    Trigger
  } redmule_instr_cfg_state_e;

  redmule_instr_cfg_state_e current, next;

  // Xif static binding: the compressed and memory interfaces are never used.
  assign xif_compressed_if_i.compressed_ready = 1'b0;
  assign xif_compressed_if_i.compressed_resp  = '0;

  assign xif_mem_if_o.mem_valid = 1'b0;
  assign xif_mem_if_o.mem_req   = '0;

  // cfg_reg_q is clocked by clk_int, which only toggles while clk_en is high.
  tc_clk_gating i_arith_clock_gating (
    .clk_i     ( clk_i   ),
    .en_i      ( clk_en  ),
    .test_en_i ( '0      ),
    .clk_o     ( clk_int )
  );

  assign op_code = xif_issue_if_i.issue_req.instr[OpCodeWidth-1:0];
  // The result channel carries no data: valid simply mirrors ready.
  assign xif_result_if_o.result_valid = (xif_result_if_o.result_ready) ? 1'b1 : 1'b0;
  assign xif_result_if_o.result = '0;

  always_comb begin: opcode_decoder
    // Defaults: nothing accepted, registers hold their value.
    clk_en       = '0;
    cfg_ready    = '0;
    cfg_reg_d    = cfg_reg_q;
    xif_issue_if_i.issue_ready = 1'b0;
    xif_issue_if_i.issue_resp  = '0;
    is_gemm_d    = '0;
    widening_d   = '0;
    custom_fmt_d = '0;
    format_d     = '0;
    operation_d  = '0;

    if (xif_issue_if_i.issue_valid) begin
      case (op_code)
        MCNFIG: begin
          xif_issue_if_i.issue_ready = 1'b1;
          xif_issue_if_i.issue_resp.accept = 1'b1;
          clk_en = 1'b1 | clear_i;
          if (xif_issue_if_i.issue_req.rs_valid) begin
            // HalfWord[0] of Rs1 contains M size
            cfg_reg_d[3][SizeLarge-1:0] = xif_issue_if_i.issue_req.rs[0][SizeLarge-1:0];
            // HalfWord[1] of Rs1 contains K size
            cfg_reg_d[3][SysDataWidth-1:SizeLarge] = xif_issue_if_i.issue_req.rs[0][SysDataWidth-1:SizeLarge];
            // Rs2 contains N size
            cfg_reg_d[4] = xif_issue_if_i.issue_req.rs[1];
          end
        end

        MARITH: begin
          xif_issue_if_i.issue_ready = 1'b1;
          xif_issue_if_i.issue_resp.accept = 1'b1;
          clk_en = 1'b1 | clear_i;
          cfg_reg_d[5] = xif_issue_if_i.issue_req.instr; // Arithmetic instruction
          if (xif_issue_if_i.issue_req.rs_valid) begin
            // All operands are available: kick off the configuration FSM.
            cfg_ready    = 1'b1;
            cfg_reg_d[0] = xif_issue_if_i.issue_req.rs[0]; // Rs1 contains X start pointer
            cfg_reg_d[1] = xif_issue_if_i.issue_req.rs[1]; // Rs2 contains W start pointer
            cfg_reg_d[2] = xif_issue_if_i.issue_req.rs[2]; // Rs3 contains Z start pointer
          end
        end

        /* The core will try to offload all CSR instructions to the coupled co-processor, so we need to
           check if the offloaded CSR instruction tries to access one of the CSRs available in RedMulE or
           not. If not, we need to raise the issue_ready to signal that we received the offload request,
           but keep the issue_resp.accept low to signal that we are not accepting the instruction.
           For further details, look at the CORE-V Extension Interface documentation
           (https://docs.openhwgroup.org/projects/openhw-group-core-v-xif/en/latest/x_ext.html#issue-interface)
           and at the following issue: https://github.com/openhwgroup/cv32e40x/issues/945. */
        RVCSR: begin
          xif_issue_if_i.issue_ready = 1'b1;
          // Both bounds are CSR_REDMULE_MACFG, so this range check matches
          // exactly one CSR address.
          if (xif_issue_if_i.issue_req.instr[31:20] <= CSR_REDMULE_MACFG &&
              xif_issue_if_i.issue_req.instr[31:20] >= CSR_REDMULE_MACFG) begin
            xif_issue_if_i.issue_resp.accept = 1'b1;
          end else begin
            xif_issue_if_i.issue_resp.accept = 1'b0;
          end
        end
      endcase
    end
  end

  // Configuration registers, on the gated clock.
  always_ff @(posedge clk_int, negedge rst_ni) begin : arith_pipe
    if (~rst_ni) begin
      cfg_reg_q <= '0;
    end else begin
      if (clear_i) begin
        cfg_reg_q <= '0;
      end else begin
        cfg_reg_q <= cfg_reg_d;
      end
    end
  end

  // FSM state register.
  always_ff @(posedge clk_i, negedge rst_ni) begin : slave_cfg
    if (~rst_ni) begin
      current <= Idle;
    end else begin
      if (clear_i)
        current <= Idle;
      else
        current <= next;
    end
  end

  // Counts the configuration words already granted on the periph port.
  always_ff @(posedge clk_i, negedge rst_ni) begin : ptrs_counter
    if (~rst_ni) begin
      reg_offs <= '0;
    end else begin
      if (clear_i | count_rst)
        reg_offs <= '0;
      else if (count_update)
        reg_offs <= reg_offs + 1;
    end
  end

  always_comb begin : cfg_fsm
    // Defaults: no request on the periph port, stay in the current state.
    count_rst    = '0;
    count_update = '0;
    next         = current;
    periph.req   = '0;
    periph.wen   = '0;
    periph.be    = '0;
    periph.add   = '0;
    periph.id    = '0;
    periph.data  = '0;
    start_cfg_o  = 1'b0;

    case (current)
      Idle: begin
        if (cfg_ready)
          next = WriteCfg;
      end

      WriteCfg: begin
        // One request per configuration word, 4 bytes apart starting at 'h40.
        // NOTE(review): wen = 0 presumably encodes a write on this port; confirm
        // against the hwpe_ctrl_intf_periph definition.
        periph.req  = 1'b1;
        periph.wen  = 1'b0;
        periph.be   = '1;
        periph.add  = 'h40 + 4*reg_offs;
        periph.id   = '0;
        periph.data = cfg_reg_q[reg_offs];
        if (periph.gnt) begin
          count_update = 1'b1;
          if (reg_offs == NumCfgRegs - 1) begin
            // Last word granted: signal the start of the configuration and
            // restart the word counter (count_rst wins over count_update).
            next        = Trigger;
            start_cfg_o = 1'b1;
            count_rst   = 1'b1;
          end
        end
      end

      Trigger: begin
        // Wait for cfg_complete_i, then issue a request at address 0 with
        // data 0. NOTE(review): presumably this is the trigger register of
        // the HWPE register file; confirm against the register map.
        if (cfg_complete_i) begin
          periph.req  = 1'b1;
          periph.wen  = 1'b0;
          periph.be   = '1;
          periph.add  = '0;
          periph.id   = '0;
          periph.data = '0;

          if (periph.gnt)
            next = Idle;
        end
      end
    endcase
  end

endmodule: redmule_inst_decoder
// Generates the address-generation and start controls of the X, W, Y and Z
// streamers from the register file and from the streamer flags.
//
// The X stream is re-programmed at every X transfer: a set of counters,
// all advanced by x_stream_source_flags.done, tracks
//   - the column iteration inside the current row block (x_cols_iters_q),
//   - the W iteration (w_iters_q), advanced when the column iteration wraps,
//   - the row-block iteration (x_rows_iters_q), advanced when both wrap,
//   - the matching byte offsets (x_cols_offs_q, x_rows_offs_q),
//   - the total number of X transfers issued (tot_x_read_q).
// The W, Y and Z streams are programmed once, from register-file values, and
// started on cntrl_scheduler_i.first_load.
module redmule_memory_scheduler
  import redmule_pkg::*;
  import hwpe_ctrl_package::*;
#(
  parameter  int unsigned DW  = DATAW       ,
  parameter  int unsigned W   = ARRAY_WIDTH ,
  parameter  int unsigned H   = ARRAY_HEIGHT,
  parameter  int unsigned ELW = BITW        ,
  localparam int unsigned D   = TOT_DEPTH
) (
  input  logic             clk_i            ,
  input  logic             rst_ni           ,
  input  logic             clear_i          ,
  input  ctrl_regfile_t    reg_file_i       ,
  input  flgs_streamer_t   flgs_streamer_i  ,
  input  cntrl_scheduler_t cntrl_scheduler_i,
  input  cntrl_flags_t     cntrl_flags_i    ,
  output cntrl_streamer_t  cntrl_streamer_o
);
  // Byte offset added per column iteration (also used as d1 stride of W/Y/Z).
  localparam int unsigned JMP = NumByte*(DATA_W/MemDw - 1);

  logic [31:0] x_cols_offs_d, x_cols_offs_q;
  logic [31:0] x_rows_offs_d, x_rows_offs_q;

  logic [15:0] x_cols_iters_d, x_cols_iters_q,
               x_rows_iters_d, x_rows_iters_q;

  logic [15:0] w_iters_d, w_iters_q;

  logic [15:0] tot_x_read_d, tot_x_read_q;

  logic [$clog2(W):0] num_x_reads;

  // Shared iteration conditions
  logic x_done;      // An X transfer has completed
  logic last_x_col;  // Current column iteration is the last one
  logic last_w_iter; // Current W iteration is the last one
  logic last_x_row;  // Current row-block iteration is the last one

  assign x_done      = flgs_streamer_i.x_stream_source_flags.done;
  assign last_x_col  = (x_cols_iters_q == reg_file_i.hwpe_params[X_ITERS][15:0]-1);
  assign last_w_iter = (w_iters_q == reg_file_i.hwpe_params[W_ITERS][15:0]-1);
  assign last_x_row  = (x_rows_iters_q == reg_file_i.hwpe_params[X_ITERS][31:16]-1);

  always_ff @(posedge clk_i or negedge rst_ni) begin : x_cols_iters_register
    if (~rst_ni) begin
      x_cols_iters_q <= '0;
    end else begin
      if (clear_i) begin
        x_cols_iters_q <= '0;
      end else if (x_done) begin
        x_cols_iters_q <= x_cols_iters_d;
      end
    end
  end

  assign x_cols_iters_d = last_x_col ? '0 : x_cols_iters_q + 1;

  always_ff @(posedge clk_i or negedge rst_ni) begin : w_iters_register
    if (~rst_ni) begin
      w_iters_q <= '0;
    end else begin
      if (clear_i) begin
        w_iters_q <= '0;
      end else if (x_done && last_x_col) begin
        w_iters_q <= w_iters_d;
      end
    end
  end

  assign w_iters_d = last_w_iter ? '0 : w_iters_q + 1;

  always_ff @(posedge clk_i or negedge rst_ni) begin : x_rows_iters_register
    if (~rst_ni) begin
      x_rows_iters_q <= '0;
    end else begin
      if (clear_i) begin
        x_rows_iters_q <= '0;
      end else if (x_done && last_x_col && last_w_iter) begin
        x_rows_iters_q <= x_rows_iters_d;
      end
    end
  end

  assign x_rows_iters_d = last_x_row ? '0 : x_rows_iters_q + 1;

  // Total number of X transfers issued. This counter deliberately does not
  // wrap: x_stream_source_ctrl.req_start below stays low once it reaches
  // hwpe_params[TOT_X_READ].
  always_ff @(posedge clk_i or negedge rst_ni) begin : tot_x_read_register
    if (~rst_ni) begin
      tot_x_read_q <= '0;
    end else begin
      if (clear_i) begin
        tot_x_read_q <= '0;
      end else if (x_done) begin
        tot_x_read_q <= tot_x_read_q + 1;
      end
    end
  end

  // Wrapping next value of the transfer counter; not used by the register
  // above, kept for reference.
  assign tot_x_read_d = tot_x_read_q == reg_file_i.hwpe_params[TOT_X_READ] ? '0 : tot_x_read_q + 1;

  always_ff @(posedge clk_i or negedge rst_ni) begin : x_cols_offs_register
    if (~rst_ni) begin
      x_cols_offs_q <= '0;
    end else begin
      if (clear_i) begin
        x_cols_offs_q <= '0;
      end else if (x_done) begin
        x_cols_offs_q <= x_cols_offs_d;
      end
    end
  end

  assign x_cols_offs_d = last_x_col ? '0 : x_cols_offs_q + JMP;

  always_ff @(posedge clk_i or negedge rst_ni) begin : x_rows_offs_register
    if (~rst_ni) begin
      x_rows_offs_q <= '0;
    end else begin
      if (clear_i) begin
        x_rows_offs_q <= '0;
      end else if (x_done && last_x_col && last_w_iter) begin
        x_rows_offs_q <= x_rows_offs_d;
      end
    end
  end

  assign x_rows_offs_d = last_x_row ? '0 : x_rows_offs_q + reg_file_i.hwpe_params[X_ROWS_OFFS];

  // On the last row block, a non-zero leftover count replaces the full W reads.
  assign num_x_reads = last_x_row && reg_file_i.hwpe_params[LEFTOVERS][31:24] != '0 ? reg_file_i.hwpe_params[LEFTOVERS][31:24] : W;

  // Here we initialize the streamer source signals
  // for the X stream source
  assign cntrl_streamer_o.x_stream_source_ctrl.req_start = !cntrl_flags_i.idle && flgs_streamer_i.x_stream_source_flags.ready_start &&
                                                           (cntrl_scheduler_i.first_load || tot_x_read_q < reg_file_i.hwpe_params[TOT_X_READ]);
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.base_addr = reg_file_i.hwpe_params[X_ADDR]
                                                                           + x_rows_offs_q + x_cols_offs_q;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.tot_len = num_x_reads;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.d0_len = 'd1;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.d0_stride = 'd0;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.d1_len = W;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.d1_stride = reg_file_i.hwpe_params[X_D1_STRIDE];
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.d2_stride = '0;
  assign cntrl_streamer_o.x_stream_source_ctrl.addressgen_ctrl.dim_enable_1h = 2'b11;

  // Here we initialize the streamer source signals
  // for the W stream source.
  // The start request is gated on the W source's own ready_start flag, in the
  // same way the X/Y sources and the Z sink use theirs.
  assign cntrl_streamer_o.w_stream_source_ctrl.req_start = cntrl_scheduler_i.first_load && flgs_streamer_i.w_stream_source_flags.ready_start;
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.base_addr = reg_file_i.hwpe_params[W_ADDR];
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.tot_len = reg_file_i.hwpe_params[W_TOT_LEN];
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.d0_len = reg_file_i.hwpe_params[W_ITERS][31:16];
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.d0_stride = reg_file_i.hwpe_params[W_D0_STRIDE];
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.d1_len = reg_file_i.hwpe_params[W_ITERS][15:0];
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.d1_stride = JMP;
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.d2_stride = 'd0;
  assign cntrl_streamer_o.w_stream_source_ctrl.addressgen_ctrl.dim_enable_1h = 2'b11;

  // Here we initialize the streamer source signals
  // for the Y stream source (only started when OP_SELECTION[0] is set)
  assign cntrl_streamer_o.y_stream_source_ctrl.req_start = cntrl_scheduler_i.first_load && reg_file_i.hwpe_params[OP_SELECTION][0] && flgs_streamer_i.y_stream_source_flags.ready_start;
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.base_addr = reg_file_i.hwpe_params[Z_ADDR];
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.tot_len = reg_file_i.hwpe_params[Z_TOT_LEN];
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.d0_len = W;
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.d0_stride = reg_file_i.hwpe_params[Z_D0_STRIDE];
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.d1_len = reg_file_i.hwpe_params[W_ITERS][15:0];
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.d1_stride = JMP;
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.d2_stride = reg_file_i.hwpe_params[Z_D2_STRIDE];
  assign cntrl_streamer_o.y_stream_source_ctrl.addressgen_ctrl.dim_enable_1h = 2'b11;

  // Here we initialize the streamer sink signals for
  // the Z stream sink
  assign cntrl_streamer_o.z_stream_sink_ctrl.req_start = cntrl_scheduler_i.first_load && flgs_streamer_i.z_stream_sink_flags.ready_start;
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.base_addr = reg_file_i.hwpe_params[Z_ADDR];
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.tot_len = reg_file_i.hwpe_params[Z_TOT_LEN];
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.d0_len = W;
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.d0_stride = reg_file_i.hwpe_params[Z_D0_STRIDE];
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.d1_len = reg_file_i.hwpe_params[W_ITERS][15:0];
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.d1_stride = JMP;
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.d2_stride = reg_file_i.hwpe_params[Z_D2_STRIDE];
  assign cntrl_streamer_o.z_stream_sink_ctrl.addressgen_ctrl.dim_enable_1h = 2'b11;

  // Cast formats: the input cast goes from OP_SELECTION[15:13] to
  // OP_SELECTION[12:10]; the output cast goes the opposite way.
  assign cntrl_streamer_o.input_cast_src_fmt  = fpnew_pkg::fp_format_e'(reg_file_i.hwpe_params[OP_SELECTION][15:13]);
  assign cntrl_streamer_o.input_cast_dst_fmt  = fpnew_pkg::fp_format_e'(reg_file_i.hwpe_params[OP_SELECTION][12:10]);
  assign cntrl_streamer_o.output_cast_src_fmt = fpnew_pkg::fp_format_e'(reg_file_i.hwpe_params[OP_SELECTION][12:10]);
  assign cntrl_streamer_o.output_cast_dst_fmt = fpnew_pkg::fp_format_e'(reg_file_i.hwpe_params[OP_SELECTION][15:13]);

endmodule : redmule_memory_scheduler
// One row of the RedMulE array: H computing elements (redmule_ce) chained
// through a bank of registers.
//
// PE k receives x_input_i[k] and w_input_i[k] as its first two operands. Its
// third operand is y_bias_i for PE 0 and the registered output of PE k-1 for
// every other PE. The registered output of the last PE drives z_output_o.
// The register bank is cleared by flush_i and updated when reg_enable_i is
// high (flush_i has priority).
module redmule_row
  import fpnew_pkg::*;
#(
  parameter  fpnew_pkg::fp_format_e   FpFormat    = fpnew_pkg::FP16,
  parameter  int unsigned             Height      = 4,  // Number of PEs per row
  parameter  int unsigned             NumPipeRegs = 2,
  parameter  fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::DISTRIBUTED,
  parameter  type                     TagType     = logic,
  parameter  type                     AuxType     = logic,
  localparam int unsigned             BITW        = fpnew_pkg::fp_width(FpFormat), // Number of bits for the given format
  localparam int unsigned             H           = Height
)(
  input  logic                                  clk_i             ,
  input  logic                                  rst_ni            ,
  // Input Elements
  input  logic                  [H-1:0][BITW-1:0] x_input_i       ,
  input  logic                  [H-1:0][BITW-1:0] w_input_i       ,
  input  logic                         [BITW-1:0] y_bias_i        ,
  // Output Result
  output logic                         [BITW-1:0] z_output_o      ,
  // fpnew_fma Input Signals
  input  logic                  [2:0]           fma_is_boxed_i    ,
  input  logic                  [1:0]           noncomp_is_boxed_i,
  input  fpnew_pkg::roundmode_e                 stage1_rnd_i      ,
  input  fpnew_pkg::roundmode_e                 stage2_rnd_i      ,
  input  fpnew_pkg::operation_e                 op1_i             ,
  input  fpnew_pkg::operation_e                 op2_i             ,
  input  logic                                  op_mod_i          ,
  input  TagType                                tag_i             ,
  input  AuxType                                aux_i             ,
  // fpnew_fma Input Handshake
  input  logic                                  in_valid_i        ,
  output logic                  [H-1:0]         in_ready_o        ,
  input  logic                                  reg_enable_i      ,
  input  logic                                  flush_i           ,
  // fpnew_fma Output signals
  output fpnew_pkg::status_t    [H-1:0]         status_o          ,
  output logic                  [H-1:0]         extension_bit_o   ,
  output fpnew_pkg::classmask_e [H-1:0]         class_mask_o      ,
  output logic                  [H-1:0]         is_class_o        ,
  output TagType                [H-1:0]         tag_o             ,
  output AuxType                [H-1:0]         aux_o             ,
  // fpnew_fma Output handshake
  output logic                  [H-1:0]         out_valid_o       ,
  input  logic                                  out_ready_i       ,
  // fpnew_fma Indication of valid data in flight
  output logic                  [H-1:0]         busy_o
);

  // Per-PE operand bundle: entries 0 and 1 are the multiplication operands,
  // entry 2 is the accumulation operand.
  logic [H-1:0][2:0][BITW-1:0] input_operands;
  // y_bias_int and result are declared but not referenced in this module.
  logic [H-1:0]     [BITW-1:0] y_bias_int    ,
                               partial_result;
  logic             [BITW-1:0] result;

  // Registered PE outputs, forming the chain between consecutive PEs
  logic [H-1:0]     [BITW-1:0] output_q;

  // Register bank between the PEs. flush_i wins over reg_enable_i; when
  // neither is asserted the bank keeps its value.
  always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_output_register
    if (~rst_ni) begin
      output_q <= '0;
    end else if (flush_i) begin
      output_q <= '0;
    end else if (reg_enable_i) begin
      output_q <= partial_result;
    end
  end

  // The row result is the registered output of the last PE.
  assign z_output_o = output_q [H-1];

  // Generate PEs
  generate
    for (genvar pe = 0; pe < H; pe++) begin : gen_computing_element
      // Accumulation operand: external bias for the first PE, the registered
      // output of the previous PE for all the others.
      if (pe == 0)
        assign input_operands [pe][2] = y_bias_i;
      else
        assign input_operands [pe][2] = output_q [pe-1];

      assign input_operands [pe][1] = w_input_i [pe];
      assign input_operands [pe][0] = x_input_i [pe];

      redmule_ce #(
        .FpFormat    ( FpFormat    ),
        .NumPipeRegs ( NumPipeRegs ),
        .PipeConfig  ( PipeConfig  ),
        .Stallable   ( 1'b1        )
      ) i_computing_element (
        .clk_i              ( clk_i                  ),
        .rst_ni             ( rst_ni                 ),
        .x_input_i          ( input_operands [pe][0] ),
        .w_input_i          ( input_operands [pe][1] ),
        .y_bias_i           ( input_operands [pe][2] ),
        .fma_is_boxed_i     ( fma_is_boxed_i         ),
        .noncomp_is_boxed_i ( noncomp_is_boxed_i     ),
        .stage1_rnd_i       ( stage1_rnd_i           ),
        .stage2_rnd_i       ( stage2_rnd_i           ),
        .op1_i              ( op1_i                  ),
        .op2_i              ( op2_i                  ),
        .op_mod_i           ( op_mod_i               ),
        .tag_i              ( tag_i                  ),
        .aux_i              ( aux_i                  ),
        .in_valid_i         ( in_valid_i             ),
        .in_ready_o         ( in_ready_o      [pe]   ),
        .reg_enable_i       ( reg_enable_i           ),
        .flush_i            ( flush_i                ),
        .z_output_o         ( partial_result  [pe]   ),
        .status_o           ( status_o        [pe]   ),
        .extension_bit_o    ( extension_bit_o [pe]   ),
        .class_mask_o       ( class_mask_o    [pe]   ),
        .is_class_o         ( is_class_o      [pe]   ),
        .tag_o              ( tag_o           [pe]   ),
        .aux_o              ( aux_o           [pe]   ),
        .out_valid_o        ( out_valid_o     [pe]   ),
        .out_ready_i        ( out_ready_i            ),
        .busy_o             ( busy_o          [pe]   )
      );
    end
  endgenerate

endmodule : redmule_row
// W (weights) buffer wrapper.
//
// Write side: on ctrl_i.load, the incoming DW-bit word is split into D
// elements, zero-padded outside ctrl_i.width / ctrl_i.height, and written into
// row w_row of the storage (redmule_w_buffer_scm). w_row counts the loads and
// restarts after reaching H.
// Read side: on ctrl_i.shift, an element counter (0..N_REGS) and a column
// counter (0..C-1) advance; they are handed to the storage as read address
// together with a fixed per-row address (row h reads row h).
module redmule_w_buffer
  import fpnew_pkg::*;
  import redmule_pkg::*;
#(
  parameter  int unsigned DW       = 288               ,
  parameter  fp_format_e  FpFormat = FP16              ,
  parameter  int unsigned Height   = ARRAY_HEIGHT      , // Number of PEs per row
  parameter  int unsigned N_REGS   = PIPE_REGS         , // Number of registers per PE
  localparam int unsigned BITW     = fp_width(FpFormat), // Number of bits for the given format
  localparam int unsigned H        = Height            ,
  localparam int unsigned D        = DW/BITW
)(
  input  logic                         clk_i     ,
  input  logic                         rst_ni    ,
  input  logic                         clear_i   ,
  input  w_buffer_ctrl_t               ctrl_i    ,
  output w_buffer_flgs_t               flags_o   ,
  output logic       [H-1:0][BITW-1:0] w_buffer_o,
  input  logic              [DW-1:0]   w_buffer_i
);

  localparam int unsigned C         = (D+N_REGS)/(N_REGS+1); // Columns of N_REGS+1 elements (rounded up)
  localparam int unsigned EL_ADDR_W = $clog2(N_REGS+1);
  localparam int unsigned EL_DATA_W = (N_REGS+1)*BITW;

  // Load counter / write row (one extra bit so that it can reach H)
  logic [$clog2(H):0] w_row;

  // Read counters
  logic [EL_ADDR_W-1:0] el_addr_d,  el_addr_q;
  logic [$clog2(C)-1:0] col_addr_d, col_addr_q;
  logic                 el_last, col_last;

  // Zero-padded write data
  logic [D-1:0][BITW-1:0] w_data;

  logic [H-1:0][$clog2(H)-1:0] buffer_r_addr_d, buffer_r_addr_q;
  logic [H-1:0]                buffer_r_addr_valid_d, buffer_r_addr_valid_q;

  logic                 buf_write_en;
  logic [$clog2(H)-1:0] buf_write_addr;

  // ------------------------------- Write side -------------------------------

  // Counts the loads; returns to zero on clear or the cycle after reaching H.
  always_ff @(posedge clk_i or negedge rst_ni) begin : row_load_counter
    if (~rst_ni) begin
      w_row <= '0;
    end else if (clear_i || w_row == H) begin
      w_row <= '0;
    end else if (ctrl_i.load) begin
      w_row <= w_row + 1;
    end
  end

  // Elements beyond the valid width, or rows beyond the valid height, are
  // written as zeros.
  for (genvar d = 0; d < D; d++) begin : gen_zero_padding
    assign w_data[d] = (d < ctrl_i.width && w_row < ctrl_i.height) ? w_buffer_i[d*BITW +: BITW] : '0;
  end

  assign buf_write_addr  = w_row;
  assign buf_write_en    = ctrl_i.load;
  assign flags_o.w_ready = buf_write_en;

  // -------------------------------- Read side -------------------------------

  // Row read addresses are static: entry h always addresses row h. The valid
  // vectors are tied to zero.
  always_comb begin : buffer_r_addr_assignment
    buffer_r_addr_valid_q = '0;
    buffer_r_addr_valid_d = '0;
    buffer_r_addr_q       = '0;
    buffer_r_addr_d       = '0;

    for (int h = 0; h < H; h++) begin
      buffer_r_addr_q[h] = h;
      buffer_r_addr_d[h] = h;
    end
  end

  // The element counter wraps after N_REGS; the column counter advances on
  // that wrap and itself wraps after C-1.
  assign el_last    = (el_addr_q == N_REGS);
  assign col_last   = (col_addr_q == (C-1));
  assign el_addr_d  = el_last ? '0 : el_addr_q + 1;
  assign col_addr_d = el_last ? (col_last ? '0 : col_addr_q + 1) : col_addr_q;

  always_ff @(posedge clk_i or negedge rst_ni) begin : element_counter
    if (~rst_ni) begin
      el_addr_q <= '0;
    end else if (clear_i) begin
      el_addr_q <= '0;
    end else if (ctrl_i.shift) begin
      el_addr_q <= el_addr_d;
    end
  end

  always_ff @(posedge clk_i or negedge rst_ni) begin : section_counter
    if (~rst_ni) begin
      col_addr_q <= '0;
    end else if (clear_i) begin
      col_addr_q <= '0;
    end else if (ctrl_i.shift) begin
      col_addr_q <= col_addr_d;
    end
  end

  // --------------------------------- Storage --------------------------------

  redmule_w_buffer_scm #(
    .WORD_SIZE ( BITW     ),
    .ROWS      ( H        ),
    .COLS      ( C        ),
    .ELMS      ( N_REGS+1 )
  ) i_w_buf (
    .clk_i            ( clk_i           ),
    .rst_ni           ( rst_ni          ),
    .clear_i          ( clear_i         ),
    .write_en_i       ( buf_write_en    ),
    .write_addr_i     ( buf_write_addr  ),
    .wdata_i          ( w_data          ),
    .read_en_i        ( ctrl_i.shift    ),
    .elms_read_addr_i ( el_addr_q       ),
    .cols_read_offs_i ( col_addr_q      ),
    .rows_read_addr_i ( buffer_r_addr_d ),
    .rdata_o          ( w_buffer_o      )
  );

endmodule : redmule_w_buffer
// Storage for the W buffer: ROWS x COLS x ELMS words of WORD_SIZE bits.
//
// Write: one full row (COLS x ELMS words) at write_addr_i when write_en_i is
// high.
// Read: one word per row. The read addresses are sampled on read_en_i; row r
// returns buffer[rows_read_addr[r]][cols_read_addr[r]][elms_read_addr], where
// the column address of row r is the sampled column offset minus r, wrapped
// around when it would go negative.
// USE_LATCHES selects between a latch-based implementation (write data is
// first sampled in wdata_q, rows are written through per-row gated clocks)
// and a flip-flop implementation.
module redmule_w_buffer_scm
  import redmule_pkg::*;
#(
  parameter int unsigned WORD_SIZE   = 32,
  parameter int unsigned ROWS        = 1 ,
  parameter int unsigned COLS        = 1 ,
  parameter int unsigned ELMS        = 1 ,
  parameter int unsigned USE_LATCHES = LATCH_BUFFERS
) (
  input  logic                                           clk_i            ,
  input  logic                                           rst_ni           ,
  input  logic                                           clear_i          ,
  input  logic                                           write_en_i       ,
  input  logic                        [$clog2(ROWS)-1:0] write_addr_i     ,
  input  logic        [COLS-1:0][ELMS-1:0][WORD_SIZE-1:0] wdata_i         ,
  input  logic                                           read_en_i        ,
  input  logic                        [$clog2(ELMS)-1:0] elms_read_addr_i , // The element read address is the same for all entries
  input  logic                        [$clog2(COLS)-1:0] cols_read_offs_i , // We only need the column read address of the first row as the others can be generated
  input  logic              [ROWS-1:0][$clog2(ROWS)-1:0] rows_read_addr_i ,
  output logic                 [ROWS-1:0][WORD_SIZE-1:0] rdata_o
);
  logic [ROWS-1:0][COLS-1:0][ELMS-1:0][WORD_SIZE-1:0] buffer_q; // Storage array
  logic           [COLS-1:0][ELMS-1:0][WORD_SIZE-1:0] wdata_q;  // Sampled write data (latch implementation only)

  // Sampled read addresses
  logic                        [$clog2(ELMS)-1:0] elms_read_addr_q;
  logic                        [$clog2(COLS)-1:0] cols_read_offs_q;
  logic              [ROWS-1:0][$clog2(ROWS)-1:0] rows_read_addr_q;

  // Per-row column read address derived from cols_read_offs_q
  logic [ROWS-1:0][$clog2(COLS)-1:0] cols_read_addr;

  // Per-row gated write clocks (latch implementation only)
  logic [ROWS-1:0] clk_w;

  // Read addresses are registered, so rdata_o reflects the addresses presented
  // in the last cycle in which read_en_i was high.
  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_raddrs
    if(~rst_ni) begin
      elms_read_addr_q <= '0;
      cols_read_offs_q <= '0;
      rows_read_addr_q <= '0;
    end else begin
      if (clear_i) begin
        elms_read_addr_q <= '0;
        cols_read_offs_q <= '0;
        rows_read_addr_q <= '0;
      end else if (read_en_i) begin
        elms_read_addr_q <= elms_read_addr_i;
        cols_read_offs_q <= cols_read_offs_i;
        rows_read_addr_q <= rows_read_addr_i;
      end
    end
  end

  // Row r reads r columns behind the offset of row 0, wrapping around.
  // NOTE(review): the wrap-around adds ROWS although the result indexes the
  // COLS dimension; this is only a modulo-COLS wrap when COLS == ROWS. Confirm
  // that this is intended for every supported configuration.
  for (genvar r = 0; r < ROWS; r++) begin : gen_cols_read_addrs
    assign cols_read_addr[r] = cols_read_offs_q >= r ? cols_read_offs_q - r : ROWS - (r - cols_read_offs_q);
  end

  for (genvar r = 0; r < ROWS; r++) begin : gen_output_assignment
    assign rdata_o[r] = buffer_q[rows_read_addr_q[r]][cols_read_addr[r]][elms_read_addr_q];
  end

  if (USE_LATCHES) begin : gen_latches
    // Write data is registered first, then captured by the latches of the
    // addressed row while that row's gated clock is high.
    always_ff @(posedge clk_i or negedge rst_ni) begin : sample_wdata
      if(~rst_ni) begin
        wdata_q <= '0;
      end else begin
        if (clear_i) begin
          wdata_q <= '0;
        end else if (write_en_i) begin
          wdata_q <= wdata_i;
        end
      end
    end

    // A row clock is enabled when that row is written, or for every row on
    // clear_i (so that the cleared wdata_q is propagated into all rows).
    for (genvar r = 0; r < ROWS; r++) begin : gen_write_clock_gates
      tc_clk_gating i_rows_cg (
        .clk_i     ( clk_i                                   ),
        .en_i      ( write_addr_i == r && write_en_i || clear_i ),
        .test_en_i ( '0                                      ),
        .clk_o     ( clk_w[r]                                )
      );
    end

    for (genvar r = 0; r < ROWS; r++) begin : gen_rows
      for (genvar c = 0; c < COLS; c++) begin : gen_cols
        always_latch begin : wdata
          if (clk_w[r]) begin
            buffer_q[r][c] = wdata_q[c];
          end
        end
      end
    end
  end else begin : gen_flip_flops
    // Flip-flop implementation: rows are written directly from wdata_i.
    for (genvar r = 0; r < ROWS; r++) begin : gen_rows
      for (genvar c = 0; c < COLS; c++) begin : gen_cols
        always_ff @(posedge clk_i or negedge rst_ni) begin : wdata
          if (~rst_ni) begin
            buffer_q[r][c] <= '0;
          end else begin
            if (clear_i) begin
              buffer_q[r][c] <= '0;
            end else if (write_addr_i == r && write_en_i) begin
              buffer_q[r][c] <= wdata_i[c];
            end
          end
        end
      end
    end
  end

endmodule : redmule_w_buffer_scm
< ROWS; r++) begin : gen_rows 99 | for (genvar c = 0; c < COLS; c++) begin : gen_cols 100 | always_ff @(posedge clk_i or negedge rst_ni) begin : wdata 101 | if (~rst_ni) begin 102 | buffer_q[r][c] <= '0; 103 | end else begin 104 | if (clear_i) begin 105 | buffer_q[r][c] <= '0; 106 | end else if (write_addr_i == r && write_en_i) begin 107 | buffer_q[r][c] <= wdata_i[c]; 108 | end 109 | end 110 | end 111 | end 112 | end 113 | end 114 | 115 | endmodule : redmule_w_buffer_scm 116 | -------------------------------------------------------------------------------- /rtl/x_buffer/redmule_x_buffer_scm.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2025 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Andrea Belano 6 | // 7 | 8 | module redmule_x_buffer_scm 9 | import redmule_pkg::*; 10 | #( 11 | parameter int unsigned WORD_SIZE = 32, 12 | parameter int unsigned WIDTH = 1 , 13 | parameter int unsigned HEIGHT = 2 , 14 | parameter int unsigned N_OUTPUTS = 1 , 15 | parameter int unsigned USE_LATCHES = LATCH_BUFFERS 16 | ) ( 17 | input logic clk_i , 18 | input logic rst_ni , 19 | input logic clear_i , 20 | input logic write_en_i , 21 | input logic [$clog2(N_OUTPUTS)+$clog2(HEIGHT)-1:0] write_addr_i , 22 | input logic [WIDTH-1:0][WORD_SIZE-1:0] wdata_i , 23 | input logic read_en_i , 24 | input logic [$clog2(N_OUTPUTS)+$clog2(HEIGHT)-1:0] read_addr_i , 25 | output logic [N_OUTPUTS-1:0][WIDTH-1:0][WORD_SIZE-1:0] rdata_o 26 | ); 27 | logic [HEIGHT-1:0][N_OUTPUTS-1:0][WIDTH-1:0][WORD_SIZE-1:0] buffer_q; 28 | logic [WIDTH-1:0][WORD_SIZE-1:0] wdata_q; 29 | logic [N_OUTPUTS-1:0][$clog2(HEIGHT)-1:0] read_addr_q; 30 | 31 | logic [$clog2(N_OUTPUTS)-1:0] row_w_addr; 32 | logic [$clog2(HEIGHT)-1:0] slot_w_addr; 33 | 34 | logic [HEIGHT-1:0][N_OUTPUTS-1:0] clk_w; 35 | 36 | for (genvar o = 0; o < N_OUTPUTS; o++) begin : 
gen_read_addr_registers 37 | always_ff @(posedge clk_i or negedge rst_ni) begin : sample_raddr 38 | if(~rst_ni) begin 39 | read_addr_q[o] <= '0; 40 | end else begin 41 | if (clear_i) begin 42 | read_addr_q[o] <= '0; 43 | end if (read_en_i && read_addr_i[$clog2(N_OUTPUTS)-1:0] == o) begin 44 | read_addr_q[o] <= read_addr_i[$clog2(N_OUTPUTS)+:$clog2(HEIGHT)]; 45 | end 46 | end 47 | end 48 | end 49 | 50 | for (genvar o = 0; o < N_OUTPUTS; o++) begin : gen_output_assignment 51 | assign rdata_o[o] = buffer_q[read_addr_q[o]][o]; 52 | end 53 | 54 | always_ff @(posedge clk_i or negedge rst_ni) begin : sample_wdata 55 | if(~rst_ni) begin 56 | wdata_q <= '0; 57 | end else begin 58 | if (clear_i) begin 59 | wdata_q <= '0; 60 | end if (write_en_i) begin 61 | wdata_q <= wdata_i; 62 | end 63 | end 64 | end 65 | 66 | assign row_w_addr = write_addr_i[$clog2(N_OUTPUTS)-1:0]; 67 | assign slot_w_addr = write_addr_i[$clog2(N_OUTPUTS)+:$clog2(HEIGHT)]; 68 | 69 | if (USE_LATCHES) begin : gen_latches 70 | for (genvar h = 0; h < HEIGHT; h++) begin : gen_slots_cg 71 | for (genvar o = 0; o < N_OUTPUTS; o++) begin : gen_rows_cg 72 | tc_clk_gating i_row_cg ( 73 | .clk_i ( clk_i ), 74 | .en_i ( row_w_addr == o && slot_w_addr == h && write_en_i || clear_i ), 75 | .test_en_i ( '0 ), 76 | .clk_o ( clk_w[h][o] ) 77 | ); 78 | end 79 | end 80 | 81 | for (genvar h = 0; h < HEIGHT; h++) begin : gen_slots 82 | for (genvar o = 0; o < N_OUTPUTS; o++) begin : gen_rows 83 | always_latch begin : wdata 84 | if (clk_w[h][o]) begin 85 | buffer_q[h][o] = wdata_q; 86 | end 87 | end 88 | end 89 | end 90 | end else begin : gen_flip_flops 91 | for (genvar h = 0; h < HEIGHT; h++) begin : gen_slots 92 | for (genvar o = 0; o < N_OUTPUTS; o++) begin : gen_rows 93 | always_ff @(posedge clk_i or negedge rst_ni) begin : wdata 94 | if (~rst_ni) begin 95 | buffer_q[h][o] <= '0; 96 | end else begin 97 | if (clear_i) begin 98 | buffer_q[h][o] <= '0; 99 | end else if (row_w_addr == o && slot_w_addr == h && write_en_i) begin 
100 | buffer_q[h][o] <= wdata_i; 101 | end 102 | end 103 | end 104 | end 105 | end 106 | end 107 | 108 | endmodule : redmule_x_buffer_scm 109 | -------------------------------------------------------------------------------- /rtl/x_buffer/redmule_x_pad_scm.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2025 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Andrea Belano 6 | // 7 | 8 | module redmule_x_pad_scm 9 | import redmule_pkg::*; 10 | #( 11 | parameter int unsigned WORD_SIZE = 32, 12 | parameter int unsigned ROWS = 1 , 13 | parameter int unsigned COLS = 1 , 14 | parameter int unsigned USE_LATCHES = LATCH_BUFFERS 15 | ) ( 16 | input logic clk_i , 17 | input logic rst_ni , 18 | input logic clear_i , 19 | input logic write_en_i , 20 | input logic [$clog2(ROWS)-1:0] write_addr_i , 21 | input logic [COLS-1:0][WORD_SIZE-1:0] wdata_i , 22 | input logic read_en_i , 23 | input logic [$clog2(COLS)-1:0] read_addr_i , 24 | output logic [ROWS-1:0][WORD_SIZE-1:0] rdata_o 25 | ); 26 | logic [ROWS-1:0][COLS-1:0][WORD_SIZE-1:0] buffer_q; 27 | logic [COLS-1:0][WORD_SIZE-1:0] wdata_q; 28 | logic [$clog2(COLS)-1:0] read_addr_q; 29 | 30 | logic [ROWS-1:0] clk_w; 31 | 32 | always_ff @(posedge clk_i or negedge rst_ni) begin : sample_raddr 33 | if(~rst_ni) begin 34 | read_addr_q <= '0; 35 | end else begin 36 | if (clear_i) begin 37 | read_addr_q <= '0; 38 | end else if (read_en_i) begin 39 | read_addr_q <= read_addr_i; 40 | end 41 | end 42 | end 43 | 44 | for (genvar r = 0; r < ROWS; r++) begin : gen_output_assignment 45 | assign rdata_o[r] = buffer_q[r][read_addr_q]; 46 | end 47 | 48 | always_ff @(posedge clk_i or negedge rst_ni) begin : sample_wdata 49 | if(~rst_ni) begin 50 | wdata_q <= '0; 51 | end else begin 52 | if (clear_i) begin 53 | wdata_q <= '0; 54 | end if (write_en_i) begin 55 | wdata_q <= wdata_i; 56 
| end 57 | end 58 | end 59 | 60 | if (USE_LATCHES) begin : gen_latches 61 | for (genvar r = 0; r < ROWS; r++) begin : gen_write_clock_gates 62 | tc_clk_gating i_rows_cg ( 63 | .clk_i ( clk_i ), 64 | .en_i ( write_addr_i == r && write_en_i || clear_i ), 65 | .test_en_i ( '0 ), 66 | .clk_o ( clk_w[r] ) 67 | ); 68 | end 69 | 70 | for (genvar r = 0; r < ROWS; r++) begin : gen_rows 71 | for (genvar c = 0; c < COLS; c++) begin : gen_cols 72 | always_latch begin : wdata 73 | if (clk_w[r]) begin 74 | buffer_q[r][c] = wdata_q[c]; 75 | end 76 | end 77 | end 78 | end 79 | end else begin : gen_flip_flops 80 | for (genvar r = 0; r < ROWS; r++) begin : gen_rows 81 | for (genvar c = 0; c < COLS; c++) begin : gen_cols 82 | always_ff @(posedge clk_i or negedge rst_ni) begin : wdata 83 | if (~rst_ni) begin 84 | buffer_q[r][c] <= '0; 85 | end else begin 86 | if (clear_i) begin 87 | buffer_q[r][c] <= '0; 88 | end else if (write_addr_i == r && write_en_i) begin 89 | buffer_q[r][c] <= wdata_i[c]; 90 | end 91 | end 92 | end 93 | end 94 | end 95 | end 96 | 97 | endmodule : redmule_x_pad_scm 98 | -------------------------------------------------------------------------------- /rtl/z_buffer/redmule_z_buffer.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
// SPDX-License-Identifier: SHL-0.51
//
// Yvan Tortorella
// Andrea Belano
//

// Z/Y buffer controller wrapped around redmule_z_buffer_scm.
//
// The storage is a D x W array of BITW-bit words (D = DW/BITW, W = Width):
//  - Y bias data (y_buffer_i, DW bits) is written column by column while 'load_en' is set,
//    and read out row by row on ctrl_i.y_push_enable (y_buffer_o).
//  - Z results (z_buffer_i, W words) are written row by row on ctrl_i.fill, and read out
//    column by column while 'store_en' is set and ctrl_i.ready is high (z_buffer_o).
// A three-state FSM (EMPTY / LOADED / PUSHED) tracks which kind of data the array holds.
// reg_enable_i is not referenced inside this module.
module redmule_z_buffer
  import fpnew_pkg::*;
  import redmule_pkg::*;
#(
  parameter int unsigned           DW       = 288,
  parameter fpnew_pkg::fp_format_e FpFormat = fpnew_pkg::FP16,
  parameter int unsigned           Width    = ARRAY_WIDTH,                   // Number of parallel index
  localparam int unsigned          BITW     = fpnew_pkg::fp_width(FpFormat), // Number of bits for the given format
  localparam int unsigned          W        = Width,
  localparam int unsigned          D        = DW/BITW
)(
  input  logic                   clk_i       ,
  input  logic                   rst_ni      ,
  input  logic                   clear_i     ,
  input  logic                   reg_enable_i,
  input  z_buffer_ctrl_t         ctrl_i      ,
  input  logic [W-1:0][BITW-1:0] z_buffer_i  ,
  input  logic [DW-1:0]          y_buffer_i  ,
  output logic [DW-1:0]          z_buffer_o  ,
  output logic [W-1:0][BITW-1:0] y_buffer_o  ,
  output logic [DW/8-1:0]        z_strb_o    ,
  output z_buffer_flgs_t         flags_o
);

  typedef enum logic [1:0] {
    EMPTY,
    LOADED,
    PUSHED
  } redmule_z_state_e;

  redmule_z_state_e current_state, next_state;

  // Synchronous resets of the fill, load and depth counters
  logic rst_fill, rst_w_load, rst_d_count;

  logic [$clog2(D)-1:0] fill_shift, d_index;
  logic [$clog2(W)-1:0] store_shift_d, store_shift_q, w_index;

  logic load_en, store_en;

  // Accepted transfers: a Y column is loaded / a Z column is stored in this cycle
  logic y_load_hs, z_store_hs;

  assign y_load_hs  = load_en && ctrl_i.y_valid;
  assign z_store_hs = store_en && ctrl_i.ready;

  redmule_z_buffer_scm #(
    .WORD_SIZE ( BITW ),
    .ROWS      ( D    ),
    .COLS      ( W    )
  ) i_z_buf (
    .clk_i            ( clk_i                ),
    .rst_ni           ( rst_ni               ),
    .clear_i          ( clear_i              ),
    .row_write_en_i   ( ctrl_i.fill          ),
    .col_write_en_i   ( y_load_hs            ),
    .row_write_addr_i ( fill_shift           ),
    .col_write_addr_i ( w_index              ),
    .row_wdata_i      ( z_buffer_i           ),
    .col_wdata_i      ( y_buffer_i           ),
    .col_read_en_i    ( z_store_hs           ),
    .row_read_en_i    ( ctrl_i.y_push_enable ),
    .col_read_addr_i  ( store_shift_d        ),
    .row_read_addr_i  ( d_index              ),
    .col_rdata_o      ( z_buffer_o           ),
    .row_rdata_o      ( y_buffer_o           )
  );

  assign flags_o.y_ready = y_load_hs;
  assign flags_o.z_valid = z_store_hs;

  always_ff @(posedge clk_i or negedge rst_ni) begin : state_register
    if (~rst_ni) begin
      current_state <= EMPTY;
    end else if (clear_i) begin
      current_state <= EMPTY;
    end else begin
      current_state <= next_state;
    end
  end

  // In every state the last matching condition wins, so a completed fill (-> PUSHED)
  // overrides the other transition of that state.
  always_comb begin : fsm
    next_state = current_state;

    case (current_state)
      EMPTY: begin
        // Last Y column accepted
        if (w_index == ctrl_i.y_width-1 && y_load_hs) begin
          next_state = LOADED;
        end

        // This handles the case where the height of the buffer is 1
        if (fill_shift == ctrl_i.z_height-1 && ctrl_i.fill) begin
          next_state = PUSHED;
        end
      end

      LOADED: begin
        // Last Y row pushed while no fill is in progress
        if (d_index == ctrl_i.y_height-1 && ctrl_i.y_push_enable && ~ctrl_i.fill) begin
          next_state = EMPTY;
        end

        if (fill_shift == ctrl_i.z_height-1 && ctrl_i.fill) begin
          next_state = PUSHED;
        end
      end

      PUSHED: begin
        // Last Z column stored
        if (store_shift_q == ctrl_i.z_width-1 && z_store_hs) begin
          next_state = EMPTY;
        end
      end
    endcase
  end

  // With very small leftovers on K it may happen that the z submatrix is completely stored before the current matrix of biases is fully pushed.
  // Therefore, we have to check that we are not in the process of pushing biases into the array before storing
  assign load_en  = current_state == EMPTY && ~ctrl_i.fill && d_index == '0;
  assign store_en = current_state == PUSHED;

  // Counter to track when the output buffer is full
  always_ff @(posedge clk_i or negedge rst_ni) begin : buffer_fill_counter
    if (~rst_ni) begin
      fill_shift <= '0;
    end else if (rst_fill || clear_i) begin
      fill_shift <= '0;
    end else if (ctrl_i.fill) begin
      fill_shift <= fill_shift + 1;
    end
  end

  // Reset for the fill value
  always_comb begin : fill_shift_rst
    rst_fill = 1'b0;
    if (fill_shift == ctrl_i.z_height-1 && ctrl_i.fill) begin
      rst_fill = 1'b1;
    end
  end

  // Counter to track the number of store rows
  always_ff @(posedge clk_i or negedge rst_ni) begin : stored_rows_counter
    if (~rst_ni) begin
      store_shift_q <= '0;
    end else if (clear_i) begin
      store_shift_q <= '0;
    end else if (z_store_hs) begin
      store_shift_q <= store_shift_d;
    end
  end

  // Next store column, wrapping to zero after the last one
  assign store_shift_d = store_shift_q == ctrl_i.z_width-1 ? '0 : store_shift_q + 1;

  assign flags_o.empty = (store_shift_q == ctrl_i.z_width-1 && z_store_hs) ||
                         (current_state == LOADED && d_index == ctrl_i.y_height-1 && ctrl_i.y_push_enable && ctrl_i.first_load);

  // Counter to track the rows that have to be loaded
  always_ff @(posedge clk_i or negedge rst_ni) begin : row_loaded_counter
    if (~rst_ni) begin
      w_index <= '0;
    end else if (rst_w_load || clear_i) begin
      w_index <= '0;
    end else if (y_load_hs) begin
      w_index <= w_index + 1;
    end
  end

  assign flags_o.loaded = current_state == EMPTY && w_index == ctrl_i.y_width-1 && ctrl_i.y_valid ||
                          current_state == LOADED;

  always_comb begin : reset_y_load_counter
    rst_w_load = 1'b0;
    if (w_index == ctrl_i.y_width-1 && y_load_hs) begin
      rst_w_load = 1'b1;
    end
  end

  // Counter of the Y rows already pushed out
  always_ff @(posedge clk_i or negedge rst_ni) begin : depth_read_counter
    if (~rst_ni) begin
      d_index <= '0;
    end else if (rst_d_count || clear_i) begin
      d_index <= '0;
    end else if (ctrl_i.y_push_enable) begin
      d_index <= d_index + 1;
    end
  end

  // The push of the last Y row resets the depth counter and raises y_pushed
  always_comb begin : reset_depth_counter
    rst_d_count      = 1'b0;
    flags_o.y_pushed = 1'b0;
    if (d_index == ctrl_i.y_height-1 && ctrl_i.y_push_enable) begin
      rst_d_count      = 1'b1;
      flags_o.y_pushed = 1'b1;
    end
  end

  // Byte strobes: BITW/8 strobe bits are set for each of the first ctrl_i.z_height elements
  always_comb begin : z_strb_assignment
    z_strb_o = '0;

    for (int i = 0; i < ctrl_i.z_height; i++) begin
      z_strb_o[i*BITW/8+:BITW/8] = '1;
    end
  end

endmodule : redmule_z_buffer

--------------------------------------------------------------------------------
/rtl/z_buffer/redmule_z_buffer_scm.sv:
--------------------------------------------------------------------------------
// Copyright 2025 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51
//
// Andrea Belano
//

// Standard-cell memory with both row-wise and column-wise access.
//
// The array holds ROWS x COLS words of WORD_SIZE bits.
//  - Row write   : row_write_en_i stores the COLS words of row_wdata_i in row row_write_addr_i.
//  - Column write: col_write_en_i stores the ROWS words of col_wdata_i in column col_write_addr_i.
//                  When both enables are set, the row data is the one selected.
//  - Row read    : row_read_addr_i is sampled on row_read_en_i; row_rdata_o presents that row.
//  - Column read : col_read_addr_i is sampled on col_read_en_i; col_rdata_o presents that column.
//  - clear_i synchronously clears all sampled addresses, sampled write data and the array,
//    and has priority over every read/write enable.
//  - USE_LATCHES selects a latch array with one gated clock per word, otherwise flip-flops.
module redmule_z_buffer_scm
  import redmule_pkg::*;
#(
  parameter int unsigned WORD_SIZE   = 32,
  parameter int unsigned ROWS        = 1 ,
  parameter int unsigned COLS        = 1 ,
  parameter int unsigned USE_LATCHES = LATCH_BUFFERS
) (
  input  logic                            clk_i            ,
  input  logic                            rst_ni           ,
  input  logic                            clear_i          ,
  input  logic                            row_write_en_i   ,
  input  logic                            col_write_en_i   ,
  input  logic [$clog2(ROWS)-1:0]         row_write_addr_i ,
  input  logic [$clog2(COLS)-1:0]         col_write_addr_i ,
  input  logic [COLS-1:0][WORD_SIZE-1:0]  row_wdata_i      ,
  input  logic [ROWS-1:0][WORD_SIZE-1:0]  col_wdata_i      ,
  input  logic                            col_read_en_i    ,
  input  logic                            row_read_en_i    ,
  input  logic [$clog2(COLS)-1:0]         col_read_addr_i  ,
  input  logic [$clog2(ROWS)-1:0]         row_read_addr_i  ,
  output logic [ROWS-1:0][WORD_SIZE-1:0]  col_rdata_o      ,
  output logic [COLS-1:0][WORD_SIZE-1:0]  row_rdata_o
);
  logic [ROWS-1:0][COLS-1:0][WORD_SIZE-1:0] buffer_q;
  logic [COLS-1:0][WORD_SIZE-1:0]           row_wdata_q;
  logic [ROWS-1:0][WORD_SIZE-1:0]           col_wdata_q;
  logic                                     row_write_en_q;
  logic [$clog2(COLS)-1:0]                  col_read_addr_q;
  logic [$clog2(ROWS)-1:0]                  row_read_addr_q;

  // One gated write clock per word (latch implementation only)
  logic [ROWS-1:0][COLS-1:0] clk_w;

  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_col_raddr
    if(~rst_ni) begin
      col_read_addr_q <= '0;
    end else begin
      if (clear_i) begin
        col_read_addr_q <= '0;
      end else if (col_read_en_i) begin
        col_read_addr_q <= col_read_addr_i;
      end
    end
  end

  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_row_raddr
    if(~rst_ni) begin
      row_read_addr_q <= '0;
    end else begin
      if (clear_i) begin
        row_read_addr_q <= '0;
      end else if (row_read_en_i) begin
        row_read_addr_q <= row_read_addr_i;
      end
    end
  end

  for (genvar r = 0; r < ROWS; r++) begin : gen_output_columns_assignment
    assign col_rdata_o[r] = buffer_q[r][col_read_addr_q];
  end

  for (genvar c = 0; c < COLS; c++) begin : gen_output_rows_assignment
    assign row_rdata_o[c] = buffer_q[row_read_addr_q][c];
  end

  // Registered copies of the write data, consumed by the latch array
  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_row_wdata
    if(~rst_ni) begin
      row_wdata_q <= '0;
    end else begin
      if (clear_i) begin
        row_wdata_q <= '0;
      end else if (row_write_en_i) begin
        row_wdata_q <= row_wdata_i;
      end
    end
  end

  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_col_wdata
    if(~rst_ni) begin
      col_wdata_q <= '0;
    end else begin
      if (clear_i) begin
        col_wdata_q <= '0;
      // 'else' added: on clear_i row_write_en_q is cleared, so the latches select
      // col_wdata_q; it must therefore be zero after a clear even when col_write_en_i
      // is asserted in the same cycle (matches sample_row_wdata above)
      end else if (col_write_en_i) begin
        col_wdata_q <= col_wdata_i;
      end
    end
  end

  // Remembers whether the most recently accepted write was a row write; selects the
  // data source of the latch array
  always_ff @(posedge clk_i or negedge rst_ni) begin : sample_row_write_enable
    if(~rst_ni) begin
      row_write_en_q <= '0;
    end else begin
      if (clear_i) begin
        row_write_en_q <= '0;
      end else if (col_write_en_i || row_write_en_i) begin
        row_write_en_q <= row_write_en_i;
      end
    end
  end

  if (USE_LATCHES) begin : gen_latches
    for (genvar r = 0; r < ROWS; r++) begin : gen_write_clock_gates
      for (genvar c = 0; c < COLS; c++) begin : gen_col_write_clock_gates
        tc_clk_gating i_rows_cg (
          .clk_i     ( clk_i                                                                                    ),
          .en_i      ( row_write_addr_i == r && row_write_en_i || col_write_addr_i == c && col_write_en_i || clear_i ),
          .test_en_i ( '0                                                                                       ),
          .clk_o     ( clk_w[r][c]                                                                              )
        );
      end
    end

    for (genvar r = 0; r < ROWS; r++) begin : gen_rows
      for (genvar c = 0; c < COLS; c++) begin : gen_cols
        always_latch begin : wdata
          if (clk_w[r][c]) begin
            buffer_q[r][c] = row_write_en_q ? row_wdata_q[c] : col_wdata_q[r];
          end
        end
      end
    end
  end else begin : gen_flip_flops
    for (genvar r = 0; r < ROWS; r++) begin : gen_rows
      for (genvar c = 0; c < COLS; c++) begin : gen_cols
        always_ff @(posedge clk_i or negedge rst_ni) begin : wdata
          if (~rst_ni) begin
            buffer_q[r][c] <= '0;
          end else begin
            if (clear_i) begin
              buffer_q[r][c] <= '0;
            end else if (row_write_addr_i == r && row_write_en_i || col_write_addr_i == c && col_write_en_i) begin
              buffer_q[r][c] <= row_write_en_i ? row_wdata_i[c] : col_wdata_i[r];
            end
          end
        end
      end
    end
  end

endmodule : redmule_z_buffer_scm

--------------------------------------------------------------------------------
/scripts/parse_s19.py:
--------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | 6 | import sys 7 | 8 | # Values for hex conversion 9 | h2v = {'0': 0, '1': 1, '2': 2, '3': 3, 10 | '4': 4, '5': 5, '6': 6, '7': 7, 11 | '8': 8, '9': 9, 'A': 10, 'B': 11, 12 | 'C': 12, 'D': 13, 'E': 14, 'F': 15} 13 | v2h = {v: k for k, v in h2v.items()} 14 | 15 | def hex2int(h): 16 | """Convert a hexadecimal string to an integer, always big-endian""" 17 | return sum(h2v[c] * (16 ** i) for i, c in enumerate(reversed(h))) 18 | 19 | def int2hex(i): 20 | """Convert a long integer to hexadecimal string, always big-endian, 8 digits""" 21 | h = '' 22 | for n in range(8): 23 | e = 16 ** (7 - n) 24 | if e > i: 25 | h += '0' 26 | else: 27 | d = i // e 28 | h += v2h[d] 29 | i -= d * (16 ** (7 - n)) 30 | return h 31 | 32 | def swap_endian(d): 33 | """Swap endian: reverse the byte order in a hexadecimal string""" 34 | return ''.join(reversed([d[i:i+2] for i in range(0, len(d), 2)])) 35 | 36 | def main(): 37 | m = {} # Memory hash (for 8-bit entries) 38 | m32 = {} # Memory hash (for 32-bit entries) 39 | prev_addr = 0 40 | 41 | # Read from stdin 42 | for line in sys.stdin: 43 | line = line.strip() 44 | if not line or not line.startswith("S3"): # Skip empty lines or non-S3 lines 45 | continue 46 | 47 | line = line[4:].rstrip() # Remove the first 4 chars and trailing whitespace 48 | 49 | addr = line[:8] # Address is the first 8 characters 50 | aint = hex2int(addr) # Address as an integer 51 | data = line[8:10] # Data is the next 2 characters 52 | 53 | m[aint] = data # Store in memory hash 54 | 55 | # Convert 8-bit entries to 32-bit entries 56 | for aint in sorted(m.keys()): 57 | data = m[aint] 58 | 59 | addr = (aint // 4) * 4 # Align address to 4-byte boundary 60 | 61 | if (addr - prev_addr > 4) and (addr % 8 != 0): # Insert padding if needed 62 | m32[addr - 4] = "00000000" 63 | 64 | prev = "00000000" 65 | if addr in m32: 66 | prev = m32[addr] 67 | 68 | byte0, byte1, byte2, byte3 = prev[0:2], prev[2:4], prev[4:6], prev[6:8] 69 | 70 
| # Switch case to place data in the right position 71 | if aint % 4 == 0: 72 | prev = f"{data}{byte1}{byte2}{byte3}" 73 | elif aint % 4 == 1: 74 | prev = f"{byte0}{data}{byte2}{byte3}" 75 | elif aint % 4 == 2: 76 | prev = f"{byte0}{byte1}{data}{byte3}" 77 | elif aint % 4 == 3: 78 | prev = f"{byte0}{byte1}{byte2}{data}" 79 | 80 | m32[addr] = prev 81 | prev_addr = addr 82 | 83 | # Output the 32-bit values in the required format 84 | all_addresses = sorted(m32.keys()) 85 | i = 0 86 | 87 | while i <= len(all_addresses) - 1: 88 | a = all_addresses[i] 89 | if a % 8 == 4: 90 | a -= 4 91 | i += 1 92 | else: 93 | i += 2 94 | 95 | h = int2hex(a) 96 | lo, hi = "00000000", "00000000" 97 | miss = 0 98 | 99 | if a in m32: 100 | lo = swap_endian(m32[a]) 101 | else: 102 | miss += 1 103 | 104 | if a + 4 in m32: 105 | hi = swap_endian(m32[a + 4]) 106 | else: 107 | miss += 1 108 | 109 | if miss <= 1: 110 | print(f"{h}_{hi}{lo}") 111 | 112 | if __name__ == "__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /scripts/regression-list.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Andrea Belano 6 | # Yvan Tortorella 7 | # 8 | 9 | #!/bin/bash 10 | Red="\e[31m" 11 | Green="\e[32m" 12 | EndColor="\e[0m" 13 | Start=1 14 | 15 | if [ -z "$Target" ]; then 16 | echo -e "${Red}Error: no Target defined. Set the Target variable to "vsim" or "verilator" before continue.${EndColor}" 17 | Start=0 18 | fi 19 | 20 | if [ -z "$Gcc" ]; then 21 | echo -e "${Red}Error: no Gcc toolchain defined. 
Set the Gcc variable to your toolchain path before continue.${EndColor}" 22 | Start=0 23 | fi 24 | 25 | if [[ "$Start" -eq 1 ]]; then 26 | 27 | BASE_TIMEOUT=500 28 | 29 | PARAMS=( 30 | 96 96 96 31 | 128 128 128 32 | 48 48 48 33 | 12 16 16 34 | 24 16 16 35 | 48 32 32 36 | 30 32 17 37 | 24 32 1 38 | 31 32 16 39 | 17 32 16 40 | 31 32 31 41 | 17 32 3 42 | 5 32 17 43 | 5 32 3 44 | 36 31 32 45 | 12 31 16 46 | 23 31 31 47 | 24 17 32 48 | 24 20 32 49 | 23 17 33 50 | 23 20 33 51 | 3 11 32 52 | 17 13 16 53 | 17 13 17 54 | ) 55 | 56 | i=0 57 | error=0 58 | 59 | make hw-clean hw-build target=$Target 1>/dev/null 2>&1 60 | 61 | while [[ $i -lt ${#PARAMS[@]} ]] 62 | do 63 | M=${PARAMS[$i]} 64 | N=${PARAMS[$(( $i + 1 ))]} 65 | K=${PARAMS[$(( $i + 2 ))]} 66 | 67 | i=$(( $i + 3 )) 68 | 69 | make golden M=$M N=$N K=$K 1>/dev/null 2>&1 70 | make sw-clean sw-build 1>/dev/null 2>&1 71 | timeout $BASE_TIMEOUT make hw-run target=$Target > $PWD/target/sim/$Target/transcript_${M}_${N}_${K} 72 | if grep -rn "\[TB\] - Success!" "$PWD/target/sim/$Target/transcript_${M}_${N}_${K}" 1>/dev/null 2>&1; then 73 | echo -e "${Green}OK ${EndColor}: M=$M N=$N K=$K" 74 | else 75 | echo -e "${Red}ERROR ${EndColor}: M=$M N=$N K=$K" 76 | ((error++)) 77 | fi 78 | done 79 | 80 | if [ "$error" -gt 0 ]; then 81 | exit 1 82 | fi 83 | 84 | fi 85 | -------------------------------------------------------------------------------- /scripts/s19tomem.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | 6 | import numpy as np 7 | import sys 8 | 9 | # Instructions start at 0x1c00_0000 10 | # Data starts at 0x1c02_0000 11 | # Stack starts at 0x1c05_0000 12 | # We only keep last 2 bytes so memory will be filled with no offset. 13 | # The CPU will also reference it as to not have any offset. 
14 | MEM_START = 0x1c000000 15 | INSTR_SIZE = 0x8000 16 | INSTR_END = MEM_START + INSTR_SIZE 17 | DATA_BASE = MEM_START + 0x10000 18 | DATA_SIZE = 0x30000 19 | DATA_END = DATA_BASE + DATA_SIZE 20 | 21 | INSTR_MEM_SIZE = 32*1024 22 | DATA_MEM_SIZE = 6*8192 23 | 24 | with open(sys.argv[1], "r") as f: 25 | s = f.read() 26 | 27 | if len(sys.argv) >= 4: 28 | instr_txt = sys.argv[2] 29 | data_txt = sys.argv[3] 30 | else: 31 | instr_txt = "stim_instr.txt" 32 | data_txt = "stim_data.txt" 33 | 34 | instr_mem = np.zeros(INSTR_MEM_SIZE, dtype='int') 35 | data_mem = np.zeros(DATA_MEM_SIZE, dtype='int') 36 | 37 | for l in s.split(): 38 | addr = int(l[0:8], 16) 39 | wh = int(l[9:17], 16) 40 | wl = int(l[17:25], 16) 41 | rel_data_addr = addr - DATA_BASE 42 | rel_imem_addr = addr - MEM_START 43 | if addr >= DATA_BASE and addr < DATA_END: 44 | data_mem [int(rel_data_addr / 4)] = wl 45 | data_mem [int(rel_data_addr / 4) + 1] = wh 46 | elif addr >= MEM_START and addr < INSTR_END: 47 | instr_mem[int(rel_imem_addr / 4)] = wl 48 | instr_mem[int(rel_imem_addr / 4) + 1] = wh 49 | 50 | s = "" 51 | for m in instr_mem: 52 | s += "%08x\n" % m 53 | with open(instr_txt, "w") as f: 54 | f.write(s) 55 | 56 | s = "" 57 | for m in data_mem: 58 | s += "%08x\n" % m 59 | with open(data_txt, "w") as f: 60 | f.write(s.rstrip('\n')) 61 | 62 | -------------------------------------------------------------------------------- /scripts/setup-complex.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#
# Environment setup for the RedMulE "complex" configuration (REDMULE_COMPLEX=1).
# It only exports variables and prepends to PATH.
# NOTE(review): presumably meant to be sourced from the repository root, since the
# bender path is derived from $(pwd) - confirm.

# Prepend <current directory>/hw/bender to PATH; BENDER_DIR is only a temporary helper
export BENDER_DIR=$(pwd)/hw/bender
echo "Exporting bender path to $BENDER_DIR"
export PATH=$BENDER_DIR:$PATH
unset BENDER_DIR
echo "Exporting SDK and GCC Toolchain paths"
# Site-specific tool installations
export PATH=/usr/pack/pulpsdk-1.0-kgf/artifactory/pulp-sdk-release/pkg/pulp_riscv_gcc/1.0.16/bin:$PATH
export PATH=/usr/pack/gcc-5.2.0-af/x86_64-rhe6-linux/bin:$PATH
export CXX=g++-13.2.0
export Questa=questa-2023.4
# NOTE(review): Verilator names version 5.020 while VerilatorRoot points to 5.006 - confirm
export Verilator="verilator-5.020 verilator"
export VerilatorRoot=/usr/pack/verilator-5.006-zr/verilator-5.006
# Gcc is exported empty; scripts/regression-list.sh refuses to run while it is empty
export Gcc=
export XLEN=32
export XTEN=im
export REDMULE_COMPLEX=1

--------------------------------------------------------------------------------
/scripts/setup-hwpe.sh:
--------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
# SPDX-License-Identifier: Apache-2.0
#
# Yvan Tortorella
#
# Environment setup for the stand-alone HWPE configuration (REDMULE_COMPLEX=0).
# It only exports variables and prepends to PATH.
# NOTE(review): presumably meant to be sourced from the repository root, since the
# bender path is derived from $(pwd) - confirm.

# Prepend <current directory>/hw/bender to PATH; BENDER_DIR is only a temporary helper
export BENDER_DIR=$(pwd)/hw/bender
echo "Exporting bender path to $BENDER_DIR"
export PATH=$BENDER_DIR:$PATH
unset BENDER_DIR
echo "Exporting SDK and GCC Toolchain paths"
# Site-specific tool installations
export PATH=/usr/pack/pulpsdk-1.0-kgf/artifactory/pulp-sdk-release/pkg/pulp_riscv_gcc/1.0.16/bin:$PATH
export PATH=/usr/pack/gcc-5.2.0-af/x86_64-rhe6-linux/bin:$PATH
export CXX=g++-13.2.0
export Questa=questa-2023.4
# NOTE(review): Verilator names version 5.020 while VerilatorRoot points to 5.006 - confirm
export Verilator="verilator-5.020 verilator"
export VerilatorRoot=/usr/pack/verilator-5.006-zr/verilator-5.006
# Gcc is exported empty; scripts/regression-list.sh refuses to run while it is empty
export Gcc=
export XLEN=32
export XTEN=imc
export REDMULE_COMPLEX=0

--------------------------------------------------------------------------------
/scripts/setup64.sh:
--------------------------------------------------------------------------------
# Copyright 2023 ETH Zurich and University of Bologna.
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | 8 | export BENDER_DIR=$(pwd)/hw/bender 9 | echo "Exporting bender path to $BENDER_DIR" 10 | export PATH=$BENDER_DIR:$PATH 11 | unset BENDER_DIR 12 | echo "Exporting SDK and GCC Toolchain paths" 13 | export PATH=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0/bin:$PATH 14 | export PULP_RISCV_GCC_TOOLCHAIN=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0 15 | export PATH=/usr/pack/gcc-5.2.0-af/x86_64-rhe6-linux/bin:$PATH 16 | export CXX=g++-13.2.0 17 | export Questa=questa-2023.4 18 | export Verilator="verilator-5.020 verilator" 19 | export VerilatorRoot=/usr/pack/verilator-5.006-zr/verilator-5.006 20 | export Gcc= 21 | export XTEN=im_ziscr 22 | export XLEN=64 23 | -------------------------------------------------------------------------------- /scripts/stack_init.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | 6 | import sys 7 | 8 | def generate_null_words(file_path): 9 | num_words = 192 * 1024 10 | hex_word = "00000000" 11 | with open(file_path, "w") as f: 12 | for _ in range(num_words): 13 | f.write(hex_word + "\n") 14 | 15 | if __name__ == "__main__": 16 | if len(sys.argv) != 2: 17 | print("Usage: python generate_null_words.py ") 18 | sys.exit(1) 19 | 20 | file_path = sys.argv[1] 21 | generate_null_words(file_path) 22 | print(f"File scritto con successo in: {file_path}") 23 | -------------------------------------------------------------------------------- /sw/archi_redmule.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | #ifndef __ARCHI_REDMULE_H__ 9 | #define __ARCHI_REDMULE_H__ 10 | 11 | /* 12 | * |========================================================================| 13 | * || || 14 | * ||Control and generic configuration register layout || 15 | * |========================================================================| 16 | * || # reg | offset | bits | bitmask || content || 17 | * ||-------+----------+---------+--------------++-------------------------|| 18 | * || 0 | 0x0000 | 31: 0 | 0xFFFFFFFF || TRIGGER || 19 | * || 1 | 0x0004 | 31: 0 | 0xFFFFFFFF || ACQUIRE || 20 | * || 2 | 0x0008 | 31: 0 | 0xFFFFFFFF || EVT_ENABLE || 21 | * || 3 | 0x000c | 31: 0 | 0xFFFFFFFF || STATUS || 22 | * || 4 | 0x0010 | 31: 0 | 0xFFFFFFFF || RUNNING_JOB || 23 | * || 5 | 0x0014 | 31: 0 | 0xFFFFFFFF || SOFT_CLEAR || 24 | * |========================================================================| 25 | * || || 26 | * ||Job-dependent registers layout || 27 | * |========================================================================| 28 | * || # reg | offset | bits | bitmask || content || 29 | * ||-------+----------+---------+--------------++-------------------------|| 30 | * || 0 | 0x0040 | 31: 0 | 0xFFFFFFFF || X_ADDR || 31 | * ||-------+----------+---------+--------------++-------------------------|| 32 | * || 1 | 0x0044 | 31: 0 | 0xFFFFFFFF || W_ADDR || 33 | * ||-------+----------+---------+--------------++-------------------------|| 34 | * || 2 | 0x0048 | 31: 0 | 0xFFFFFFFF || Z_ADDR || 35 | * ||-------+----------+---------+--------------++-------------------------|| 36 | * || 3 | 0x004C | | || Matrix Config 0 Reg || 37 | * || | | 31:16 | 0xFFFF0000 || K Size (W Columns) || 38 | * || | | 15: 0 | 0x0000FFFF || M Size (X Rows) || 39 | * ||-------+----------+---------+--------------++-------------------------|| 40 | * || 4 | 0x0050 | | || Matrix Config 1 Reg || 41 | * || | | 31:16 | 0xFFFFFFFF || N Size (X 
Cols/W Rows) || 42 | * ||-------+----------+---------+--------------++-------------------------|| 43 | * || 5 | 0x0054 | | || Matrix Arithmetic Reg || 44 | * || | | 12:10 | 0x00001C00 || Operation selection || 45 | * || | | 9: 7 | 0x00000380 || Input/Output format || 46 | * |========================================================================| 47 | * 48 | */ 49 | 50 | #define ARCHI_CL_EVT_ACC0 0 51 | #define ARCHI_CL_EVT_ACC1 1 52 | 53 | // RedMulE architecture 54 | #define ADDR_WIDTH 32 55 | #define DATA_WIDTH 256 56 | #define REDMULE_FMT 16 57 | #define ARRAY_HEIGHT 4 58 | #define PIPE_REGS 3 59 | #define ARRAY_WIDTH 12 /* Superior limit is ARRAY_HEIGHT*PIPE_REGS */ 60 | 61 | // Base address 62 | #define REDMULE_BASE_ADD 0x00100000 63 | 64 | // Commands 65 | #define REDMULE_TRIGGER 0x00 66 | #define REDMULE_ACQUIRE 0x04 67 | #define REDMULE_FINISHED 0x08 68 | #define REDMULE_STATUS 0x0C 69 | #define REDMULE_RUNNING_JOB 0x10 70 | #define REDMULE_SOFT_CLEAR 0x14 71 | 72 | // Registers 73 | #define REDMULE_REG_OFFS 0x40 74 | #define REDMULE_REG_X_PTR 0x00 75 | #define REDMULE_REG_W_PTR 0x04 76 | #define REDMULE_REG_Z_PTR 0x08 77 | #define REDMULE_MCFG0_PTR 0x0C 78 | #define REDMULE_MCFG1_PTR 0x10 79 | #define REDMULE_ARITH_PTR 0x14 80 | 81 | // OPs definition 82 | #define MATMUL 0x0 83 | #define GEMM 0x1 84 | #define ADDMAX 0x2 85 | #define ADDMIN 0x3 86 | #define MULMAX 0x4 87 | #define MULMIN 0x5 88 | #define MAXMIN 0x6 89 | #define MINMAX 0x7 90 | 91 | // GEMM formats 92 | #define Float8 0x0 93 | #define Float16 0x1 94 | #define Float8Alt 0x2 95 | #define Float16Alt 0x3 96 | 97 | // FP Formats encoding 98 | #define FP16 0x2 99 | #define FP8 0x3 100 | #define FP16ALT 0x4 101 | #define FP8ALT 0x5 102 | 103 | #define REDMULE_SUBSYSTEM_REGS_ADDR 0x00200000 104 | #define REDMULE_SUBSYSTEM_RETURN_ADDR REDMULE_SUBSYSTEM_REGS_ADDR + 0x00 105 | #define REDMULE_SUBSYSTEM_EOC_ADDR REDMULE_SUBSYSTEM_REGS_ADDR + 0x04 106 | #define REDMULE_SUBSYSTEM_REDMULE_CLK_GATE_ADDR 
REDMULE_SUBSYSTEM_REGS_ADDR + 0x08 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /sw/hal_redmule.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | #ifndef __HAL_REDMULE_H__ 9 | #define __HAL_REDMULE_H__ 10 | 11 | #include "tensor_dim.h" 12 | 13 | /* LOW-LEVEL HAL */ 14 | #define REDMULE_ADDR_BASE REDMULE_BASE_ADD 15 | #define REDMULE_ADDR_SPACE 0x00000100 16 | 17 | #define HWPE_WRITE(value, offset) *(int *)(REDMULE_ADDR_BASE + offset) = value 18 | #define HWPE_READ(offset) *(int *)(REDMULE_ADDR_BASE + offset) 19 | 20 | static inline void redmule_x_add_set(unsigned int value) { 21 | HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_X_PTR); 22 | } 23 | 24 | static inline void redmule_w_add_set(unsigned int value) { 25 | HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_W_PTR); 26 | } 27 | 28 | static inline void redmule_z_add_set(unsigned int value) { 29 | HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Z_PTR); 30 | } 31 | 32 | static inline void redmule_mcfg_set(uint32_t mcfg0, uint32_t mcfg1) { 33 | HWPE_WRITE(mcfg0, REDMULE_REG_OFFS + REDMULE_MCFG0_PTR); 34 | HWPE_WRITE(mcfg1, REDMULE_REG_OFFS + REDMULE_MCFG1_PTR); 35 | } 36 | 37 | static inline void redmule_arith_set(uint32_t arith) { 38 | HWPE_WRITE(arith, REDMULE_REG_OFFS + REDMULE_ARITH_PTR); 39 | } 40 | 41 | static inline void hwpe_trigger_job() { HWPE_WRITE(0, REDMULE_TRIGGER); } 42 | 43 | static inline int hwpe_acquire_job() { return HWPE_READ(REDMULE_ACQUIRE); } 44 | 45 | static inline unsigned int hwpe_get_status() { return HWPE_READ(REDMULE_STATUS); } 46 | 47 | static inline void hwpe_soft_clear() { 48 | volatile int i; 49 | HWPE_WRITE(0, REDMULE_SOFT_CLEAR); 50 | } 51 | 52 | static inline 
void hwpe_cg_enable() { return; } 53 | 54 | static inline void hwpe_cg_disable() { return; } 55 | /* Packs m/k sizes into mcfg0, n size into mcfg1 and op/format into the arith word, then writes them and the x/w/z pointers through the register setters. */ 56 | void redmule_cfg(unsigned int x, unsigned int w, unsigned int z, uint16_t m_size, uint16_t n_size, 57 | uint16_t k_size, uint8_t gemm_op, uint8_t gemm_fmt) { 58 | 59 | uint32_t mcfg_reg0 = 0; 60 | uint32_t mcfg_reg1 = 0; 61 | uint32_t arith_reg = 0; 62 | /* Widen before shifting: a uint16_t operand is promoted to signed int, and shifting a value >= 0x8000 left by 16 would overflow it. */ 63 | mcfg_reg0 = ((uint32_t)k_size << 16) | ((uint32_t)m_size << 0); 64 | mcfg_reg1 = (uint32_t)n_size << 0; 65 | 66 | arith_reg = ((uint32_t)gemm_op << 10) | ((uint32_t)gemm_fmt << 7); 67 | 68 | redmule_x_add_set((unsigned int)x); 69 | redmule_w_add_set((unsigned int)w); 70 | redmule_z_add_set((unsigned int)z); 71 | redmule_mcfg_set((unsigned int)mcfg_reg0, (unsigned int)mcfg_reg1); 72 | redmule_arith_set((unsigned int)arith_reg); 73 | } 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /sw/kernel/crt0.S: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | .section .text 9 | .global _start 10 | _start: 11 | 12 | # Cluster PEs will also starts here to avoid aligning another entry point 13 | # Just re-route them to the right entry 14 | csrr a0, mhartid 15 | andi a1, a0, 0x1f 16 | srli a0, a0, 5 17 | # Enabling CV32E40P mstatus.MIE 18 | li t0, 0x1 19 | csrrs zero, mstatus, t0 20 | # Enabling CV32E40P SW interrupt (mie[3]) 21 | li t0, 0x8 22 | csrrs zero, mie, t0 23 | 24 | # clear the bss segment 25 | la t0, _bss_start 26 | la t1, _bss_end 27 | 1: 28 | sw zero, 0(t0) 29 | addi t0, t0, 4 30 | bltu t0, t1, 1b 31 | 32 | /* Stack initialization */ 33 | la x2, stack 34 | 35 | .section .text 36 | 37 | // On all other chips we simply pass 0. 38 | addi a0, x0, 0 39 | addi a1, x0, 0 40 | 41 | // Jump to main program entry point (argc = a0, argv = a1). 
42 | la t2, main 43 | jalr x1, t2 44 | mv s0, a0 45 | 46 | /* If program returns from main, call exit routine */ 47 | mv a0, s0 48 | wfi 49 | 50 | .global _init 51 | .global _fini 52 | _init: 53 | _fini: 54 | # These don't have to do anything since we use init_array/fini_array. 55 | ret 56 | 57 | .section .vectors, "ax" 58 | .option norvc; 59 | 60 | .org 0x80 61 | jal x0, _start 62 | 63 | -------------------------------------------------------------------------------- /sw/kernel/link.ld: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 ETH Zurich and University of Bologna. */ 2 | /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ 3 | /* SPDX-License-Identifier: Apache-2.0 */ 4 | 5 | /* Yvan Tortorella */ 6 | 7 | SEARCH_DIR(.) 8 | __DYNAMIC = 0; 9 | 10 | MEMORY 11 | { 12 | instrram : ORIGIN = 0x1c000000, LENGTH = 0x08000 13 | dataram : ORIGIN = 0x1c010000, LENGTH = 0x30000 14 | stack : ORIGIN = 0x1c040000, LENGTH = 0x30000 15 | } 16 | 17 | /* Stack information variables */ 18 | _min_stack = 0x1000; /* 4K - minimum stack space to reserve */ 19 | _stack_len = LENGTH(stack); 20 | _stack_start = ORIGIN(stack) + LENGTH(stack); 21 | 22 | /* We have to align each sector to word boundaries as our current s19->slm 23 | * conversion scripts are not able to handle non-word aligned sections. */ 24 | 25 | SECTIONS 26 | { 27 | .vectors : 28 | { 29 | . = ALIGN(4); 30 | KEEP(*(.vectors)) 31 | } > instrram 32 | 33 | .text : { 34 | . 
= ALIGN(4); 35 | _stext = .; 36 | *(.text) 37 | _etext = .; 38 | __CTOR_LIST__ = .; 39 | LONG((__CTOR_END__ - __CTOR_LIST__) / 4 - 2) 40 | *(.ctors) 41 | LONG(0) 42 | __CTOR_END__ = .; 43 | __DTOR_LIST__ = .; 44 | LONG((__DTOR_END__ - __DTOR_LIST__) / 4 - 2) 45 | *(.dtors) 46 | LONG(0) 47 | __DTOR_END__ = .; 48 | *(.lit) 49 | *(.shdata) 50 | _endtext = .; 51 | } > instrram 52 | 53 | /*--------------------------------------------------------------------*/ 54 | /* Global constructor/destructor segment */ 55 | /*--------------------------------------------------------------------*/ 56 | 57 | .preinit_array : 58 | { 59 | PROVIDE_HIDDEN (__preinit_array_start = .); 60 | KEEP (*(.preinit_array)) 61 | PROVIDE_HIDDEN (__preinit_array_end = .); 62 | } > dataram 63 | 64 | .init_array : 65 | { 66 | PROVIDE_HIDDEN (__init_array_start = .); 67 | KEEP (*(SORT(.init_array.*))) 68 | KEEP (*(.init_array )) 69 | PROVIDE_HIDDEN (__init_array_end = .); 70 | } > dataram 71 | 72 | .fini_array : 73 | { 74 | PROVIDE_HIDDEN (__fini_array_start = .); 75 | KEEP (*(SORT(.fini_array.*))) 76 | KEEP (*(.fini_array )) 77 | PROVIDE_HIDDEN (__fini_array_end = .); 78 | } > dataram 79 | 80 | .rodata : { 81 | . = ALIGN(4); 82 | *(.rodata); 83 | *(.rodata.*) 84 | } > dataram 85 | 86 | .shbss : 87 | { 88 | . = ALIGN(4); 89 | *(.shbss) 90 | } > dataram 91 | 92 | .data : { 93 | . = ALIGN(4); 94 | sdata = .; 95 | _sdata = .; 96 | *(.data); 97 | *(.data.*) 98 | edata = .; 99 | _edata = .; 100 | } > dataram 101 | 102 | .bss : 103 | { 104 | . = ALIGN(4); 105 | _bss_start = .; 106 | *(.bss) 107 | *(.bss.*) 108 | *(.sbss) 109 | *(.sbss.*) 110 | *(COMMON) 111 | _bss_end = .; 112 | } > dataram 113 | 114 | /* ensure there is enough room for stack */ 115 | .stack (NOLOAD): { 116 | . = ALIGN(4); 117 | . = . + _min_stack ; 118 | . = ALIGN(4); 119 | stack = . ; 120 | _stack = . 
; 121 | } > stack 122 | 123 | } 124 | 125 | -------------------------------------------------------------------------------- /sw/redmule.c: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | #include 9 | #include "redmule_utils.h" 10 | #include "archi_redmule.h" 11 | #include "hal_redmule.h" 12 | 13 | #include "x_input.h" 14 | #include "w_input.h" 15 | #include "y_input.h" 16 | #include "z_output.h" 17 | #include "golden.h" 18 | 19 | int main() { 20 | 21 | uint16_t m_size = M_SIZE; 22 | uint16_t n_size = N_SIZE; 23 | uint16_t k_size = K_SIZE; 24 | 25 | uint8_t *x = x_inp; 26 | uint8_t *w = w_inp; 27 | uint8_t *y = y_inp; 28 | uint8_t *z = z_oup; // golden_out //1c010000 29 | 30 | volatile int errors = 0; 31 | 32 | #ifdef COMPLEX_OFFLOADER 33 | 34 | uint32_t x_addr = *(uint32_t *)&x; 35 | uint32_t w_addr = *(uint32_t *)&w; 36 | uint32_t y_addr = *(uint32_t *)&y; 37 | uint32_t cfg_reg0 = ((k_size << 16) | (m_size << 0)); 38 | uint32_t cfg_reg1 = (n_size << 0); 39 | asm volatile("addi t0, %0, 0" ::"r"(x_addr)); 40 | asm volatile("addi t1, %0, 0" ::"r"(w_addr)); 41 | asm volatile("addi t2, %0, 0" ::"r"(y_addr)); 42 | asm volatile("addi t3, %0, 0" ::"r"(cfg_reg0)); 43 | asm volatile("addi t4, %0, 0" ::"r"(cfg_reg1)); 44 | 45 | /* mcnfig instruction */ 46 | // asm volatile( 47 | // ".word (0x0 << 25) | \ /* Empty */ 48 | // (0b11101 << 20) | \ /* Rs2 */ 49 | // (0b11100 << 15) | \ /* Rs1 */ 50 | // (0x00 << 7) | \ /* Empty */ 51 | // (0b0001011 << 0) \n"); /* OpCode */ 52 | 53 | asm volatile(".word (0x0 << 25) | \ 54 | (0b11101 << 20) | \ 55 | (0b11100 << 15) | \ 56 | (0x00 << 7) | \ 57 | (0b0001011 << 0) \n"); 58 | /* marith instruction */ 59 | // sm volatile( 60 | // ".word (0b00111 << 27) | \ /* Rs3 */ 61 | // 
(0b00 << 25) | \ /* Empty*/ 62 | // (0b00110 << 20) | \ /* Rs2 */ 63 | // (0b00101 << 15) | \ /* Rs1 */ 64 | // (0b0 << 14) | \ /* Custom format enable/disable */ 65 | // (0b0 << 13) | \ /* Widening enable/disable */ 66 | // (0b001 << 10) | \ /* Operation selection */ 67 | // (0b001 << 7) | \ /* Data format */ 68 | // (0b0101011 << 0) \n"); /* OpCode */ 69 | 70 | asm volatile(".word (0b00111 << 27) | \ 71 | (0b00 << 25) | \ 72 | (0b00110 << 20) | \ 73 | (0b00101 << 15) | \ 74 | (0b0 << 14) | \ 75 | (0b0 << 13) | \ 76 | (0b001 << 10) | \ 77 | (0b001 << 7) | \ 78 | (0b0101011 << 0) \n"); 79 | 80 | asm volatile("wfi" ::: "memory"); 81 | 82 | errors = redmule16_compare_int(y, golden, m_size * k_size / 2); 83 | 84 | #else // COMPLEX_OFFLOADER not defined 85 | 86 | uint8_t float_fmt = (SRC_FMT == FP8) ? (uint8_t)Float8 87 | : (SRC_FMT == FP8ALT) ? (uint8_t)Float8Alt 88 | : (SRC_FMT == FP16) ? (uint8_t)Float16 89 | : (SRC_FMT == FP16ALT) ? (uint8_t)Float16Alt 90 | : (uint8_t)Float16; 91 | 92 | int gold_sum = 0, check_sum = 0; 93 | int i, j; 94 | 95 | int offload_id_tmp, offload_id; 96 | 97 | // Enable RedMulE 98 | hwpe_cg_enable(); 99 | 100 | hwpe_soft_clear(); 101 | 102 | while ((offload_id_tmp = hwpe_acquire_job()) < 0) 103 | ; 104 | 105 | redmule_cfg((unsigned int)x, (unsigned int)w, (unsigned int)y, m_size, n_size, k_size, 106 | (uint8_t)gemm_ops, float_fmt); 107 | 108 | // Start RedMulE operation and sleeping until the end of computation 109 | printf("Triggering accelerator and going to sleep...\n"); 110 | hwpe_trigger_job(); 111 | 112 | asm volatile("wfi" ::: "memory"); 113 | 114 | // At the end of accelerator's computation, we resume and check on results 115 | printf("Resumed!\n"); 116 | 117 | // Disable RedMulE 118 | hwpe_cg_disable(); 119 | 120 | if (float_fmt == Float16 || float_fmt == Float16Alt) 121 | errors = redmule16_compare_int(y, golden, m_size * k_size / 2); 122 | else if (float_fmt == Float8 || float_fmt == Float8Alt) 123 | errors = 
redmule8_compare_int(y, golden, m_size * k_size / 4); 124 | 125 | #endif // #ifdef COMPLEX_OFFLOADER 126 | 127 | *(int *)0x80000000 = errors; 128 | 129 | tfp_printf("Terminated test with %d errors. See you!\n", errors); 130 | 131 | return errors; 132 | } 133 | -------------------------------------------------------------------------------- /sw/utils/redmule_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | #include "tinyprintf.h" 9 | 10 | #ifndef REDMULE_UTILS_H 11 | #define REDMULE_UTILS_H 12 | 13 | #define ERR 0x0011 14 | // Returns how many of the first len 32-bit words have a 16-bit half whose absolute difference between actual_z and golden_z exceeds ERR. 15 | int redmule16_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { 16 | uint32_t actual_word = 0; 17 | uint16_t actual_MSHWord, actual_LSHWord; 18 | uint32_t golden_word = 0; 19 | uint16_t golden_MSHWord, golden_LSHWord; 20 | uint32_t actual = 0; 21 | uint32_t golden = 0; 22 | 23 | int errors = 0; 24 | int error; 25 | 26 | for (int i = 0; i < len; i++) { 27 | error = 0; 28 | actual_word = *(actual_z + i); 29 | golden_word = *(golden_z + i); 30 | 31 | // int error = ((actual_word ^ golden_word) & ~IGNORE_BITS_COMPARE) ? 1 : 0; 32 | uint16_t diff = 0; 33 | 34 | // Checking Least Significant Half-Word 35 | actual_LSHWord = (uint16_t)(actual_word & 0x0000FFFF); 36 | golden_LSHWord = (uint16_t)(golden_word & 0x0000FFFF); 37 | 38 | diff = (actual_LSHWord > golden_LSHWord) ? (actual_LSHWord - golden_LSHWord) 39 | : (actual_LSHWord < golden_LSHWord) ? 
(golden_LSHWord - actual_LSHWord) 40 | : 0; 41 | 42 | if (diff > ERR) { 43 | error = 1; 44 | #ifdef VERBOSE 45 | tfp_printf("diff: 0x%08x\n", diff); 46 | tfp_printf("LSW: Error!\n"); 47 | #endif 48 | } 49 | 50 | // Checking Most Significant Half-Word 51 | actual_MSHWord = (uint16_t)((actual_word >> 16) & 0x0000FFFF); 52 | golden_MSHWord = (uint16_t)((golden_word >> 16) & 0x0000FFFF); 53 | 54 | diff = (actual_MSHWord > golden_MSHWord) ? (actual_MSHWord - golden_MSHWord) 55 | : (actual_MSHWord < golden_MSHWord) ? (golden_MSHWord - actual_MSHWord) 56 | : 0; 57 | 58 | if (diff > ERR) { 59 | error = 1; 60 | #ifdef VERBOSE 61 | tfp_printf("diff: 0x%08x\n", diff); 62 | tfp_printf("MSW: Error!\n"); 63 | #endif 64 | } 65 | 66 | errors += error; 67 | 68 | #ifdef DEBUG 69 | tfp_printf("Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); 70 | #endif 71 | 72 | #ifdef VERBOSE 73 | if (error) { 74 | if (errors == 1) tfp_printf(" golden <- actual @ address @ index\n"); 75 | tfp_printf("0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z + i), 76 | i * 4); 77 | } 78 | #endif 79 | } 80 | return errors; 81 | } 82 | // Returns how many of the first len 32-bit words have a byte whose absolute difference between actual_z and golden_z exceeds ERR. 83 | int redmule8_compare_int(uint32_t *actual_z, uint32_t *golden_z, int len) { 84 | uint32_t actual_word = 0; 85 | uint8_t actual_Byte0, actual_Byte1, actual_Byte2, actual_Byte3; 86 | uint32_t golden_word = 0; 87 | uint8_t golden_Byte0, golden_Byte1, golden_Byte2, golden_Byte3; 88 | uint32_t actual = 0; 89 | uint32_t golden = 0; 90 | 91 | int errors = 0; 92 | int error; 93 | 94 | for (int i = 0; i < len; i++) { 95 | error = 0; 96 | actual_word = *(actual_z + i); 97 | golden_word = *(golden_z + i); 98 | 99 | // int error = ((actual_word ^ golden_word) & ~IGNORE_BITS_COMPARE) ? 1 : 0; 100 | uint8_t diff = 0; 101 | 102 | // Checking Byte0 103 | actual_Byte0 = (uint8_t)(actual_word & 0x000000FF); 104 | golden_Byte0 = (uint8_t)(golden_word & 0x000000FF); 105 | 106 | diff = (actual_Byte0 > golden_Byte0) ? 
(actual_Byte0 - golden_Byte0) 107 | : (actual_Byte0 < golden_Byte0) ? (golden_Byte0 - actual_Byte0) 108 | : 0; 109 | 110 | if (diff > ERR) { 111 | error = 1; 112 | #ifdef VERBOSE 113 | tfp_printf("diff: 0x%08x\n", diff); 114 | tfp_printf("Byte0: Error!\n"); 115 | #endif 116 | } 117 | 118 | // Checking Byte1 119 | actual_Byte1 = (uint8_t)((actual_word >> 8) & 0x000000FF); 120 | golden_Byte1 = (uint8_t)((golden_word >> 8) & 0x000000FF); 121 | 122 | diff = (actual_Byte1 > golden_Byte1) ? (actual_Byte1 - golden_Byte1) 123 | : (actual_Byte1 < golden_Byte1) ? (golden_Byte1 - actual_Byte1) 124 | : 0; 125 | 126 | if (diff > ERR) { 127 | error = 1; 128 | #ifdef VERBOSE 129 | tfp_printf("diff: 0x%08x\n", diff); 130 | tfp_printf("Byte1: Error!\n"); 131 | #endif 132 | } 133 | 134 | // Checking Byte2 135 | actual_Byte2 = (uint8_t)((actual_word >> 16) & 0x000000FF); 136 | golden_Byte2 = (uint8_t)((golden_word >> 16) & 0x000000FF); 137 | 138 | diff = (actual_Byte2 > golden_Byte2) ? (actual_Byte2 - golden_Byte2) 139 | : (actual_Byte2 < golden_Byte2) ? (golden_Byte2 - actual_Byte2) 140 | : 0; 141 | 142 | if (diff > ERR) { 143 | error = 1; 144 | #ifdef VERBOSE 145 | tfp_printf("diff: 0x%08x\n", diff); 146 | tfp_printf("Byte2: Error!\n"); 147 | #endif 148 | } 149 | 150 | // Checking Byte3 151 | actual_Byte3 = (uint8_t)((actual_word >> 24) & 0x000000FF); 152 | golden_Byte3 = (uint8_t)((golden_word >> 24) & 0x000000FF); 153 | 154 | diff = (actual_Byte3 > golden_Byte3) ? (actual_Byte3 - golden_Byte3) 155 | : (actual_Byte3 < golden_Byte3) ? 
(golden_Byte3 - actual_Byte3) 156 | : 0; 157 | 158 | if (diff > ERR) { 159 | error = 1; 160 | #ifdef VERBOSE 161 | tfp_printf("diff: 0x%08x\n", diff); 162 | tfp_printf("Byte3: Error!\n"); 163 | #endif 164 | } 165 | 166 | errors += error; 167 | 168 | #ifdef DEBUG 169 | tfp_printf("Golden: 0x%08x; Actual: 0x%08x,\n", golden_word, actual_word); 170 | #endif 171 | 172 | #ifdef VERBOSE 173 | if (error) { 174 | if (errors == 1) tfp_printf(" golden <- actual @ address @ index\n"); 175 | tfp_printf(" 0x%08x <- 0x%08x @ 0x%08x @ 0x%08x\n", golden_word, actual_word, (actual_z + i), 176 | i * 4); 177 | } 178 | #endif 179 | } 180 | return errors; 181 | } 182 | 183 | #endif 184 | -------------------------------------------------------------------------------- /target/sim/src/redmule_tb.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | // SPDX-License-Identifier: Apache-2.0 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | #define Stringify(x) #x 9 | #define ToString(x) Stringify(x) 10 | 11 | #define ConcatenatePrim(a, b) a##b 12 | #define Concatenate(a, b) ConcatenatePrim(a, b) 13 | 14 | #ifndef TbName 15 | #error "TbName must be set to the name of the toplevel." 
16 | #else 17 | #pragma message("TbName is set to: " ToString(TbName)) 18 | #endif 19 | 20 | #define TbHeader TbName.h 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include ToString(TbHeader) 34 | 35 | // Path to the waveform dump. Kept unquoted (like the -DWafeformPath=... override) because Waveforms stringifies it; a quoted default would put literal quote characters in the file name. 36 | #ifndef WafeformPath 37 | #define WafeformPath ./target/sim/verilator/redmule.vcd 38 | #else 39 | #pragma message("Wave dump is set to: " ToString(WafeformPath)) 40 | #endif 41 | #define Waveforms ToString(WafeformPath) 42 | 43 | vluint64_t sim_time = 0; 44 | 45 | void dut_reset(Vredmule_tb *dut, vluint64_t &sim_time, vluint64_t rst_time, vluint64_t rst_cycles) { 46 | dut->rst_ni = 0; 47 | if (sim_time > rst_time && sim_time < rst_time + rst_cycles) dut->rst_ni = 1; 48 | 49 | if (sim_time > rst_time + rst_cycles && sim_time < rst_time + 2 * rst_cycles) dut->rst_ni = 0; 50 | 51 | if (sim_time > rst_time + 2 * rst_cycles) dut->rst_ni = 1; 52 | } 53 | 54 | void dut_set_fetch_en(Vredmule_tb *dut, vluint64_t &sim_time, bool value) { 55 | dut->fetch_enable_i = 0; 56 | if (sim_time > 100) { 57 | dut->fetch_enable_i = value; 58 | } 59 | } 60 | 61 | int main(int argc, char **argv, char **env) { 62 | // Random values used to initialize signals 63 | Verilated::commandArgs(argc, argv); 64 | Vredmule_tb *dut = new Vredmule_tb; 65 | 66 | Verilated::traceEverOn(true); 67 | VerilatedVcdC *m_trace = new VerilatedVcdC; 68 | dut->trace(m_trace, 5); 69 | m_trace->open(Waveforms); 70 | 71 | while (!Verilated::gotFinish()) { 72 | // Reset DUT 73 | dut_reset(dut, sim_time, 20, 10); 74 | // Start clock toggling 75 | dut->clk_i ^= 1; 76 | // Set fetch enable to core 77 | dut_set_fetch_en(dut, sim_time, 1); 78 | dut->eval(); 79 | m_trace->dump(sim_time); 80 | sim_time++; 81 | } 82 | 83 | m_trace->close(); 84 | delete dut; 85 | exit(EXIT_SUCCESS); 86 | } 87 | -------------------------------------------------------------------------------- 
/target/sim/src/redmule_tb_wrap.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Yvan Tortorella 6 | // 7 | 8 | timeunit 1ps; timeprecision 1ps; 9 | 10 | module redmule_tb_wrap; 11 | import redmule_pkg::*; 12 | 13 | localparam TCP = 1.0ns; // clock period, 1 GHz clock 14 | localparam TA = 0.2ns; // application time 15 | localparam TT = 0.8ns; // test time 16 | parameter logic UseXif = 1'b0; // Whether to use CV32E40P/X (passed through the Makefile) 17 | parameter int unsigned PROB_STALL = 0; // Dummy memories stall probability (passed through the Makefile) 18 | 19 | logic clk, rst_n, fetch_enable; 20 | 21 | redmule_tb #( 22 | .TCP ( TCP ), 23 | .TA ( TA ), 24 | .TT ( TT ), 25 | .UseXif ( UseXif ), 26 | .PROB_STALL ( PROB_STALL ) 27 | ) i_redmule_tb ( 28 | .clk_i ( clk ), 29 | .rst_ni ( rst_n ), 30 | .fetch_enable_i ( fetch_enable ) 31 | ); 32 | 33 | // Performs one entire clock cycle. 34 | task cycle; 35 | clk <= #(TCP/2) 0; 36 | clk <= #TCP 1; 37 | #TCP; 38 | endtask 39 | // Stimulus: reset held low for 20 cycles, released for 10, asserted again for 10, then released; fetch_enable rises 100 clock periods later and the clock runs forever. 40 | initial begin 41 | clk <= 1'b0; 42 | rst_n <= 1'b0; 43 | fetch_enable <= 1'b0; 44 | 45 | for (int i = 0; i < 20; i++) cycle(); 46 | 47 | rst_n <= #TA 1'b1; 48 | 49 | for (int i = 0; i < 10; i++) cycle(); 50 | 51 | rst_n <= #TA 1'b0; 52 | 53 | for (int i = 0; i < 10; i++) cycle(); 54 | 55 | rst_n <= #TA 1'b1; 56 | 57 | #(100*TCP); 58 | fetch_enable = 1'b1; 59 | 60 | while(1) cycle(); 61 | 62 | end 63 | 64 | endmodule // redmule_tb_wrap 65 | -------------------------------------------------------------------------------- /target/sim/src/tb_dummy_memory.sv: -------------------------------------------------------------------------------- 1 | // Copyright 2023 ETH Zurich and University of Bologna. 2 | // Solderpad Hardware License, Version 0.51, see LICENSE for details. 
3 | // SPDX-License-Identifier: SHL-0.51 4 | // 5 | // Francesco Conti 6 | // 7 | 8 | timeunit 1ps; timeprecision 1ps; 9 | 10 | module tb_dummy_memory 11 | #( 12 | parameter MP = 1, 13 | parameter MEMORY_SIZE = 1024, 14 | parameter BASE_ADDR = 0, 15 | parameter PROB_STALL = 0.0, 16 | parameter time TCP = 1.0, // clock period, 1GHz clock 17 | parameter time TA = 0.2, // application time 18 | parameter time TT = 0.8 // test time 19 | ) ( 20 | input logic clk_i, 21 | input logic rst_ni, 22 | input logic clk_delayed_i, 23 | input logic randomize_i, 24 | input logic enable_i, 25 | input logic stallable_i, 26 | hwpe_stream_intf_tcdm.slave tcdm [MP-1:0] 27 | ); 28 | 29 | logic [31:0] memory [MEMORY_SIZE]; 30 | int cnt = 0; 31 | 32 | int cnt_req [MP-1:0]; 33 | int cnt_rval [MP-1:0]; 34 | int cnt_rd [MP-1:0]; 35 | int cnt_wr [MP-1:0]; 36 | 37 | logic [MP-1:0] tcdm_req; 38 | logic [MP-1:0] tcdm_gnt; 39 | logic [MP-1:0][31:0] tcdm_add; 40 | logic [MP-1:0] tcdm_wen; 41 | logic [MP-1:0][3:0] tcdm_be; 42 | logic [MP-1:0][31:0] tcdm_data; 43 | logic [MP-1:0][31:0] tcdm_r_data; 44 | logic [MP-1:0] tcdm_r_valid; 45 | logic [MP-1:0][31:0] tcdm_r_data_int; 46 | logic [MP-1:0] tcdm_r_valid_int; 47 | 48 | real probs [MP-1:0]; 49 | 50 | logic clk_delayed; 51 | 52 | always_ff @(posedge clk_i) 53 | begin : probs_proc 54 | for (int i=0; i> 2][(j+1)*8-1:j*8]; 95 | if(tcdm_be[i][j]) 96 | write_data[i][(j+1)*8-1:j*8] = tcdm_data[i][(j+1)*8-1:j*8]; 97 | end 98 | endgenerate 99 | 100 | always_ff @(posedge clk_i or negedge rst_ni) begin : dummy_proc 101 | if (~rst_ni) begin 102 | tcdm_r_data_int <= '0; 103 | tcdm_r_valid_int <= '0; 104 | end else begin 105 | for (int i=0; i> 2]; 114 | tcdm_r_valid_int [i] <= tcdm_gnt[i]; 115 | end 116 | // write 117 | else if (tcdm_gnt[i] & ~tcdm_wen[i]) begin 118 | memory[(tcdm_add[i]-BASE_ADDR) >> 2] <= write_data [i]; 119 | tcdm_r_data_int [i] <= write_data [i]; 120 | tcdm_r_valid_int [i] <= 1'b1; 121 | end 122 | // no-grant 123 | else if (~tcdm_gnt[i]) begin 
124 | tcdm_r_data_int [i] <= '0; 125 | tcdm_r_valid_int [i] <= 1'b0; 126 | end 127 | else begin 128 | tcdm_r_data_int [i] <= '0; 129 | tcdm_r_valid_int [i] <= 1'b0; 130 | end 131 | end 132 | end 133 | end 134 | end 135 | 136 | assign tcdm_r_data = tcdm_r_data_int; 137 | assign tcdm_r_valid = tcdm_r_valid_int; 138 | 139 | generate 140 | 141 | for(genvar ii=0; ii 6 | # 7 | # Makefragment for Verilator simulation. 8 | 9 | Verilator ?= $(VerilatorInstallDir)/bin/verilator 10 | GtkWave ?= gtkwave 11 | VerilatorRoot ?= $(VerilatorInstallDir)/share/verilator 12 | Module := redmule 13 | ObjDirName := obj_dir 14 | Vmodule := V$(Module) 15 | VerilatorDir := $(SimDir)/$(target) 16 | VerilatorSrc := $(SimDir)/src 17 | VerilatorObjDir := $(VerilatorPath)/$(ObjDirName) 18 | VerilatorAbsObjDir := $(VerilatorDir)/$(ObjDirName) 19 | VerilatorCompileScript := $(VerilatorDir)/compile.$(target).tcl 20 | VerilatorWaves := $(VerilatorDir)/redmule.vcd 21 | 22 | hw-clean: 23 | rm -rf $(VerilatorAbsObjDir) $(VerilatorCompileScript) $(VerilatorWaves) 24 | 25 | hw-script: 26 | $(Bender) update 27 | $(Bender) script $(target) \ 28 | $(common_targs) $(common_defs) \ 29 | $(sim_targs) $(sim_defs) \ 30 | > $(VerilatorCompileScript) 31 | 32 | hw-build: hw-script 33 | $(Verilator) --trace --timing --bbox-unsup \ 34 | -Wall -Wno-fatal --Wno-lint --Wno-UNOPTFLAT --Wno-MODDUP -Wno-BLKANDNBLK \ 35 | --x-assign unique --x-initial unique --top-module $(Module)_tb --Mdir $(VerilatorAbsObjDir) \ 36 | -GPROB_STALL=$(P_STALL) -GUseXif=$(UseXif) \ 37 | -CFLAGS "-DTbName=$(Vmodule)_tb -DWafeformPath=$(VerilatorWaves)" \ 38 | -sv -cc -f $(VerilatorCompileScript) --exe $(VerilatorSrc)/$(Module)_tb.cpp 39 | make -C $(VerilatorAbsObjDir) -f $(Vmodule)_tb.mk $(Vmodule)_tb 40 | 41 | hw-run: 42 | cd $(VerilatorDir); \ 43 | ./$(ObjDirName)/$(Vmodule)_tb +STIM_INSTR=$(STIM_INSTR) +STIM_DATA=$(STIM_DATA) +STACK_INIT=$(STACK_INIT) 44 | ifeq ($(gui),1) 45 | $(GtkWave) $(VerilatorWaves) 46 | endif 47 | 48 | hw-all: 
hw-clean hw-script hw-build hw-run 49 | -------------------------------------------------------------------------------- /target/sim/vsim/vsim.mk: -------------------------------------------------------------------------------- 1 | # Copyright 2023 ETH Zurich and University of Bologna. 2 | # Licensed under the Apache License, Version 2.0, see LICENSE for details. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Yvan Tortorella 6 | # 7 | # Makefragment for QuestaSim (vsim) simulation. 8 | 9 | Questa ?= questa-2023.4 10 | Module := redmule 11 | VsimDir := $(SimDir)/$(target) 12 | VsimCompileScript := $(VsimDir)/compile.$(target).tcl 13 | VsimWaves := $(VsimDir)/wave.tcl 14 | 15 | Tb := redmule_tb_wrap 16 | CompileFlags := +acc -permissive -floatparameters+$(Tb) -suppress 2583 -suppress 13314 -suppress vlog-1952 17 | VsimFlags += -suppress vsim-3009 18 | ifeq ($(UseXif),1) 19 | CoreTraces := "+log_file=$(VsimDir)/trace_core_00000000.log" 20 | endif 21 | ifeq ($(gui),1) 22 | VsimFlags += -do "set XifSel $(UseXif)" \ 23 | -do "log -r /*" \ 24 | -do "source $(VsimWaves)" 25 | else 26 | VsimFlags += -c 27 | endif 28 | 29 | VsimFlags += -suppress 3009 30 | 31 | hw-clean: 32 | rm -rf $(VsimCompileScript) $(VsimDir)/transcript $(VsimDir)/modelsim.ini $(VsimDir)/*.wlf $(VsimDir)/work $(VsimDir)/trace* 33 | 34 | hw-script: 35 | $(Bender) update 36 | $(Bender) script $(target) \ 37 | --vlog-arg="$(CompileFlags)" \ 38 | --vcom-arg="-pedanticerrors" \ 39 | $(common_targs) $(common_defs) \ 40 | $(sim_targs) $(sim_defs) \ 41 | > $(VsimCompileScript) 42 | echo 'vopt $(CompileFlags) $(Tb) -o $(Tb)_opt' >> $(VsimCompileScript) 43 | 44 | hw-build: hw-script 45 | cd $(VsimDir); \ 46 | $(Questa) $(target) -c \ 47 | -do 'echo XifSel' \ 48 | -do 'quit -code [source $(VsimCompileScript)]' 49 | 50 | hw-run: 51 | cd $(VsimDir); \ 52 | $(Questa) $(target) $(Tb)_opt \ 53 | $(VsimFlags) \ 54 | $(CoreTraces) \ 55 | +STIM_INSTR=$(STIM_INSTR) \ 56 | +STIM_DATA=$(STIM_DATA) \ 57 | 
+STACK_INIT=$(STACK_INIT) \ 58 | -gPROB_STALL=$(P_STALL) \ 59 | -gUseXif=$(UseXif) \ 60 | -do "run -a" 61 | 62 | hw-all: hw-clean hw-script hw-build hw-run 63 | -------------------------------------------------------------------------------- /target/sim/vsim/wave.tcl: -------------------------------------------------------------------------------- 1 | # Copyright 2021 ETH Zurich and University of Bologna. 2 | # Solderpad Hardware License, Version 0.51, see LICENSE for details. 3 | # SPDX-License-Identifier: SHL-0.51 4 | # 5 | # Yvan Tortorella 6 | 7 | onerror {resume} 8 | quietly WaveActivateNextPane {} 0 9 | 10 | set Testbench redmule_tb_wrap/i_redmule_tb 11 | set DutPath i_dut 12 | set TopLevelPath $DutPath/i_redmule_top 13 | if {$XifSel == {1}} { 14 | set CorePath $DutPath/gen_cv32e40x/i_core 15 | } else { 16 | set CorePath $DutPath/gen_cv32e40p/i_core 17 | } 18 | set MinHeight 16 19 | set MaxHeight 32 20 | set WavesRadix hexadecimal 21 | 22 | # Core 23 | add wave -noupdate -group Core -group top -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$CorePath/* 24 | # Top level 25 | add wave -noupdate -group RedMulE -group top -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/* 26 | add wave -noupdate -group RedMulE -group periph -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/periph/* 27 | add wave -noupdate -group RedMulE -group tcdm -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/tcdm/* 28 | # Streamer 29 | add wave -noupdate -group Streamer -group top -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/* 30 | add wave -noupdate -group Streamer -group LDST-Mux -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/i_ldst_mux/* 31 | ## X stream 32 | add wave -noupdate -group Streamer -group X-Stream -color {} -height $MinHeight 
-max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/gen_tcdm2stream[0]/i_load_tcdm_fifo/* 33 | ## W stream 34 | add wave -noupdate -group Streamer -group W-Stream -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/gen_tcdm2stream[1]/i_load_tcdm_fifo/* 35 | ## Y stream 36 | add wave -noupdate -group Streamer -group Y-Stream -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/gen_tcdm2stream[2]/i_load_tcdm_fifo/* 37 | ## Z stream 38 | add wave -noupdate -group Streamer -group Z-Stream -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/i_stream_sink/* 39 | add wave -noupdate -group Streamer -group Z-Stream -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/i_store_cast/* 40 | add wave -noupdate -group Streamer -group Z-Stream -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_streamer/i_store_fifo/* 41 | # Buffers and FIFOs 42 | ## X 43 | add wave -noupdate -group X-channel -group x-buffer_fifo -group fifo_interface -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/x_buffer_fifo/* 44 | add wave -noupdate -group X-channel -group x-buffer_fifo -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_x_buffer_fifo/* 45 | add wave -noupdate -group X-channel -group x-buffer -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_x_buffer/* 46 | ## W 47 | add wave -noupdate -group W-channel -group w-buffer_fifo -group fifo_interface -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/w_buffer_fifo/* 48 | add wave -noupdate -group W-channel -group w-buffer_fifo -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_w_buffer_fifo/* 49 | add wave 
-noupdate -group W-channel -group w-buffer -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_w_buffer/* 50 | ## Y 51 | add wave -noupdate -group Y-channel -group y-buffer_fifo -group fifo_interface -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/y_buffer_fifo/* 52 | add wave -noupdate -group Y-channel -group y-buffer_fifo -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_y_buffer_fifo/* 53 | ## Z 54 | add wave -noupdate -group Z-channel -group z-buffer_fifo -group fifo_interface -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/z_buffer_fifo/* 55 | add wave -noupdate -group Z-channel -group z-buffer_fifo -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_z_buffer_fifo/* 56 | add wave -noupdate -group Z-channel -group z-buffer -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_z_buffer/* 57 | # Engine 58 | set NumRows [examine -radix dec redmule_pkg::ARRAY_WIDTH] 59 | set NumCols [examine -radix dec redmule_pkg::ARRAY_HEIGHT] 60 | 61 | for {set row 0} {$row < $NumRows} {incr row} { 62 | for {set col 0} {$col < $NumCols} {incr col} { 63 | add wave -noupdate -group Engine -group row_$row -group CE_$col -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_redmule_engine/gen_redmule_rows[$row]/i_row/gen_computing_element[$col]/i_computing_element/* 64 | } 65 | } 66 | # Scheduler 67 | add wave -noupdate -group Scheduler -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_scheduler/* 68 | # Memory scheduler 69 | add wave -noupdate -group Scheduler -color {} -height $MinHeight -max $MaxHeight -radix $WavesRadix $Testbench/$TopLevelPath/i_memory_scheduler/* 70 | # Controller 71 | add wave -noupdate -group Controller -color {} -height $MinHeight -max $MaxHeight 
-radix $WavesRadix $Testbench/$TopLevelPath/i_control/* 72 | 73 | # Strip the hierarchical prefix from signal names 74 | config wave -signalnamewidth 1 75 | --------------------------------------------------------------------------------