├── .github └── workflows │ └── deploy-docs.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── demo ├── gem5 │ ├── .gitignore │ ├── README.rst │ ├── convert_onnx_model.py │ ├── docker │ │ ├── Dockerfile │ │ └── docker-compose.yaml │ ├── iree │ │ ├── Makefile │ │ └── toolchain.generic.cmake │ └── vlsid-riscv-fs.py └── sst │ ├── README.rst │ ├── docker │ ├── Dockerfile │ ├── docker-compose.yaml │ └── sst-elements.patch │ ├── instruction-level-simulation │ ├── .gitignore │ ├── scale_out.py │ └── scale_up.py │ ├── packet-level-simulation │ ├── large_config.json │ ├── small_config.json │ └── training_llm.py │ └── software │ ├── .gitignore │ ├── Makefile │ ├── check_mpi.c │ ├── gemm_OMP.c │ ├── hello_MPI.c │ ├── hello_MPI_OMP.c │ ├── mha_MPI_OMP.c │ ├── mha_OMP.c │ ├── riscv64.make │ ├── riscv64 │ ├── gemm_OMP │ ├── hello_MPI │ ├── hello_MPI_OMP │ ├── mha_MPI_OMP │ ├── mha_OMP_16 │ ├── mha_OMP_32 │ ├── mha_OMP_64 │ └── mha_OMP_8 │ └── x86.make ├── docs ├── 512px-LOGO-IMEC_black.svg.png ├── conf.py ├── gem5.rst ├── images │ ├── gem5 │ │ ├── gem5-system.svg │ │ └── mnist-8.svg │ ├── sst │ │ ├── core.svg │ │ ├── cpu.svg │ │ ├── mha.svg │ │ ├── mha_mpi.svg │ │ ├── node.svg │ │ └── system.svg │ └── transformer │ │ ├── 3d_parallelism_1.svg │ │ ├── 3d_parallelism_2.svg │ │ ├── data_parallelism.svg │ │ ├── pipeline_parallelism_1.svg │ │ ├── pipeline_parallelism_2.svg │ │ ├── tensor_parallelism.svg │ │ └── transformer_arch.svg ├── index.rst ├── references.bib ├── slides │ ├── 2025_05_ISPASS_Presentation.pdf │ └── VLSID25_Tutorial_Slides_imec_CSA.pdf └── sst.rst ├── external ├── .gitignore ├── INSTALL.rst ├── mvapich2-2.3.7-1.patch ├── mvapich2-2.3.7-1.tar.gz └── sst │ └── libRDMA │ └── Makefile └── requirements.txt /.github/workflows/deploy-docs.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main # Run workflow on pushes to the main branch 7 | 8 | jobs: 9 | build-deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | # Step 1: Check out the repository 14 | - name: Checkout repository 15 | uses: actions/checkout@v3 16 | 17 | # Step 2: Set up Python 18 | - name: Set up Python 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.10' # Adjust to your desired Python version 22 | 23 | # Step 3: Install dependencies 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | 29 | # Step 4: Build the HTML documentation 30 | - name: Build Sphinx documentation 31 | run: | 32 | make html 33 | 34 | # Step 5: Deploy to GitHub Pages (docs branch) 35 | - name: Deploy to GitHub Pages 36 | uses: peaceiris/actions-gh-pages@v4 37 | with: 38 | github_token: ${{ secrets.GITHUB_TOKEN }} 39 | publish_dir: build/html 40 | publish_branch: docs # The branch where GitHub Pages will serve from 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | build 35 | 
imec_tut_2025 36 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/sst/sst-elements"] 2 | path = external/sst/sst-elements 3 | url = https://github.com/sstsimulator/sst-elements.git 4 | branch = v14.1.0_Final 5 | [submodule "external/sst/sst-core"] 6 | path = external/sst/sst-core 7 | url = https://github.com/sstsimulator/sst-core.git 8 | branch = v14.1.0_Final 9 | [submodule "external/riscv-gnu-toolchain"] 10 | path = external/riscv-gnu-toolchain 11 | url = https://github.com/riscv-collab/riscv-gnu-toolchain.git 12 | branch = 2024.11.22 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 imec, Belgium 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ## This source code is licensed under the MIT license found in the 2 | ## LICENSE file in the root directory of this source tree. 3 | ## 4 | ## Copyright (c) 2025 IMEC. All rights reserved. 5 | ## ****************************************************************************** 6 | 7 | # Minimal makefile for Sphinx documentation 8 | # 9 | 10 | # You can set these variables from the command line, and also 11 | # from the environment for the first two. 12 | SPHINXOPTS ?= 13 | SPHINXBUILD ?= sphinx-build 14 | SOURCEDIR = docs 15 | BUILDDIR = build 16 | 17 | # Put it first so that "make" without argument is like "make help". 18 | help: 19 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | 21 | .PHONY: help Makefile 22 | 23 | # Catch-all target: route all unknown targets to Sphinx using the new 24 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
25 | %: Makefile 26 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A tutorial on Scalable System Simulations 2 | *Looking at RISC-V architectures and performance analysis for machine learning workloads* 3 | 4 | Dive into the world of system-level simulations! 5 | - Explore RISC-V modelling and workload representation using gem5+MLIR. 6 | - Learn to scale your system simulations effortlessly with the power of SST. 7 | 8 | This tutorial bridges cutting-edge open-source tools and techniques to empower your hardware-software co-design journey. This tutorial has been presented 9 | at [International Conference on VLSI Design 2025](https://vlsid.org/). 10 | 11 | If you use this repository, please cite it as follows: 12 | 13 | ```bibtex 14 | @misc{sim-learning-tutorial, 15 | author = {Erwan Lenormand, Tommaso Marinelli, Debjyoti Bhattacharjee}, 16 | title = {A tutorial on Scalable System Simulations}, 17 | year = {2025}, 18 | version = {v1.0.0}, 19 | howpublished = {Presented at the International Conference on VLSI Design 2025}, 20 | note = {\url{https://github.com/CSA-infra/RISCV-Scaleable-Simulation-tutorial/} Accessed: 2025-01-02} 21 | } 22 | ``` 23 | 24 | ### Getting Started 25 | Follow the documentation [online](https://csa-infra.github.io/RISCV-Scalable-Simulation-tutorial/index.html) or build the documentation yourself. 26 | Minimum required version of python is 3.10. The tutorial has been tested on Linux based systems. 27 | 28 | ``` 29 | python3 -m venv imec_tut_2025 30 | source imec_tut_2025/bin/activate 31 | pip3 install -r requirements.txt 32 | make html 33 | ``` 34 | ### Need help? 35 | If you need help or clarification regarding any part of the tutorial, file an [issue](https://github.com/CSA-infra/RISCV-Scaleable-Simulation-tutorial/issues/new) in the repository. 36 | -------------------------------------------------------------------------------- /demo/gem5/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !README.md 4 | !convert_onnx_model.py 5 | !docker/ 6 | !docker/* 7 | !images 8 | !iree/ 9 | !iree/Makefile 10 | !iree/toolchain.generic.cmake 11 | !vlsid-riscv-fs.py 12 | -------------------------------------------------------------------------------- /demo/gem5/README.rst: -------------------------------------------------------------------------------- 1 | Application-oriented system modeling and optimization 2 | ***************************************************** 3 | 4 | *i.e. how to lower an AI/ML model to simulated RISC-V hardware for system-level 5 | exploration* 6 | 7 | The goal of this tutorial is to introduce the attendees to architectural 8 | simulation targeting machine learning workloads. The main tool we will be 9 | using to model a sample RISC-V system and run applications on top is 10 | \ `gem5 `__\ . The ML benchmarks are derived from 11 | ONNX files, translated into machine-optimized code and executed though a 12 | ligthweight runtime. This process is carried out with the help of the 13 | \ `IREE `__\ workflow. 14 | 15 | Prerequisites 16 | ------------- 17 | 18 | - A Linux-based x86-64 system (native or WSL2/VM) 19 | - Docker or Podman 20 | 21 | Containerized environment 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | .. 
note:: 25 | The container is executed in privileged mode to 26 | allow mounting the disk image as a loop device. If you don’t like this, 27 | remove the corresponding option from ``docker-compose.yaml``. 28 | 29 | Dealing with all the software dependencies that this setup needs can be 30 | complicated. For this reason, a container file has been provided, which 31 | allows to generate a virtual environment with all the dependencies 32 | installed. Assuming that Docker is present in your system, you can prepare 33 | the environment this way: 34 | 35 | :: 36 | 37 | git clone https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial.git vlsid-csa-tutorial 38 | cd vlsid-csa-tutorial/demo/gem5/docker 39 | docker compose up -d 40 | 41 | If it doesn’t work, try with ``docker-compose`` alternatively. 42 | 43 | To enter the container: 44 | 45 | :: 46 | 47 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash 48 | 49 | If you stop the container (e.g. reboot), you can easily return back to 50 | it with: 51 | 52 | :: 53 | 54 | docker start docker_vlsid-iree-gem5_1 55 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash 56 | 57 | Finally, if you want to destroy the container, you can do it with: 58 | 59 | :: 60 | 61 | cd vlsid-csa-tutorial/demo/gem5/docker 62 | docker compose down 63 | 64 | The working directory inside the container is ``/opt/vlsid-iree-gem5``. 65 | We will assume that every command is executed from that folder. 66 | 67 | Environment Setup 68 | ----------------- 69 | 70 | Part 1: Prepare benchmark 71 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 72 | 73 | The IREE workflow is used to first convert a ML model to a supported 74 | intermediate representation, then compile and optimize the model for a 75 | target architecture. The output of the process is a Virtual Machine 76 | FlatBuffer (VMFB) file than can be run by the IREE runtime. 77 | 78 | A simple MNIST image classification model will be used as example, but 79 | the process is generalizable to other models too. The file format for the 80 | model is ONNX. Note that IREE also supports other formats (e.g. TF/TFLite), 81 | it is possible to convert them to MLIR using the right importers. 82 | 83 | .. figure:: images/gem5/mnist-8.svg 84 | :align: center 85 | 86 | Visual representation of the MNIST model 87 | 88 | - Download ONNX model 89 | 90 | :: 91 | 92 | wget https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mnist/model/mnist-8.onnx -O mnist-8-orig.onnx 93 | 94 | - `Upgrade ONNX 95 | opset `__ 96 | 97 | :: 98 | 99 | ./convert_onnx_model.py mnist-8-orig.onnx mnist-8.onnx 100 | 101 | - Use IREE to convert ONNX file to MLIR Torch ONNX dialect 102 | 103 | :: 104 | 105 | iree-import-onnx mnist-8.onnx > mnist-8.mlir 106 | 107 | - Compile MLIR model to VMFB 108 | 109 | :: 110 | 111 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c mnist-8.mlir -o mnist-8.vmfb 112 | 113 | Part 2: Compile IREE run module 114 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 115 | 116 | The IREE run module allows the execution of a compiled module using the 117 | IREE runtime. This module has to be added to the final disk image 118 | together with the benchmarks, since we don’t want to pull the entire 119 | IREE distribution. 120 | 121 | Even if pre-built binaries are available, as of now they are not 122 | compiled for any RISC-V architecture. Thus, we will have to compile this 123 | module from source. A Makefile has been provided to simplify the 124 | process. 
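The ``make`` invocation below drives the whole flow: it fetches a RISC-V musl cross-toolchain, builds the IREE host tools natively, and then cross-compiles ``iree-run-module``. After it completes, a quick way to confirm that the produced binary really targets RISC-V before packing it into the disk image (assuming the Makefile's default install path) is to inspect its ELF header:

::

   readelf -h iree/iree-build-riscv64/install/bin/iree-run-module | grep Machine

The ``Machine`` field should read ``RISC-V``; anything else means the cross-toolchain was not picked up correctly.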
125 | 126 | :: 127 | 128 | make -C iree 129 | 130 | Part 3: Compile m5 utility 131 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 132 | 133 | The m5 utility is used to send pseudo-instructions to the simulator. 134 | This allows a number of operations, like checkpointing, resetting 135 | statistics, etc. We want to include this utility in our final image. 136 | Note that will need the cross-compiler employed in the previous step to 137 | generate the binary. 138 | 139 | - Get the gem5 simulator 140 | 141 | :: 142 | 143 | git clone https://github.com/gem5/gem5.git -b v24.1 144 | 145 | - Compile the m5 utility 146 | 147 | :: 148 | 149 | export PATH=$PATH:$(realpath toolchain-riscv64/bin) 150 | scons riscv.CROSS_COMPILE=riscv64-buildroot-linux-musl- -C gem5/util/m5 build/riscv/out/m5 151 | 152 | Part 4: Prepare RISC-V disk image 153 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 154 | 155 | .. warning:: 156 | If using Podman or rootless Docker, this steps must be done 157 | outside the container, as they typically require sudo permissions. 158 | Pay attention when executing each command! 159 | 160 | The last part of the setup consists in packing the benchmarks and IREE 161 | runtime into a disk image. For this task, we will use a pre-built 162 | minimal image from the gem5 community and modify it. 163 | 164 | - Get and extract `base 165 | image `__ 166 | 167 | :: 168 | 169 | wget https://storage.googleapis.com/dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz 170 | gzip -d riscv-disk.img.gz 171 | cp riscv-disk.img vlsid-disk.img 172 | 173 | - Mount image (execute with sudo if outside the container) 174 | 175 | :: 176 | 177 | mkdir /tmp/rootfs 178 | mount vlsid-disk.img /tmp/rootfs 179 | 180 | - Copy benchmark (execute with sudo if outside the container) 181 | 182 | :: 183 | 184 | cp mnist-8.vmfb /tmp/rootfs/root/ 185 | 186 | - Copy IREE run module (execute with sudo if outside the container) 187 | 188 | :: 189 | 190 | cp iree/iree-build-riscv64/install/bin/iree-run-module /tmp/rootfs/bin/ 191 | 192 | - Copy m5 utility (execute with sudo if outside the container) 193 | 194 | :: 195 | 196 | cp gem5/util/m5/build/riscv/out/m5 /tmp/rootfs/sbin/ 197 | 198 | - Unmount image (execute with sudo if outside the container) 199 | 200 | :: 201 | 202 | umount /tmp/rootfs 203 | 204 | Machine Learning Workload Execution 205 | ----------------------------------- 206 | 207 | At this point, we are ready to run the experiment. A gem5 configuration 208 | file is present in this directory, which is derived from the 209 | ``riscv-fs.py`` sample script of gem5. The main difference is that 210 | instead of using the default disk image it will pick the one that we 211 | have just generated. 212 | 213 | The script defines a simple RISC-V system comprising a processor, a two-level 214 | cache hierarchy, a main memory and a generic board with some basic devices 215 | (UART controller, RNG, disk interface, etc.). An auto-generated diagram of the 216 | simulated system is presented below. You may need to zoom in to find out about 217 | all the individual components and connections. 218 | 219 | .. figure:: images/gem5/gem5-system.svg 220 | :align: center 221 | 222 | Composition of the simulated system 223 | 224 | - Compile gem5 225 | 226 | .. note:: 227 | This step will take a while. 228 | 229 | :: 230 | 231 | scons build/RISCV/gem5.opt -C gem5 -j$(nproc) 232 | 233 | - Compile m5term 234 | 235 | :: 236 | 237 | make -C gem5/util/term 238 | 239 | - Run the script 240 | 241 | .. note:: 242 | This step will take a while. 
We will speed up following 243 | executions through checkpointing. 244 | 245 | :: 246 | 247 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py 248 | 249 | While the simulation is running, its output is not immediately visible, 250 | as it is redirected to a separate console. To view it, open another 251 | terminal and use the m5term utility. 252 | 253 | :: 254 | 255 | ./gem5/util/term/m5term 3456 256 | 257 | The boot process is going to take several minutes. After that, you will 258 | se a login shell. Enter user “root” and password “root” to proceed. 259 | After login, you can launch your IREE benchmark. This is the command to 260 | execute for MNIST: 261 | 262 | :: 263 | 264 | iree-run-module --module=/root/mnist-8.vmfb --device=local-task --input="1x1x28x28xf32=0" 265 | 266 | For simplicity we are assuming an input tensor filled with zeros. You 267 | should see this output after some time: 268 | 269 | :: 270 | 271 | EXEC @CNTKGraph 272 | result[0]: hal.buffer_view 273 | 1x10xf32=[-0.044856 0.00779166 0.0681008 0.0299937 -0.12641 0.140219 -0.0552849 -0.0493838 0.0843221 -0.0545404] 274 | 275 | Congratulations! You are ready to go! 276 | 277 | Extra: Checkpoints 278 | ------------------ 279 | 280 | You will have noticed that booting the Linux kernel and reaching the 281 | login shell takes several minutes, even with a minimal image like the 282 | one we are using. We want to avoid waiting so long for each one of the 283 | experiments. One of the commonly used techniques to deal with these 284 | situations is checkpointing: we can “take a picture” of the system at a 285 | certain moment of time and start other simulations from that point. 286 | Technically speaking, this requires saving the main memory content and 287 | the processors context. Cache content is not saved, but since we will 288 | execute our benchmarks from scratch this is not a big deal. 289 | 290 | In order to dump a checkpoint, after entering the shell in the simulated 291 | environment type this command: 292 | 293 | :: 294 | 295 | m5 checkpoint 296 | 297 | After terminating the simulation, you will see that in the output folder 298 | (e.g. ``m5out``) a folder named ``cpt.`` has appeared. This 299 | contains the checkpoint we have just dumped. We strongly suggest to move 300 | this folder outside the ``m5out`` directory. 301 | 302 | :: 303 | 304 | mv m5out/cpt. checkpoint 305 | 306 | From now on, it will be possible to execute a simulation starting from 307 | this checkpoint. It is sufficient to add an argument to the gem5 308 | command, specifying the position of the folder containing the checkpoint 309 | files: 310 | 311 | :: 312 | 313 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py --restore-from checkpoint 314 | 315 | This way, you will be immediately dropped to the shell. Huge 316 | improvement! 317 | 318 | Experimental Studies 319 | -------------------- 320 | 321 | Now that you are able to run complete simulations, it is time to explore 322 | a few knobs and analyze their impact on the system performance. 323 | 324 | Part 1: Change CPU model 325 | ~~~~~~~~~~~~~~~~~~~~~~~~ 326 | 327 | The gem5 simulator supports different `CPU 328 | models `__. 329 | By default, the script runs with an *atomic* CPU, which implies atomic 330 | accesses to the memory system with fixed latencies. This model is fast 331 | and simple, but inaccurate. 332 | 333 | The first task is to replace the CPU type with a more detailed one. 
334 | There are three possible choices: 335 | 336 | - **TimingSimpleCPU:** simple timing CPU, 1-stage pipeline 337 | - **MinorCPU:** in-order CPU, 4-stages pipeline 338 | - **O3CPU:** out-of-order CPU, 7-stages pipeline 339 | 340 | These CPU models are highly configurable, but for this experiment it is 341 | fine to stick with the default parameters set. 342 | 343 | To implement such change, open the ``vlsid-riscv-fs.py`` script and 344 | change ``CPUTypes.ATOMIC`` (line 78) to ``CPUTypes.TIMING``, 345 | ``CPUTypes.MINOR`` and ``CPUTypes.O3``. After each execution, have a 346 | look at the ``stats.txt`` file in the output folder (default: 347 | ``m5out``). In particular, look at how these statistics change: 348 | 349 | :: 350 | 351 | simSeconds -> Simulated system execution time 352 | hostSeconds -> Host system simulation time 353 | board.processor.cores.core.ipc -> IPC of simulated CPU 354 | board.memory.mem_ctrl.dram.bwTotal::total -> DRAM memory bandwidth 355 | 356 | **Tip 1:** Wrap your benchmark execution around the commands “m5 357 | resetstats” and “m5 exit”, to make sure that the statistics only reflect 358 | the benchmark execution and not the system boot or idle time. E.g.: 359 | 360 | :: 361 | 362 | m5 resetstats && iree-run-module [...] && m5 exit 363 | 364 | **Tip 2:** You can specify different output folders for each experiment. 365 | E.g.: 366 | 367 | :: 368 | 369 | gem5.opt -d ./experiment1 vlsid-riscv-fs.py 370 | 371 | Part 2: Change cache hierarchy 372 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 373 | 374 | The cache configuration can have a significant impact on the system 375 | performance, depending on the data locality and access patterns of the 376 | executed applications. This is one of the knobs we can easily change in 377 | the ``vlsid-riscv-fs.py`` configuration file (line 70). 378 | 379 | The second task consists in performing the experiments after applying 380 | the following modifications (one by one): 381 | 382 | - Decrease L1I (instruction cache) and L1D (data cache) size from 32 kB 383 | to 8 kB 384 | - Increase L2 (last-level cache) size from 512 kB to 2 MB 385 | 386 | Use MinorCPU or O3CPU. Compare the output statistic with the baseline 387 | configuration, to check if there is a change in performance and how 388 | appreciable that is. You can also have a look at cache-specific metrics, 389 | e.g. the miss rates: 390 | 391 | :: 392 | 393 | board.cache_hierarchy.l1d-cache-0.overallMissRate::total 394 | board.cache_hierarchy.l1i-cache-0.overallMissRate::total 395 | board.cache_hierarchy.l2-cache-0.overallMissRate::total 396 | 397 | Part 3: Vectorization 398 | ~~~~~~~~~~~~~~~~~~~~~ 399 | 400 | The RISC-V architecture we are simulating supports the RVV vector 401 | extension v1.0. This means that the IREE compiler can optimize the 402 | application by enabling SIMD support. The default VLEN for the simulated 403 | hardware is of 256 bits. 404 | 405 | For this step, we will need to recompile the benchmark and add it to the 406 | disk image. The following command will create an RVV-enabled benchmark: 407 | 408 | :: 409 | 410 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c,+v,+zvl256b -riscv-v-vector-bits-min=256 -riscv-v-fixed-length-vector-lmul-max=8 mnist-8.mlir -o mnist-8 411 | -v.vmfb 412 | 413 | Execute this new version of the benchmark and compare the output with 414 | the non-vectorized version. You should notice an improvement of the 415 | performance. 
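When you start accumulating runs (baseline, different CPU models, cache sizes, RVV on and off), it is handy to pull the headline metrics out of each ``stats.txt`` in one go. A quick way to do this, assuming you gave each run its own output folder with ``-d`` (e.g. ``experiment1``, ``experiment2``), is:

::

   grep -E "^(simSeconds|hostSeconds)" experiment*/stats.txt
   grep "core.ipc" experiment*/stats.txt

``grep`` prefixes each match with the folder it came from, so the runs can be compared side by side.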
416 | 417 | **Note:** Like other microarchitectural parameters, the latencies of the 418 | vector units are not calibrated on any specific design, and default 419 | values are used. Do not expect fully realistic numbers. 420 | 421 | Part 4: New benchmarks 422 | ~~~~~~~~~~~~~~~~~~~~~~ 423 | 424 | .. warning:: 425 | The execution time can be much higher for more complex 426 | benchmarks, even in atomic mode. We suggest you to try out these 427 | tests after the tutorial, keeping the simulations as background tasks 428 | until they complete. 429 | 430 | Now that you know how to run the full workflow, you can try out new 431 | benchmarks. Bear in mind that not all the models are supported with the 432 | current version of IREE, and compatibility issues may arise when 433 | compiling. We will provide you with a few examples that are guaranteed 434 | to succeed. 435 | 436 | :: 437 | 438 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mobilenet/model/mobilenetv2-10.onnx 439 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx 440 | 441 | The launch commands for these models are: 442 | 443 | :: 444 | 445 | iree-run-module --module=/root/mobilenetv2-10.vmfb --device=local-task --input="1x1x672x672xf32=0" 446 | iree-run-module --module=/root/super-resolution-10.vmfb --device=local-task --input="1x1x224x224xf32=0" 447 | 448 | **Tip:** If you want to store multiple models in your image, or models 449 | that exceed the image capacity, you may run out of space. You can resize 450 | the image to a bigger size (e.g. 150 MB) with the following commands: 451 | 452 | :: 453 | 454 | e2fsck -f vlsid-disk.img 455 | resize2fs vlsid-disk.img 150M 456 | -------------------------------------------------------------------------------- /demo/gem5/convert_onnx_model.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # Script derived and adapted from this source: 3 | # https://iree.dev/guides/ml-frameworks/onnx/#troubleshooting 4 | 5 | import argparse 6 | import onnx 7 | 8 | parser = argparse.ArgumentParser("ONNX Version Converter") 9 | parser.add_argument("input", type=str, help="Input ONNX file") 10 | parser.add_argument("output", type=str, help="Output ONNX file") 11 | args = parser.parse_args() 12 | 13 | original_model = onnx.load_model(args.input) 14 | converted_model = onnx.version_converter.convert_version(original_model, 17) 15 | onnx.save(converted_model, args.output) 16 | -------------------------------------------------------------------------------- /demo/gem5/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 The Regents of the University of California 2 | # Copyright (c) 2024 imec v.z.w. 3 | # All Rights Reserved. 
4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are 7 | # met: redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer; 9 | # redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution; 12 | # neither the name of the copyright holders nor the names of its 13 | # contributors may be used to endorse or promote products derived from 14 | # this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | FROM --platform=${BUILDPLATFORM} ubuntu:24.04 29 | 30 | ENV DEBIAN_FRONTEND=noninteractive 31 | RUN apt -y update && apt -y upgrade && apt -y install \ 32 | build-essential \ 33 | scons \ 34 | python3-dev \ 35 | git \ 36 | pre-commit \ 37 | zlib1g \ 38 | zlib1g-dev \ 39 | libprotobuf-dev \ 40 | protobuf-compiler \ 41 | libprotoc-dev \ 42 | libgoogle-perftools-dev \ 43 | libboost-all-dev \ 44 | libhdf5-serial-dev \ 45 | python3-pip \ 46 | python3-pydot \ 47 | python3-venv \ 48 | python3-tk \ 49 | mypy \ 50 | m4 \ 51 | libcapstone-dev \ 52 | libpng-dev \ 53 | libelf-dev \ 54 | pkg-config \ 55 | wget \ 56 | cmake \ 57 | doxygen 58 | 59 | RUN python3 -m pip install --break-system-packages \ 60 | tensorflow \ 61 | iree-base-runtime==3.4.0 \ 62 | iree-base-compiler==3.4.0 \ 63 | matplotlib \ 64 | onnx \ 65 | pandas-stubs 66 | 67 | # pre-commit, as installed via apt in 24.04, attempts to create a cache 68 | # directory at "${HOME}/.cache/pre-commit". If running docker with non-root, 69 | # the HOME directory is set to "/". Since non-root users do not have permission 70 | # to write to this directory, an error is returned when pre-commit is executed. 71 | # pre-commit's default cache directory can be changed via the `XDG_CACHE_HOME` 72 | # enivoronment variable. Here we set it to "/tmp". With this pre-commit will 73 | # create a "/tmp/pre-commit" directory to use for caching. "/tmp" was chosen 74 | # as it's a directory any user can access and write to. Given this only stores 75 | # caching information, the "/tmp" directory being wiped is not a concern. 76 | ENV XDG_CACHE_HOME=/tmp/ 77 | -------------------------------------------------------------------------------- /demo/gem5/docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 imec v.z.w. 2 | # All Rights Reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer; 8 | # redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution; 11 | # neither the name of the copyright holders nor the names of its 12 | # contributors may be used to endorse or promote products derived from 13 | # this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | services: 28 | vlsid-iree-gem5: 29 | privileged: true 30 | stdin_open: true 31 | tty: true 32 | build: 33 | context: . 34 | dockerfile: Dockerfile 35 | volumes: 36 | - ..:/opt/vlsid-iree-gem5 37 | -------------------------------------------------------------------------------- /demo/gem5/iree/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 imec v.z.w. 2 | # All Rights Reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer; 8 | # redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution; 11 | # neither the name of the copyright holders nor the names of its 12 | # contributors may be used to endorse or promote products derived from 13 | # this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | IREE_GIT_TAG=v3.4.0 28 | TOOLCHAIN_ROOT=.. 29 | TOOLCHAIN_URL=https://toolchains.bootlin.com/downloads/releases/toolchains/riscv64-lp64d/tarballs/riscv64-lp64d--musl--stable-2024.05-1.tar.xz 30 | 31 | IREE_BUILD_OPTS=\ 32 | -DIREE_BUILD_COMPILER=OFF \ 33 | -DIREE_BUILD_TESTS=OFF \ 34 | -DIREE_BUILD_SAMPLES=OFF \ 35 | -DIREE_BUILD_DOCS=OFF \ 36 | -DIREE_HAL_DRIVER_VULKAN=OFF 37 | 38 | toolchan_root_abspath=$(shell realpath $(TOOLCHAIN_ROOT)) 39 | toolchain_path=$(toolchan_root_abspath)/toolchain-riscv64 40 | sysroot_path=$(toolchain_path)/riscv64-buildroot-linux-musl/sysroot 41 | 42 | default: iree-build-riscv64 43 | 44 | iree-dist: 45 | git clone --depth 1 --branch $(IREE_GIT_TAG) https://github.com/iree-org/iree.git iree-dist 46 | cd iree-dist && git submodule update --init \ 47 | third_party/benchmark \ 48 | third_party/cpuinfo \ 49 | third_party/flatcc \ 50 | third_party/googletest \ 51 | third_party/hip-build-deps \ 52 | third_party/musl \ 53 | third_party/spirv_cross \ 54 | third_party/tracy \ 55 | third_party/vulkan_headers \ 56 | third_party/webgpu-headers \ 57 | third_party/hsa-runtime-headers 58 | 59 | iree-build-native: iree-dist 60 | cd $< && cmake -G "Unix Makefiles" \ 61 | $(IREE_BUILD_OPTS) \ 62 | -DCMAKE_INSTALL_PREFIX=../$@/install \ 63 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 64 | . -B ../$@ 65 | cd $< && cmake --build ../$@ --target install 66 | 67 | orig_dir=$(patsubst %.tar.gz,%,$(patsubst %.tar.bz2,%,$(patsubst %.tar.xz,%,$(lastword $(subst /, ,$(TOOLCHAIN_URL)))))) 68 | 69 | .PHONY: toolchain-riscv64 70 | toolchain-riscv64: $(toolchain_path) 71 | $(toolchain_path): 72 | wget $(TOOLCHAIN_URL) -O $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2 73 | tar -xvf $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2 -C $(TOOLCHAIN_ROOT) 74 | mv $(TOOLCHAIN_ROOT)/$(orig_dir) $(toolchain_path) 75 | $(toolchain_path)/relocate-sdk.sh 76 | rm $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2 77 | 78 | iree-build-riscv64: iree-dist iree-build-native $(toolchain_path) 79 | cd $< && cmake -G "Unix Makefiles" \ 80 | $(IREE_BUILD_OPTS) \ 81 | -DIREE_ENABLE_CPUINFO=OFF \ 82 | -DCMAKE_INSTALL_PREFIX=../$@/install \ 83 | -DIREE_HOST_BIN_DIR=../iree-build-native/install/bin \ 84 | -DCMAKE_TOOLCHAIN_FILE="../toolchain.generic.cmake" \ 85 | -DTOOLCHAIN_TARGET=riscv64 \ 86 | -DTOOLCHAIN_PATH=$(toolchain_path) \ 87 | -DTOOLCHAIN_PREFIX=riscv64-buildroot-linux-musl- \ 88 | . 
-B ../$@ 89 | cd $< && cmake --build ../$@ --target install 90 | 91 | .PHONY: clean 92 | clean: 93 | rm -rf iree-dist iree-build-* 94 | 95 | .PHONY: distclean 96 | distclean: clean 97 | rm -rf $(toolchain_path) 98 | -------------------------------------------------------------------------------- /demo/gem5/iree/toolchain.generic.cmake: -------------------------------------------------------------------------------- 1 | # Script derived and adapted from this source: 2 | # https://kubasejdak.com/how-to-cross-compile-for-embedded-with-cmake-like-a-champ 3 | 4 | set(CMAKE_SYSTEM_NAME Generic) 5 | set(CMAKE_SYSTEM_PROCESSOR ${TOOLCHAIN_TARGET}) 6 | 7 | # Without that flag CMake is not able to pass test compilation check 8 | set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) 9 | 10 | set(CMAKE_AR ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ar${CMAKE_EXECUTABLE_SUFFIX}) 11 | set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX}) 12 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX}) 13 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}g++${CMAKE_EXECUTABLE_SUFFIX}) 14 | set(CMAKE_LINKER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ld${CMAKE_EXECUTABLE_SUFFIX}) 15 | set(CMAKE_OBJCOPY ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}objcopy${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "") 16 | set(CMAKE_RANLIB ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ranlib${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "") 17 | set(CMAKE_SIZE ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}size${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "") 18 | set(CMAKE_STRIP ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}strip${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "") 19 | 20 | set(CMAKE_C_FLAGS "-static -Wno-psabi -fdata-sections -ffunction-sections -Wl,--gc-sections" CACHE INTERNAL "") 21 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -fno-exceptions" CACHE INTERNAL "") 22 | 23 | set(CMAKE_C_FLAGS_DEBUG "-Os -g" CACHE INTERNAL "") 24 | set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG" CACHE INTERNAL "") 25 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" CACHE INTERNAL "") 26 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}" CACHE INTERNAL "") 27 | 28 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 29 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 30 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 31 | -------------------------------------------------------------------------------- /demo/gem5/vlsid-riscv-fs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 The Regents of the University of California 2 | # Copyright (c) 2024 imec v.z.w. 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are 7 | # met: redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer; 9 | # redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution; 12 | # neither the name of the copyright holders nor the names of its 13 | # contributors may be used to endorse or promote products derived from 14 | # this software without specific prior written permission. 
15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | """ 29 | This example runs a simple linux boot. It uses the 'riscv-disk-img' resource. 30 | It is built with the sources in `src/riscv-fs` in [gem5 resources]( 31 | https://github.com/gem5/gem5-resources). 32 | 33 | Characteristics 34 | --------------- 35 | 36 | * Runs exclusively on the RISC-V ISA with the classic caches 37 | * Assumes that the kernel is compiled into the bootloader 38 | * Automatically generates the DTB file 39 | * Will boot but requires a user to login using `m5term` (username: `root`, 40 | password: `root`) 41 | """ 42 | 43 | import argparse 44 | from pathlib import Path 45 | 46 | from gem5.components.boards.riscv_board import RiscvBoard 47 | from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import ( 48 | PrivateL1PrivateL2WalkCacheHierarchy, 49 | ) 50 | from gem5.components.memory import SingleChannelDDR3_1600 51 | from gem5.components.processors.cpu_types import CPUTypes 52 | from gem5.components.processors.simple_processor import SimpleProcessor 53 | from gem5.isas import ISA 54 | from gem5.resources.resource import DiskImageResource, obtain_resource 55 | from gem5.simulate.simulator import Simulator 56 | from gem5.utils.requires import requires 57 | 58 | # Run a check to ensure the right version of gem5 is being used. 59 | requires(isa_required=ISA.RISCV) 60 | 61 | # Instantiate argument parser 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument("--restore-from", type=Path, help="Checkpoint directory") 64 | parser.add_argument( 65 | "--disk-image", 66 | type=Path, 67 | help="Disk image path", 68 | default="vlsid-disk.img" 69 | ) 70 | args = parser.parse_args() 71 | 72 | # Setup the cache hierarchy. 73 | # For classic, PrivateL1PrivateL2 and NoCache have been tested. 74 | # For Ruby, MESI_Two_Level and MI_example have been tested. 75 | cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy( 76 | l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB" 77 | ) 78 | 79 | # Setup the system memory. 80 | memory = SingleChannelDDR3_1600() 81 | 82 | # Setup a single core Processor. 83 | processor = SimpleProcessor( 84 | cpu_type=CPUTypes.ATOMIC, isa=ISA.RISCV, num_cores=1 85 | ) 86 | 87 | # Setup the board. 88 | board = RiscvBoard( 89 | clk_freq="1GHz", 90 | processor=processor, 91 | memory=memory, 92 | cache_hierarchy=cache_hierarchy, 93 | ) 94 | 95 | # Set the Full System workload. 
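# The call below couples a prebuilt Linux kernel and an OpenSBI bootloader
# (fetched automatically as gem5 resources) with the disk image prepared in
# the README (Part 4). If --restore-from points to a checkpoint directory,
# the simulation resumes from that snapshot instead of booting from scratch.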
96 | board.set_kernel_disk_workload( 97 | kernel=obtain_resource("riscv-linux-6.6.33-kernel", 98 | resource_version="1.0.0"), 99 | bootloader=obtain_resource("riscv-bootloader-opensbi-1.3.1", 100 | resource_version="1.0.0"), 101 | disk_image=DiskImageResource(args.disk_image), 102 | checkpoint=args.restore_from, 103 | ) 104 | 105 | simulator = Simulator(board=board) 106 | print("Beginning simulation!") 107 | # Note: This simulation will never stop. You can access the terminal upon boot 108 | # using m5term (`./util/term`): `./m5term localhost `. Note the `` 109 | # value is obtained from the gem5 terminal stdout. Look out for 110 | # "system.platform.terminal: Listening for connections on port ". 111 | simulator.run() 112 | -------------------------------------------------------------------------------- /demo/sst/README.rst: -------------------------------------------------------------------------------- 1 | Scale-out system simulation with SST 2 | ************************************ 3 | 4 | **How to perform a scale-out system simulation with instruction-level simulation and packet-level simulation?** 5 | The goal of the second part of this tutorial is to introduce the Structural Simulation 6 | Toolkit (SST) framework which allows to simulate a scale-out. 7 | 8 | Instruction-level simulation 9 | ============================ 10 | 11 | Environment Setup 12 | ----------------- 13 | 14 | To run the SST experiments you need to install SST. Please refer to `Installation instructions`_. 15 | 16 | 17 | System under exploration 18 | ------------------------ 19 | .. _cpu figure: 20 | 21 | .. figure:: images/sst/cpu.svg 22 | :width: 400 23 | :align: center 24 | 25 | Microarchitecture of a cpu core. 26 | 27 | 28 | The system under exploration is made up of multi-threaded RISC-V CPU cores. As illustrated 29 | in Figure :numref:`cpu figure`, a CPU core is attached to an L1 data cache and an L1 30 | instruction cache. The two caches are interconnect to a second level of cache (L2 cache) 31 | with a memory bus. The core itself is composed of one decoder for each thread, one branch 32 | unit and one dispatch unit, one register file for floating point numbers and another one 33 | for integers, a load store unit (or load store queue), multiple ALU and multiple FPU. The 34 | core is attached to each cache through a TLB and a memory interface. TLBs are managed by 35 | the operating system. 36 | 37 | 38 | .. _node figure: 39 | 40 | .. figure:: images/sst/node.svg 41 | :width: 600 42 | :align: center 43 | 44 | Microarchitecture of a compute node. 45 | 46 | As shown in Figure :numref:`node figure`, the RISC-V cores are integrated into a compute 47 | node. The number of cores per node is configurable from the script. The set of L2 caches 48 | are federated with a directory which maintains coherency in the node. The L2s and the 49 | directory are interconnected through a NoC. The directory is attached to a DRAM 50 | controller. In addition, a node integrates a component that emulates an operating systems. 51 | The latter manages the virtual memory and is attached to every CPU core to provide the 52 | minimal service required to run applications. 53 | 54 | .. _system figure: 55 | 56 | .. figure:: images/sst/system.svg 57 | :width: 800 58 | :align: center 59 | :alt: Scale-out system microarchitecture 60 | 61 | Microarchitecture of a multi-node system. 62 | 63 | Multi-node can be interconnect with a network to build a scale-out system, as illustrated 64 | in Figure :numref:`system figure`. 
Each node has an independent operating system and a 65 | private memory space. To allow communication between nodes, we can use 66 | Message Passing Interface (MPI). To do so, each node additionally integrates a NIC. The 67 | latter is connected to the NoC. 68 | 69 | The inter-node network is built with pymerlin (a Python script provided in SST-elements). 70 | Thanks to that script, we can easily define different topologies (e.g., single router, fat 71 | tree, dragonfly, torus, mesh, etc.). 72 | 73 | 74 | Every component and sub-component is configurable; for instance, you can configure the 75 | latency of the ALU or the capacity of each cache. You can find more information on the 76 | parameters and their impact on the simulated system using the **sst-info** command. 77 | 78 | .. list-table:: How to find the available parameters 79 | :widths: 25 50 80 | :header-rows: 1 81 | 82 | * - Command 83 | - Description 84 | * - sst-info vanadis 85 | - Parameters of the CPU core and the operating system 86 | * - sst-info mmu 87 | - Parameters of the TLB and MMU 88 | * - sst-info memHierarchy 89 | - Parameters of the cache, directory controller, DRAM, memory bus 90 | * - sst-info merlin 91 | - Parameters of the NoC and inter-node network components 92 | * - sst-info rdmaNic 93 | - Parameters of the NIC 94 | 95 | 96 | Workload under evaluation 97 | ------------------------- 98 | 99 | 100 | The workload under evaluation is inspired by multi-head attention, one of the 101 | computation layers of transformers :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 102 | 103 | .. _OMP_MHA figure: 104 | 105 | .. figure:: images/sst/mha.svg 106 | :width: 600 107 | :align: center 108 | :alt: Multi-head attention block 109 | 110 | Illustration of the workload run on a single-node system. 111 | 112 | As shown in Figure :numref:`OMP_MHA figure`, the application multiplies an *Embeddings* 113 | matrix of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements with 3 matrices of 114 | D\ :sub:`model` x D\ :sub:`model` weights, producing 3 matrices of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements, 115 | called Keys, Queries and Values. The weight matrices are divided into *heads*. 116 | Each head of the Queries matrix is multiplied with the corresponding transposed head of the Keys 117 | matrix, producing the *QK* matrix. The latter is then scaled, and the *softmax* of each row of 118 | the scaled *QK* is computed. Afterwards, the result of the *softmax* is multiplied with the 119 | Values matrix, producing the *QKV* matrix. Finally, *QKV* is summed with the *Embeddings* 120 | matrix. 121 | 122 | 123 | .. _mha_OMP: ../../demo/sst/software/mha_OMP.c 124 | 125 | The corresponding code (`mha_OMP`_) is implemented in **C** and parallelized with **OpenMP**. 126 | 127 | 128 | Matrix-matrix multiplication is the heaviest workload in this application. To minimize 129 | data movement, a tiled GEMM is implemented. The *TILE_SIZE* macro defines the dimension of the 130 | tiles. 131 | 132 | .. code-block:: C 133 | :linenos: 134 | :emphasize-lines: 1 135 | 136 | const int bsize = TILE_SIZE; 137 | int ii0, ii1, ii2; 138 | int i0, i1, i2; 139 | int h; 140 | int start_head, stop_head; 141 | data_t pp; 142 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2) 143 | for (h=0; h < heads; h++) { 144 | for (ii0 = 0; ii0