├── .github
└── workflows
│ └── deploy-docs.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── Makefile
├── README.md
├── demo
├── gem5
│ ├── .gitignore
│ ├── README.rst
│ ├── convert_onnx_model.py
│ ├── docker
│ │ ├── Dockerfile
│ │ └── docker-compose.yaml
│ ├── iree
│ │ ├── Makefile
│ │ └── toolchain.generic.cmake
│ └── vlsid-riscv-fs.py
└── sst
│ ├── README.rst
│ ├── docker
│ ├── Dockerfile
│ ├── docker-compose.yaml
│ └── sst-elements.patch
│ ├── instruction-level-simulation
│ ├── .gitignore
│ ├── scale_out.py
│ └── scale_up.py
│ ├── packet-level-simulation
│ ├── large_config.json
│ ├── small_config.json
│ └── training_llm.py
│ └── software
│ ├── .gitignore
│ ├── Makefile
│ ├── check_mpi.c
│ ├── gemm_OMP.c
│ ├── hello_MPI.c
│ ├── hello_MPI_OMP.c
│ ├── mha_MPI_OMP.c
│ ├── mha_OMP.c
│ ├── riscv64.make
│ ├── riscv64
│ ├── gemm_OMP
│ ├── hello_MPI
│ ├── hello_MPI_OMP
│ ├── mha_MPI_OMP
│ ├── mha_OMP_16
│ ├── mha_OMP_32
│ ├── mha_OMP_64
│ └── mha_OMP_8
│ └── x86.make
├── docs
├── 512px-LOGO-IMEC_black.svg.png
├── conf.py
├── gem5.rst
├── images
│ ├── gem5
│ │ ├── gem5-system.svg
│ │ └── mnist-8.svg
│ ├── sst
│ │ ├── core.svg
│ │ ├── cpu.svg
│ │ ├── mha.svg
│ │ ├── mha_mpi.svg
│ │ ├── node.svg
│ │ └── system.svg
│ └── transformer
│ │ ├── 3d_parallelism_1.svg
│ │ ├── 3d_parallelism_2.svg
│ │ ├── data_parallelism.svg
│ │ ├── pipeline_parallelism_1.svg
│ │ ├── pipeline_parallelism_2.svg
│ │ ├── tensor_parallelism.svg
│ │ └── transformer_arch.svg
├── index.rst
├── references.bib
├── slides
│ ├── 2025_05_ISPASS_Presentation.pdf
│ └── VLSID25_Tutorial_Slides_imec_CSA.pdf
└── sst.rst
├── external
├── .gitignore
├── INSTALL.rst
├── mvapich2-2.3.7-1.patch
├── mvapich2-2.3.7-1.tar.gz
└── sst
│ └── libRDMA
│ └── Makefile
└── requirements.txt
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy Docs
2 |
3 | on:
4 | push:
5 | branches:
6 | - main # Run workflow on pushes to the main branch
7 |
8 | jobs:
9 | build-deploy:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | # Step 1: Check out the repository
14 | - name: Checkout repository
15 | uses: actions/checkout@v3
16 |
17 | # Step 2: Set up Python
18 | - name: Set up Python
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: '3.10' # Adjust to your desired Python version
22 |
23 | # Step 3: Install dependencies
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install -r requirements.txt
28 |
29 | # Step 4: Build the HTML documentation
30 | - name: Build Sphinx documentation
31 | run: |
32 | make html
33 |
34 | # Step 5: Deploy to GitHub Pages (docs branch)
35 | - name: Deploy to GitHub Pages
36 | uses: peaceiris/actions-gh-pages@v4
37 | with:
38 | github_token: ${{ secrets.GITHUB_TOKEN }}
39 | publish_dir: build/html
40 | publish_branch: docs # The branch where GitHub Pages will serve from
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | build
35 | imec_tut_2025
36 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/sst/sst-elements"]
2 | path = external/sst/sst-elements
3 | url = https://github.com/sstsimulator/sst-elements.git
4 | branch = v14.1.0_Final
5 | [submodule "external/sst/sst-core"]
6 | path = external/sst/sst-core
7 | url = https://github.com/sstsimulator/sst-core.git
8 | branch = v14.1.0_Final
9 | [submodule "external/riscv-gnu-toolchain"]
10 | path = external/riscv-gnu-toolchain
11 | url = https://github.com/riscv-collab/riscv-gnu-toolchain.git
12 | branch = 2024.11.22
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 imec, Belgium
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
7 | # Minimal makefile for Sphinx documentation
8 | #
9 |
10 | # You can set these variables from the command line, and also
11 | # from the environment for the first two.
12 | SPHINXOPTS ?=
13 | SPHINXBUILD ?= sphinx-build
14 | SOURCEDIR = docs
15 | BUILDDIR = build
16 |
17 | # Put it first so that "make" without argument is like "make help".
18 | help:
19 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
21 | .PHONY: help Makefile
22 |
23 | # Catch-all target: route all unknown targets to Sphinx using the new
24 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
25 | %: Makefile
26 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A tutorial on Scalable System Simulations
2 | *Looking at RISC-V architectures and performance analysis for machine learning workloads*
3 |
4 | Dive into the world of system-level simulations!
5 | - Explore RISC-V modelling and workload representation using gem5+MLIR.
6 | - Learn to scale your system simulations effortlessly with the power of SST.
7 |
8 | This tutorial bridges cutting-edge open-source tools and techniques to empower your hardware-software co-design journey. This tutorial has been presented
9 | at [International Conference on VLSI Design 2025](https://vlsid.org/).
10 |
11 | If you use this repository, please cite it as follows:
12 |
13 | ```bibtex
14 | @misc{sim-learning-tutorial,
15 | author = {Erwan Lenormand and Tommaso Marinelli and Debjyoti Bhattacharjee},
16 | title = {A tutorial on Scalable System Simulations},
17 | year = {2025},
18 | version = {v1.0.0},
19 | howpublished = {Presented at the International Conference on VLSI Design 2025},
20 | note = {\url{https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/} Accessed: 2025-01-02}
21 | }
22 | ```
23 |
24 | ### Getting Started
25 | Follow the documentation [online](https://csa-infra.github.io/RISCV-Scalable-Simulation-tutorial/index.html) or build the documentation yourself.
26 | Minimum required version of Python is 3.10. The tutorial has been tested on Linux-based systems.
27 |
28 | ```
29 | python3 -m venv imec_tut_2025
30 | source imec_tut_2025/bin/activate
31 | pip3 install -r requirements.txt
32 | make html
33 | ```
34 | ### Need help?
35 | If you need help or clarification regarding any part of the tutorial, file an [issue](https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/issues/new) in the repository.
36 |
--------------------------------------------------------------------------------
/demo/gem5/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | !README.md
4 | !convert_onnx_model.py
5 | !docker/
6 | !docker/*
7 | !images
8 | !iree/
9 | !iree/Makefile
10 | !iree/toolchain.generic.cmake
11 | !vlsid-riscv-fs.py
12 |
--------------------------------------------------------------------------------
/demo/gem5/README.rst:
--------------------------------------------------------------------------------
1 | Application-oriented system modeling and optimization
2 | *****************************************************
3 |
4 | *i.e. how to lower an AI/ML model to simulated RISC-V hardware for system-level
5 | exploration*
6 |
7 | The goal of this tutorial is to introduce the attendees to architectural
8 | simulation targeting machine learning workloads. The main tool we will be
9 | using to model a sample RISC-V system and run applications on top is
10 | \ `gem5 <https://www.gem5.org>`__\ . The ML benchmarks are derived from
11 | ONNX files, translated into machine-optimized code and executed through a
12 | lightweight runtime. This process is carried out with the help of the
13 | \ `IREE <https://iree.dev>`__\ workflow.
14 |
15 | Prerequisites
16 | -------------
17 |
18 | - A Linux-based x86-64 system (native or WSL2/VM)
19 | - Docker or Podman
20 |
21 | Containerized environment
22 | ~~~~~~~~~~~~~~~~~~~~~~~~~
23 |
24 | .. note::
25 | The container is executed in privileged mode to
26 | allow mounting the disk image as a loop device. If you don’t like this,
27 | remove the corresponding option from ``docker-compose.yaml``.
28 |
29 | Dealing with all the software dependencies that this setup needs can be
30 | complicated. For this reason, a container file has been provided, which
31 | allows to generate a virtual environment with all the dependencies
32 | installed. Assuming that Docker is present in your system, you can prepare
33 | the environment this way:
34 |
35 | ::
36 |
37 | git clone https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial.git vlsid-csa-tutorial
38 | cd vlsid-csa-tutorial/demo/gem5/docker
39 | docker compose up -d
40 |
41 | If it doesn’t work, try ``docker-compose`` instead.
42 |
43 | To enter the container:
44 |
45 | ::
46 |
47 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash
48 |
49 | If you stop the container (e.g. reboot), you can easily return back to
50 | it with:
51 |
52 | ::
53 |
54 | docker start docker_vlsid-iree-gem5_1
55 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash
56 |
57 | Finally, if you want to destroy the container, you can do it with:
58 |
59 | ::
60 |
61 | cd vlsid-csa-tutorial/demo/gem5/docker
62 | docker compose down
63 |
64 | The working directory inside the container is ``/opt/vlsid-iree-gem5``.
65 | We will assume that every command is executed from that folder.
66 |
67 | Environment Setup
68 | -----------------
69 |
70 | Part 1: Prepare benchmark
71 | ~~~~~~~~~~~~~~~~~~~~~~~~~
72 |
73 | The IREE workflow is used to first convert a ML model to a supported
74 | intermediate representation, then compile and optimize the model for a
75 | target architecture. The output of the process is a Virtual Machine
76 | FlatBuffer (VMFB) file that can be run by the IREE runtime.
77 |
78 | A simple MNIST image classification model will be used as example, but
79 | the process is generalizable to other models too. The file format for the
80 | model is ONNX. Note that IREE also supports other formats (e.g. TF/TFLite);
81 | it is possible to convert them to MLIR using the right importers.
82 |
83 | .. figure:: images/gem5/mnist-8.svg
84 | :align: center
85 |
86 | Visual representation of the MNIST model
87 |
88 | - Download ONNX model
89 |
90 | ::
91 |
92 | wget https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mnist/model/mnist-8.onnx -O mnist-8-orig.onnx
93 |
94 | - `Upgrade ONNX
95 | opset <https://iree.dev/guides/ml-frameworks/onnx/#troubleshooting>`__
96 |
97 | ::
98 |
99 | ./convert_onnx_model.py mnist-8-orig.onnx mnist-8.onnx
100 |
101 | - Use IREE to convert ONNX file to MLIR Torch ONNX dialect
102 |
103 | ::
104 |
105 | iree-import-onnx mnist-8.onnx > mnist-8.mlir
106 |
107 | - Compile MLIR model to VMFB
108 |
109 | ::
110 |
111 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c mnist-8.mlir -o mnist-8.vmfb
112 |
113 | Part 2: Compile IREE run module
114 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
115 |
116 | The IREE run module allows the execution of a compiled module using the
117 | IREE runtime. This module has to be added to the final disk image
118 | together with the benchmarks, since we don’t want to pull the entire
119 | IREE distribution.
120 |
121 | Even if pre-built binaries are available, as of now they are not
122 | compiled for any RISC-V architecture. Thus, we will have to compile this
123 | module from source. A Makefile has been provided to simplify the
124 | process.
125 |
126 | ::
127 |
128 | make -C iree
129 |
130 | Part 3: Compile m5 utility
131 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
132 |
133 | The m5 utility is used to send pseudo-instructions to the simulator.
134 | This allows a number of operations, like checkpointing, resetting
135 | statistics, etc. We want to include this utility in our final image.
136 | Note that you will need the cross-compiler employed in the previous step to
137 | generate the binary.
138 |
139 | - Get the gem5 simulator
140 |
141 | ::
142 |
143 | git clone https://github.com/gem5/gem5.git -b v24.1
144 |
145 | - Compile the m5 utility
146 |
147 | ::
148 |
149 | export PATH=$PATH:$(realpath toolchain-riscv64/bin)
150 | scons riscv.CROSS_COMPILE=riscv64-buildroot-linux-musl- -C gem5/util/m5 build/riscv/out/m5
151 |
152 | Part 4: Prepare RISC-V disk image
153 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
154 |
155 | .. warning::
156 | If using Podman or rootless Docker, these steps must be done
157 | outside the container, as they typically require sudo permissions.
158 | Pay attention when executing each command!
159 |
160 | The last part of the setup consists in packing the benchmarks and IREE
161 | runtime into a disk image. For this task, we will use a pre-built
162 | minimal image from the gem5 community and modify it.
163 |
164 | - Get and extract `base
165 | image <https://storage.googleapis.com/dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz>`__
166 |
167 | ::
168 |
169 | wget https://storage.googleapis.com/dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz
170 | gzip -d riscv-disk.img.gz
171 | cp riscv-disk.img vlsid-disk.img
172 |
173 | - Mount image (execute with sudo if outside the container)
174 |
175 | ::
176 |
177 | mkdir /tmp/rootfs
178 | mount vlsid-disk.img /tmp/rootfs
179 |
180 | - Copy benchmark (execute with sudo if outside the container)
181 |
182 | ::
183 |
184 | cp mnist-8.vmfb /tmp/rootfs/root/
185 |
186 | - Copy IREE run module (execute with sudo if outside the container)
187 |
188 | ::
189 |
190 | cp iree/iree-build-riscv64/install/bin/iree-run-module /tmp/rootfs/bin/
191 |
192 | - Copy m5 utility (execute with sudo if outside the container)
193 |
194 | ::
195 |
196 | cp gem5/util/m5/build/riscv/out/m5 /tmp/rootfs/sbin/
197 |
198 | - Unmount image (execute with sudo if outside the container)
199 |
200 | ::
201 |
202 | umount /tmp/rootfs
203 |
204 | Machine Learning Workload Execution
205 | -----------------------------------
206 |
207 | At this point, we are ready to run the experiment. A gem5 configuration
208 | file is present in this directory, which is derived from the
209 | ``riscv-fs.py`` sample script of gem5. The main difference is that
210 | instead of using the default disk image it will pick the one that we
211 | have just generated.
212 |
213 | The script defines a simple RISC-V system comprising a processor, a two-level
214 | cache hierarchy, a main memory and a generic board with some basic devices
215 | (UART controller, RNG, disk interface, etc.). An auto-generated diagram of the
216 | simulated system is presented below. You may need to zoom in to find out about
217 | all the individual components and connections.
218 |
219 | .. figure:: images/gem5/gem5-system.svg
220 | :align: center
221 |
222 | Composition of the simulated system
223 |
224 | - Compile gem5
225 |
226 | .. note::
227 | This step will take a while.
228 |
229 | ::
230 |
231 | scons build/RISCV/gem5.opt -C gem5 -j$(nproc)
232 |
233 | - Compile m5term
234 |
235 | ::
236 |
237 | make -C gem5/util/term
238 |
239 | - Run the script
240 |
241 | .. note::
242 | This step will take a while. We will speed up following
243 | executions through checkpointing.
244 |
245 | ::
246 |
247 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py
248 |
249 | While the simulation is running, its output is not immediately visible,
250 | as it is redirected to a separate console. To view it, open another
251 | terminal and use the m5term utility.
252 |
253 | ::
254 |
255 | ./gem5/util/term/m5term 3456
256 |
257 | The boot process is going to take several minutes. After that, you will
258 | see a login shell. Enter user “root” and password “root” to proceed.
259 | After login, you can launch your IREE benchmark. This is the command to
260 | execute for MNIST:
261 |
262 | ::
263 |
264 | iree-run-module --module=/root/mnist-8.vmfb --device=local-task --input="1x1x28x28xf32=0"
265 |
266 | For simplicity we are assuming an input tensor filled with zeros. You
267 | should see this output after some time:
268 |
269 | ::
270 |
271 | EXEC @CNTKGraph
272 | result[0]: hal.buffer_view
273 | 1x10xf32=[-0.044856 0.00779166 0.0681008 0.0299937 -0.12641 0.140219 -0.0552849 -0.0493838 0.0843221 -0.0545404]
274 |
275 | Congratulations! You are ready to go!
276 |
277 | Extra: Checkpoints
278 | ------------------
279 |
280 | You will have noticed that booting the Linux kernel and reaching the
281 | login shell takes several minutes, even with a minimal image like the
282 | one we are using. We want to avoid waiting so long for each one of the
283 | experiments. One of the commonly used techniques to deal with these
284 | situations is checkpointing: we can “take a picture” of the system at a
285 | certain moment of time and start other simulations from that point.
286 | Technically speaking, this requires saving the main memory content and
287 | the processors context. Cache content is not saved, but since we will
288 | execute our benchmarks from scratch this is not a big deal.
289 |
290 | In order to dump a checkpoint, after entering the shell in the simulated
291 | environment type this command:
292 |
293 | ::
294 |
295 | m5 checkpoint
296 |
297 | After terminating the simulation, you will see that in the output folder
298 | (e.g. ``m5out``) a folder named ``cpt.<tick>`` has appeared. This
299 | contains the checkpoint we have just dumped. We strongly suggest to move
300 | this folder outside the ``m5out`` directory.
301 |
302 | ::
303 |
304 | mv m5out/cpt.<tick> checkpoint
305 |
306 | From now on, it will be possible to execute a simulation starting from
307 | this checkpoint. It is sufficient to add an argument to the gem5
308 | command, specifying the position of the folder containing the checkpoint
309 | files:
310 |
311 | ::
312 |
313 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py --restore-from checkpoint
314 |
315 | This way, you will be immediately dropped to the shell. Huge
316 | improvement!
317 |
318 | Experimental Studies
319 | --------------------
320 |
321 | Now that you are able to run complete simulations, it is time to explore
322 | a few knobs and analyze their impact on the system performance.
323 |
324 | Part 1: Change CPU model
325 | ~~~~~~~~~~~~~~~~~~~~~~~~
326 |
327 | The gem5 simulator supports different `CPU
328 | models <https://www.gem5.org/documentation/general_docs/cpu_models/>`__.
329 | By default, the script runs with an *atomic* CPU, which implies atomic
330 | accesses to the memory system with fixed latencies. This model is fast
331 | and simple, but inaccurate.
332 |
333 | The first task is to replace the CPU type with a more detailed one.
334 | There are three possible choices:
335 |
336 | - **TimingSimpleCPU:** simple timing CPU, 1-stage pipeline
337 | - **MinorCPU:** in-order CPU, 4-stages pipeline
338 | - **O3CPU:** out-of-order CPU, 7-stages pipeline
339 |
340 | These CPU models are highly configurable, but for this experiment it is
341 | fine to stick with the default parameters set.
342 |
343 | To implement such change, open the ``vlsid-riscv-fs.py`` script and
344 | change ``CPUTypes.ATOMIC`` (line 78) to ``CPUTypes.TIMING``,
345 | ``CPUTypes.MINOR`` and ``CPUTypes.O3``. After each execution, have a
346 | look at the ``stats.txt`` file in the output folder (default:
347 | ``m5out``). In particular, look at how these statistics change:
348 |
349 | ::
350 |
351 | simSeconds -> Simulated system execution time
352 | hostSeconds -> Host system simulation time
353 | board.processor.cores.core.ipc -> IPC of simulated CPU
354 | board.memory.mem_ctrl.dram.bwTotal::total -> DRAM memory bandwidth
355 |
356 | **Tip 1:** Wrap your benchmark execution around the commands “m5
357 | resetstats” and “m5 exit”, to make sure that the statistics only reflect
358 | the benchmark execution and not the system boot or idle time. E.g.:
359 |
360 | ::
361 |
362 | m5 resetstats && iree-run-module [...] && m5 exit
363 |
364 | **Tip 2:** You can specify different output folders for each experiment.
365 | E.g.:
366 |
367 | ::
368 |
369 | gem5.opt -d ./experiment1 vlsid-riscv-fs.py
370 |
371 | Part 2: Change cache hierarchy
372 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
373 |
374 | The cache configuration can have a significant impact on the system
375 | performance, depending on the data locality and access patterns of the
376 | executed applications. This is one of the knobs we can easily change in
377 | the ``vlsid-riscv-fs.py`` configuration file (line 70).
378 |
379 | The second task consists in performing the experiments after applying
380 | the following modifications (one by one):
381 |
382 | - Decrease L1I (instruction cache) and L1D (data cache) size from 32 kB
383 | to 8 kB
384 | - Increase L2 (last-level cache) size from 512 kB to 2 MB
385 |
386 | Use MinorCPU or O3CPU. Compare the output statistic with the baseline
387 | configuration, to check if there is a change in performance and how
388 | appreciable that is. You can also have a look at cache-specific metrics,
389 | e.g. the miss rates:
390 |
391 | ::
392 |
393 | board.cache_hierarchy.l1d-cache-0.overallMissRate::total
394 | board.cache_hierarchy.l1i-cache-0.overallMissRate::total
395 | board.cache_hierarchy.l2-cache-0.overallMissRate::total
396 |
397 | Part 3: Vectorization
398 | ~~~~~~~~~~~~~~~~~~~~~
399 |
400 | The RISC-V architecture we are simulating supports the RVV vector
401 | extension v1.0. This means that the IREE compiler can optimize the
402 | application by enabling SIMD support. The default VLEN for the simulated
403 | hardware is of 256 bits.
404 |
405 | For this step, we will need to recompile the benchmark and add it to the
406 | disk image. The following command will create an RVV-enabled benchmark:
407 |
408 | ::
409 |
410 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c,+v,+zvl256b -riscv-v-vector-bits-min=256 -riscv-v-fixed-length-vector-lmul-max=8 mnist-8.mlir -o mnist-8-v.vmfb
412 |
413 | Execute this new version of the benchmark and compare the output with
414 | the non-vectorized version. You should notice an improvement of the
415 | performance.
416 |
417 | **Note:** Like other microarchitectural parameters, the latencies of the
418 | vector units are not calibrated on any specific design, and default
419 | values are used. Do not expect fully realistic numbers.
420 |
421 | Part 4: New benchmarks
422 | ~~~~~~~~~~~~~~~~~~~~~~
423 |
424 | .. warning::
425 | The execution time can be much higher for more complex
426 | benchmarks, even in atomic mode. We suggest you to try out these
427 | tests after the tutorial, keeping the simulations as background tasks
428 | until they complete.
429 |
430 | Now that you know how to run the full workflow, you can try out new
431 | benchmarks. Bear in mind that not all the models are supported with the
432 | current version of IREE, and compatibility issues may arise when
433 | compiling. We will provide you with a few examples that are guaranteed
434 | to succeed.
435 |
436 | ::
437 |
438 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mobilenet/model/mobilenetv2-10.onnx
439 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx
440 |
441 | The launch commands for these models are:
442 |
443 | ::
444 |
445 | iree-run-module --module=/root/mobilenetv2-10.vmfb --device=local-task --input="1x1x672x672xf32=0"
446 | iree-run-module --module=/root/super-resolution-10.vmfb --device=local-task --input="1x1x224x224xf32=0"
447 |
448 | **Tip:** If you want to store multiple models in your image, or models
449 | that exceed the image capacity, you may run out of space. You can resize
450 | the image to a bigger size (e.g. 150 MB) with the following commands:
451 |
452 | ::
453 |
454 | e2fsck -f vlsid-disk.img
455 | resize2fs vlsid-disk.img 150M
456 |
--------------------------------------------------------------------------------
/demo/gem5/convert_onnx_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python3
2 | # Script derived and adapted from this source:
3 | # https://iree.dev/guides/ml-frameworks/onnx/#troubleshooting
4 |
5 | import argparse
6 | import onnx
7 |
8 | parser = argparse.ArgumentParser("ONNX Version Converter")
9 | parser.add_argument("input", type=str, help="Input ONNX file")
10 | parser.add_argument("output", type=str, help="Output ONNX file")
11 | args = parser.parse_args()
12 |
13 | original_model = onnx.load_model(args.input)
14 | converted_model = onnx.version_converter.convert_version(original_model, 17)
15 | onnx.save(converted_model, args.output)
16 |
--------------------------------------------------------------------------------
/demo/gem5/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 The Regents of the University of California
2 | # Copyright (c) 2024 imec v.z.w.
3 | # All Rights Reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met: redistributions of source code must retain the above copyright
8 | # notice, this list of conditions and the following disclaimer;
9 | # redistributions in binary form must reproduce the above copyright
10 | # notice, this list of conditions and the following disclaimer in the
11 | # documentation and/or other materials provided with the distribution;
12 | # neither the name of the copyright holders nor the names of its
13 | # contributors may be used to endorse or promote products derived from
14 | # this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | FROM --platform=${BUILDPLATFORM} ubuntu:24.04
29 |
30 | ENV DEBIAN_FRONTEND=noninteractive
31 | RUN apt -y update && apt -y upgrade && apt -y install \
32 | build-essential \
33 | scons \
34 | python3-dev \
35 | git \
36 | pre-commit \
37 | zlib1g \
38 | zlib1g-dev \
39 | libprotobuf-dev \
40 | protobuf-compiler \
41 | libprotoc-dev \
42 | libgoogle-perftools-dev \
43 | libboost-all-dev \
44 | libhdf5-serial-dev \
45 | python3-pip \
46 | python3-pydot \
47 | python3-venv \
48 | python3-tk \
49 | mypy \
50 | m4 \
51 | libcapstone-dev \
52 | libpng-dev \
53 | libelf-dev \
54 | pkg-config \
55 | wget \
56 | cmake \
57 | doxygen
58 |
59 | RUN python3 -m pip install --break-system-packages \
60 | tensorflow \
61 | iree-base-runtime==3.4.0 \
62 | iree-base-compiler==3.4.0 \
63 | matplotlib \
64 | onnx \
65 | pandas-stubs
66 |
67 | # pre-commit, as installed via apt in 24.04, attempts to create a cache
68 | # directory at "${HOME}/.cache/pre-commit". If running docker with non-root,
69 | # the HOME directory is set to "/". Since non-root users do not have permission
70 | # to write to this directory, an error is returned when pre-commit is executed.
71 | # pre-commit's default cache directory can be changed via the `XDG_CACHE_HOME`
72 | # enivoronment variable. Here we set it to "/tmp". With this pre-commit will
73 | # create a "/tmp/pre-commit" directory to use for caching. "/tmp" was chosen
74 | # as it's a directory any user can access and write to. Given this only stores
75 | # caching information, the "/tmp" directory being wiped is not a concern.
76 | ENV XDG_CACHE_HOME=/tmp/
77 |
--------------------------------------------------------------------------------
/demo/gem5/docker/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 imec v.z.w.
2 | # All Rights Reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are
6 | # met: redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer;
8 | # redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution;
11 | # neither the name of the copyright holders nor the names of its
12 | # contributors may be used to endorse or promote products derived from
13 | # this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | services:
28 | vlsid-iree-gem5:
29 | privileged: true
30 | stdin_open: true
31 | tty: true
32 | build:
33 | context: .
34 | dockerfile: Dockerfile
35 | volumes:
36 | - ..:/opt/vlsid-iree-gem5
37 |
--------------------------------------------------------------------------------
/demo/gem5/iree/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 imec v.z.w.
2 | # All Rights Reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are
6 | # met: redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer;
8 | # redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution;
11 | # neither the name of the copyright holders nor the names of its
12 | # contributors may be used to endorse or promote products derived from
13 | # this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | IREE_GIT_TAG=v3.4.0
28 | TOOLCHAIN_ROOT=..
29 | TOOLCHAIN_URL=https://toolchains.bootlin.com/downloads/releases/toolchains/riscv64-lp64d/tarballs/riscv64-lp64d--musl--stable-2024.05-1.tar.xz
30 |
31 | IREE_BUILD_OPTS=\
32 | -DIREE_BUILD_COMPILER=OFF \
33 | -DIREE_BUILD_TESTS=OFF \
34 | -DIREE_BUILD_SAMPLES=OFF \
35 | -DIREE_BUILD_DOCS=OFF \
36 | -DIREE_HAL_DRIVER_VULKAN=OFF
37 |
38 | toolchan_root_abspath=$(shell realpath $(TOOLCHAIN_ROOT))
39 | toolchain_path=$(toolchan_root_abspath)/toolchain-riscv64
40 | sysroot_path=$(toolchain_path)/riscv64-buildroot-linux-musl/sysroot
41 |
42 | default: iree-build-riscv64
43 |
44 | iree-dist:
45 | git clone --depth 1 --branch $(IREE_GIT_TAG) https://github.com/iree-org/iree.git iree-dist
46 | cd iree-dist && git submodule update --init \
47 | third_party/benchmark \
48 | third_party/cpuinfo \
49 | third_party/flatcc \
50 | third_party/googletest \
51 | third_party/hip-build-deps \
52 | third_party/musl \
53 | third_party/spirv_cross \
54 | third_party/tracy \
55 | third_party/vulkan_headers \
56 | third_party/webgpu-headers \
57 | third_party/hsa-runtime-headers
58 |
59 | iree-build-native: iree-dist
60 | cd $< && cmake -G "Unix Makefiles" \
61 | $(IREE_BUILD_OPTS) \
62 | -DCMAKE_INSTALL_PREFIX=../$@/install \
63 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \
64 | . -B ../$@
65 | cd $< && cmake --build ../$@ --target install
66 |
67 | orig_dir=$(patsubst %.tar.gz,%,$(patsubst %.tar.bz2,%,$(patsubst %.tar.xz,%,$(lastword $(subst /, ,$(TOOLCHAIN_URL))))))
68 |
69 | .PHONY: toolchain-riscv64
70 | toolchain-riscv64: $(toolchain_path)
71 | $(toolchain_path):
72 | wget $(TOOLCHAIN_URL) -O $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2
73 | tar -xvf $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2 -C $(TOOLCHAIN_ROOT)
74 | mv $(TOOLCHAIN_ROOT)/$(orig_dir) $(toolchain_path)
75 | $(toolchain_path)/relocate-sdk.sh
76 | rm $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2
77 |
78 | iree-build-riscv64: iree-dist iree-build-native $(toolchain_path)
79 | cd $< && cmake -G "Unix Makefiles" \
80 | $(IREE_BUILD_OPTS) \
81 | -DIREE_ENABLE_CPUINFO=OFF \
82 | -DCMAKE_INSTALL_PREFIX=../$@/install \
83 | -DIREE_HOST_BIN_DIR=../iree-build-native/install/bin \
84 | -DCMAKE_TOOLCHAIN_FILE="../toolchain.generic.cmake" \
85 | -DTOOLCHAIN_TARGET=riscv64 \
86 | -DTOOLCHAIN_PATH=$(toolchain_path) \
87 | -DTOOLCHAIN_PREFIX=riscv64-buildroot-linux-musl- \
88 | . -B ../$@
89 | cd $< && cmake --build ../$@ --target install
90 |
91 | .PHONY: clean
92 | clean:
93 | rm -rf iree-dist iree-build-*
94 |
95 | .PHONY: distclean
96 | distclean: clean
97 | rm -rf $(toolchain_path)
98 |
--------------------------------------------------------------------------------
/demo/gem5/iree/toolchain.generic.cmake:
--------------------------------------------------------------------------------
1 | # Script derived and adapted from this source:
2 | # https://kubasejdak.com/how-to-cross-compile-for-embedded-with-cmake-like-a-champ
3 |
4 | set(CMAKE_SYSTEM_NAME Generic)
5 | set(CMAKE_SYSTEM_PROCESSOR ${TOOLCHAIN_TARGET})
6 |
7 | # Without that flag CMake is not able to pass test compilation check
8 | set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
9 |
10 | set(CMAKE_AR ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ar${CMAKE_EXECUTABLE_SUFFIX})
11 | set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX})
12 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX})
13 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}g++${CMAKE_EXECUTABLE_SUFFIX})
14 | set(CMAKE_LINKER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ld${CMAKE_EXECUTABLE_SUFFIX})
15 | set(CMAKE_OBJCOPY ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}objcopy${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
16 | set(CMAKE_RANLIB ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ranlib${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
17 | set(CMAKE_SIZE ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}size${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
18 | set(CMAKE_STRIP ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}strip${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
19 |
20 | set(CMAKE_C_FLAGS "-static -Wno-psabi -fdata-sections -ffunction-sections -Wl,--gc-sections" CACHE INTERNAL "")
21 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -fno-exceptions" CACHE INTERNAL "")
22 |
23 | set(CMAKE_C_FLAGS_DEBUG "-Os -g" CACHE INTERNAL "")
24 | set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG" CACHE INTERNAL "")
25 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" CACHE INTERNAL "")
26 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}" CACHE INTERNAL "")
27 |
28 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
29 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
30 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
31 |
--------------------------------------------------------------------------------
/demo/gem5/vlsid-riscv-fs.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 The Regents of the University of California
2 | # Copyright (c) 2024 imec v.z.w.
3 | # All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met: redistributions of source code must retain the above copyright
8 | # notice, this list of conditions and the following disclaimer;
9 | # redistributions in binary form must reproduce the above copyright
10 | # notice, this list of conditions and the following disclaimer in the
11 | # documentation and/or other materials provided with the distribution;
12 | # neither the name of the copyright holders nor the names of its
13 | # contributors may be used to endorse or promote products derived from
14 | # this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | """
29 | This example runs a simple linux boot. It uses the 'riscv-disk-img' resource.
30 | It is built with the sources in `src/riscv-fs` in [gem5 resources](
31 | https://github.com/gem5/gem5-resources).
32 |
33 | Characteristics
34 | ---------------
35 |
36 | * Runs exclusively on the RISC-V ISA with the classic caches
37 | * Assumes that the kernel is compiled into the bootloader
38 | * Automatically generates the DTB file
39 | * Will boot but requires a user to login using `m5term` (username: `root`,
40 | password: `root`)
41 | """
42 |
43 | import argparse
44 | from pathlib import Path
45 |
46 | from gem5.components.boards.riscv_board import RiscvBoard
47 | from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
48 | PrivateL1PrivateL2WalkCacheHierarchy,
49 | )
50 | from gem5.components.memory import SingleChannelDDR3_1600
51 | from gem5.components.processors.cpu_types import CPUTypes
52 | from gem5.components.processors.simple_processor import SimpleProcessor
53 | from gem5.isas import ISA
54 | from gem5.resources.resource import DiskImageResource, obtain_resource
55 | from gem5.simulate.simulator import Simulator
56 | from gem5.utils.requires import requires
57 |
58 | # Run a check to ensure the right version of gem5 is being used.
59 | requires(isa_required=ISA.RISCV)
60 |
61 | # Instantiate argument parser
62 | parser = argparse.ArgumentParser()
63 | parser.add_argument("--restore-from", type=Path, help="Checkpoint directory")
64 | parser.add_argument(
65 | "--disk-image",
66 | type=Path,
67 | help="Disk image path",
68 | default="vlsid-disk.img"
69 | )
70 | args = parser.parse_args()
71 |
72 | # Setup the cache hierarchy.
73 | # For classic, PrivateL1PrivateL2 and NoCache have been tested.
74 | # For Ruby, MESI_Two_Level and MI_example have been tested.
75 | cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
76 | l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
77 | )
78 |
79 | # Setup the system memory.
80 | memory = SingleChannelDDR3_1600()
81 |
82 | # Setup a single core Processor.
83 | processor = SimpleProcessor(
84 | cpu_type=CPUTypes.ATOMIC, isa=ISA.RISCV, num_cores=1
85 | )
86 |
87 | # Setup the board.
88 | board = RiscvBoard(
89 | clk_freq="1GHz",
90 | processor=processor,
91 | memory=memory,
92 | cache_hierarchy=cache_hierarchy,
93 | )
94 |
95 | # Set the Full System workload.
96 | board.set_kernel_disk_workload(
97 | kernel=obtain_resource("riscv-linux-6.6.33-kernel",
98 | resource_version="1.0.0"),
99 | bootloader=obtain_resource("riscv-bootloader-opensbi-1.3.1",
100 | resource_version="1.0.0"),
101 | disk_image=DiskImageResource(args.disk_image),
102 | checkpoint=args.restore_from,
103 | )
104 |
105 | simulator = Simulator(board=board)
106 | print("Beginning simulation!")
107 | # Note: This simulation will never stop. You can access the terminal upon boot
108 | # using m5term (`./util/term`): `./m5term localhost <port>`. Note the `<port>`
109 | # value is obtained from the gem5 terminal stdout. Look out for
110 | # "system.platform.terminal: Listening for connections on port <port>".
111 | simulator.run()
112 |
--------------------------------------------------------------------------------
/demo/sst/README.rst:
--------------------------------------------------------------------------------
1 | Scale-out system simulation with SST
2 | ************************************
3 |
4 | **How to perform a scale-out system simulation with instruction-level simulation and packet-level simulation?**
5 | The goal of the second part of this tutorial is to introduce the Structural Simulation
6 | Toolkit (SST) framework, which makes it possible to simulate a scale-out system.
7 |
8 | Instruction-level simulation
9 | ============================
10 |
11 | Environment Setup
12 | -----------------
13 |
14 | To run the SST experiments you need to install SST. Please refer to `Installation instructions`_.
15 |
16 |
17 | System under exploration
18 | ------------------------
19 | .. _cpu figure:
20 |
21 | .. figure:: images/sst/cpu.svg
22 | :width: 400
23 | :align: center
24 |
25 | Microarchitecture of a cpu core.
26 |
27 |
28 | The system under exploration is made up of multi-threaded RISC-V CPU cores. As illustrated
29 | in Figure :numref:`cpu figure`, a CPU core is attached to an L1 data cache and an L1
30 | instruction cache. The two caches are interconnected to a second level of cache (L2 cache)
31 | with a memory bus. The core itself is composed of one decoder for each thread, one branch
32 | unit and one dispatch unit, one register file for floating point numbers and another one
33 | for integers, a load store unit (or load store queue), multiple ALU and multiple FPU. The
34 | core is attached to each cache through a TLB and a memory interface. TLBs are managed by
35 | the operating system.
36 |
37 |
38 | .. _node figure:
39 |
40 | .. figure:: images/sst/node.svg
41 | :width: 600
42 | :align: center
43 |
44 | Microarchitecture of a compute node.
45 |
46 | As shown in Figure :numref:`node figure`, the RISC-V cores are integrated into a compute
47 | node. The number of cores per node is configurable from the script. The set of L2 caches
48 | are federated with a directory which maintains coherency in the node. The L2s and the
49 | directory are interconnected through a NoC. The directory is attached to a DRAM
50 | controller. In addition, a node integrates a component that emulates an operating system.
51 | The latter manages the virtual memory and is attached to every CPU core to provide the
52 | minimal service required to run applications.
53 |
54 | .. _system figure:
55 |
56 | .. figure:: images/sst/system.svg
57 | :width: 800
58 | :align: center
59 | :alt: Scale-out system microarchitecture
60 |
61 | Microarchitecture of a multi-node system.
62 |
63 | Multiple nodes can be interconnected with a network to build a scale-out system, as illustrated
64 | in Figure :numref:`system figure`. Each node has an independent operating system and a
65 | private memory space. To allow communication between nodes, we can use the
66 | Message Passing Interface (MPI). To do that, each node additionally integrates a NIC. The
67 | latter is interconnected to the NoC.
68 |
69 | The inter-node network is built with pymerlin (a python script provided in SST-elements).
70 | Thanks to that script we can define different topologies easily (e.g., single router, fat
71 | tree, dragonfly, torus, mesh, etc.).
72 |
73 |
74 | Every component or sub-component is configurable; for instance, you can configure the
75 | latency of the ALU or the capacity of each cache. You can find more information on the
76 | parameters and their impact on the simulated system using **sst-info** command.
77 |
78 | .. list-table:: How to find the available parameters
79 | :widths: 25 50
80 | :header-rows: 1
81 |
82 | * - Command
83 | - Description
84 | * - sst-info vanadis
85 | - Parameters of the cpu core and the operating system
86 | * - sst-info mmu
87 | - Parameters of the TLB and MMU
88 | * - sst-info memHierarchy
89 | - Parameters of the cache, directory controller, DRAM, memory bus
90 | * - sst-info merlin
91 | - Parameters of the NoC and internode network components
92 | * - sst-info rdmaNic
93 | - Parameters of the NIC
94 |
95 |
96 | Workload under evaluation
97 | -------------------------
98 |
99 |
100 | The workload under evaluation is inspired by a Multi-head attention, one of the
101 | calculation layers of transformers :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
102 |
103 | .. _OMP_MHA figure:
104 |
105 | .. figure:: images/sst/mha.svg
106 | :width: 600
107 | :align: center
108 | :alt: Multi-head attention block
109 |
110 | Illustration of the workload run on a single-node system.
111 |
112 | As shown in Figure :numref:`OMP_MHA figure`, the application multiplies an *Embeddings*
113 | matrix of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements with 3 matrices of
114 | D\ :sub:`model` x D\ :sub:`model` weights, producing 3 matrices of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements,
115 | called Keys, Queries and Values. In fact, the weight matrices are divided into *heads*.
116 | Each head of the Queries matrix is multiplied with the corresponding transposed head of the Keys
117 | matrix, producing *QK* matrix. The latter is then scaled. Then the *softmax* of each row of
118 | the scaled *QK* is computed. Afterward, the result of the *softmax* is multiplied with
119 | Values matrix, producing *QKV* matrix. Finally, *QKV* is summed with the *Embeddings*
120 | matrix.
121 |
122 |
123 | .. _mha_OMP: ../../demo/sst/software/mha_OMP.c
124 |
125 | The corresponding code is implemented in **C** `mha_OMP`_, and is parallelized with **OpenMP**.
126 |
127 |
128 | Matrix-Matrix multiplication is the heaviest workload in this application. To minimize the
129 | data movement, a tiled GEMM is implemented. *TILE_SIZE* macro defines the dimension of the
130 | tiles.
131 |
132 | .. code-block:: C
133 | :linenos:
134 | :emphasize-lines: 1
135 |
136 | const int bsize = TILE_SIZE;
137 | int ii0, ii1, ii2;
138 | int i0, i1, i2;
139 | int h;
140 | int start_head, stop_head;
141 | data_t pp;
142 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
143 | for (h=0; h < heads; h++) {
144 | for (ii0 = 0; ii0 1
145 |
146 |
147 | nodeRtrParams = {
148 | "xbar_bw" : "57.6GB/s",
149 | "link_bw" : "28.8GB/s",
150 | "input_buf_size" : "40KB",
151 | "output_buf_size" : "40KB",
152 | "flit_size" : "72B",
153 | "id" : "0",
154 | "topology" : "merlin.singlerouter"
155 | }
156 |
157 | memCtrlParams = {
158 | "clock" : "1.6GHz",
159 | "backend.mem_size" : physMemSize,
160 | "backing" : "malloc",
161 | "initBacking" : 1,
162 | "addr_range_start" : 0x0,
163 | "addr_range_end" : memsize - 1,
164 | "backendConvertor.request_width" : 16
165 | }
166 |
167 | # DRAM bandwidth = memCtrl.clock * request width * max_requests_per_cycle = 25.6 GB/s
168 | memCtrlParams = {
169 | "clock" : "1.6GHz",
170 | "backend.mem_size" : physMemSize,
171 | "backing" : "malloc",
172 | "initBacking" : 1,
173 | "addr_range_start" : 0x0,
174 | "addr_range_end" : memsize - 1,
175 | "backendConvertor.request_width" : 16
176 | }
177 |
178 | memBackendParams = {
179 | "mem_size" : physMemSize,
180 | "access_time" : "20ns",
181 | "max_requests_per_cycle" : 1,
182 | "request_width" : 16
183 | }
184 |
185 | memNICParams = {
186 | "min_packet_size" : "72B",
187 | "network_bw" : "28.8GB/s",
188 | "network_input_buffer_size" : "4KiB",
189 | "network_output_buffer_size" : "4KiB"
190 | }
191 |
192 | # OS related params
193 | osParams = {
194 | "dbgLevel" : os_verbosity,
195 | "dbgMask" : 16,
196 | "cores" : num_cpu_per_node,
197 | "hardwareThreadCount" : num_threads_per_cpu,
198 | "page_size" : page_size,
199 | "physMemSize" : physMemSize,
200 | "useMMU" : True,
201 | }
202 |
203 | osl1cacheParams = {
204 | "access_latency_cycles" : 1,
205 | "cache_frequency" : cpu_clock,
206 | "replacement_policy" : "lru",
207 | "coherence_protocol" : coherence_protocol,
208 | "associativity" : 8,
209 | "cache_line_size" : cache_line_size,
210 | "cache_size" : "32 KiB",
211 | "L1" : "1",
212 | }
213 |
214 | mmuParams = {
215 | "num_cores": num_cpu_per_node,
216 | "num_threads": num_threads_per_cpu,
217 | "page_size": page_size,
218 | "useNicTlb": True,
219 | }
220 |
221 |
222 | vanadis_cpu_type = "vanadis.VanadisCPU"
223 | cpuParams = {
224 | "dbg_mask" : 16,
225 | "verbose" : 0,
226 | "clock" : cpu_clock,
227 | "hardware_threads": num_threads_per_cpu,
228 | "physical_fp_registers" : 168 * num_threads_per_cpu,
229 | "physical_integer_registers" : 180 * num_threads_per_cpu,
230 | "integer_arith_units" : 2,
231 | "integer_arith_cycles" : 2,
232 | "integer_div_units" : 1,
233 | "integer_div_cycles" : 20,
234 | "fp_arith_cycles" : 3,
235 | "fp_arith_units" : 2,
236 | "fp_div_units" : 2,
237 | "fp_div_cycles" : 20,
238 | "branch_units" : 1,
239 | "branch_unit_cycles" : 2,
240 | "reorder_slots" : 128,
241 | "decodes_per_cycle" : 4,
242 | "issues_per_cycle" : 4,
243 | "retires_per_cycle" : 4,
244 | }
245 |
246 | branchPredParams = {
247 | "branch_entries" : 64
248 | }
249 |
250 | decoderParams = {
251 | "loader_mode" : 1,
252 | "uop_cache_entries" : 1536,
253 | "predecode_cache_entries" : 4
254 | }
255 |
256 | lsqParams = {
257 | "max_stores" : 16,
258 | "max_loads" : 32,
259 | }
260 |
261 |
262 |
263 | l1dcacheParams = {
264 | "access_latency_cycles" : 1,
265 | "cache_frequency" : cpu_clock,
266 | "replacement_policy" : "lru",
267 | "coherence_protocol" : coherence_protocol,
268 | "associativity" : 8,
269 | "cache_line_size" : cache_line_size,
270 | "cache_size" : "64 KiB",
271 | "prefetcher" : "cassini.NextBlockPrefetcher",
272 | "prefetcher.reach" : 2,
273 | "L1" : "1",
274 | }
275 |
276 | l1icacheParams = {
277 | "access_latency_cycles" : 1,
278 | "cache_frequency" : cpu_clock,
279 | "replacement_policy" : "lru",
280 | "coherence_protocol" : coherence_protocol,
281 | "associativity" : 8,
282 | "cache_line_size" : cache_line_size,
283 | "cache_size" : "32 KiB",
284 | "prefetcher" : "cassini.NextBlockPrefetcher",
285 | "prefetcher.reach" : 1,
286 | "L1" : "1",
287 | }
288 |
289 | l2cacheParams = {
290 | "access_latency_cycles" : 8,
291 | "max_requests_per_cycle" : 2,
292 | "cache_frequency" : cpu_clock,
293 | "replacement_policy" : "lru",
294 | "coherence_protocol" : coherence_protocol,
295 | "associativity" : 16,
296 | "cache_line_size" : cache_line_size,
297 | "cache_size" : str(l2cache_size) + 'B',
298 | "mshr_latency_cycles": 3,
299 | }
300 |
301 | busParams = {
302 | "bus_frequency" : cpu_clock,
303 | }
304 |
305 | dirCtrlParams = {
306 | "max_requests_per_cycle" : 2,
307 | "coherence_protocol" : coherence_protocol,
308 | "entry_cache_size" : l2cache_size*num_cpu_per_node/cache_line_size,
309 | "cache_line_size" : cache_line_size,
310 | "addr_range_start" : 0x0,
311 | "addr_range_end" : memsize - 1
312 | }
313 |
314 |
315 | rdmaNiCParams = {
316 | "clock" : cpu_clock,
317 | "useDmaCache": "true",
318 | "maxPendingCmds" : rdma_nic_num_posted_recv,
319 | "maxMemReqs" : rdma_nic_comp_q_size,
320 | "maxCmdQSize" : rdma_nic_num_posted_recv,
321 | "cache_line_size" : cache_line_size,
322 | 'baseAddr': memsize,
323 | 'cmdQSize' : 64,
324 | }
325 |
326 |
327 | rdmaCacheParams = {
328 | "access_latency_cycles" : 2,
329 | "max_requests_per_cycle" : 1,
330 | "mshr_num_entries": 64,
331 | "cache_frequency" : cpu_clock,
332 | "replacement_policy" : "lru",
333 | "coherence_protocol" : coherence_protocol,
334 | "associativity" : 8,
335 | "cache_line_size" : cache_line_size,
336 | "cache_size" : "32 KiB",
337 | "L1" : "1",
338 | }
339 |
340 |
341 | app_params = {}
342 | if app_args != "":
343 | app_args_list = app_args.split(" ")
344 | # We have a plus 1 because the executable name is arg0
345 | app_args_count = len( app_args_list ) + 1
346 |
347 | app_params["argc"] = app_args_count
348 |
349 | arg_start = 1
350 | for next_arg in app_args_list:
351 | app_params["arg" + str(arg_start)] = next_arg
352 | arg_start = arg_start + 1
353 | else:
354 | app_params["argc"] = 1
355 |
356 | class CPU_Builder:
357 | def __init__(self):
358 | pass
359 |
360 | def build( self, nodeId, cpuId ):
361 |
362 | prefix = 'node' + str(nodeId) + '.cpu' + str( cpuId )
363 | cpu = sst.Component(prefix, vanadis_cpu_type)
364 | cpu.addParams( cpuParams )
365 | cpu.addParam( "core_id", cpuId )
366 | cpu.addParam( "node_id", nodeId )
367 | if enableStats:
368 | cpu.enableAllStatistics()
369 |
370 | # CPU.decoder
371 | for n in range(num_threads_per_cpu):
372 | decode = cpu.setSubComponent( "decoder"+str(n), "vanadis.VanadisRISCV64Decoder" )
373 | decode.addParams( decoderParams )
374 |
375 | if enableStats:
376 | decode.enableAllStatistics()
377 |
378 | # CPU.decoder.osHandler
379 | os_hdlr = decode.setSubComponent( "os_handler", "vanadis.VanadisRISCV64OSHandler" )
380 |
381 | # CPU.decoder.branch_pred
382 | branch_pred = decode.setSubComponent( "branch_unit", "vanadis.VanadisBasicBranchUnit" )
383 | branch_pred.addParams( branchPredParams )
384 |
385 | if enableStats:
386 | branch_pred.enableAllStatistics()
387 |
388 |
389 | # CPU.lsq
390 | cpu_lsq = cpu.setSubComponent( "lsq", "vanadis.VanadisBasicLoadStoreQueue" )
391 | cpu_lsq.addParams(lsqParams)
392 | if enableStats:
393 | cpu_lsq.enableAllStatistics()
394 |
395 |
396 | icache_if = cpu.setSubComponent( "mem_interface_inst", "memHierarchy.standardInterface" )
397 | icache_if.addParam("coreId",cpuId)
398 |
399 | dcache_if = cpu_lsq.setSubComponent( "memory_interface", "memHierarchy.standardInterface" )
400 | dcache_if.addParam("coreId",cpuId)
401 |
402 | # L1 D-Cache
403 | l1cache = sst.Component(prefix + ".l1dcache", "memHierarchy.Cache")
404 | l1cache.addParams( l1dcacheParams )
405 | if enableStats:
406 | l1cache.enableAllStatistics()
407 |
408 | l1dcache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
409 | l1dcache_2_l2cache = l1cache.setSubComponent("memlink", "memHierarchy.MemLink")
410 |
411 | # L1 I-Cache
412 | l1icache = sst.Component(prefix + ".l1icache", "memHierarchy.Cache")
413 | l1icache.addParams(l1icacheParams)
414 | if enableStats:
415 | l1icache.enableAllStatistics()
416 |
417 | # Bus
418 | cache_bus = sst.Component(prefix + ".bus", "memHierarchy.Bus")
419 | cache_bus.addParams(busParams)
420 | if enableStats:
421 | cache_bus.enableAllStatistics()
422 |
423 | # L2 D-Cache
424 | l2cache = sst.Component(prefix + ".l2cache", "memHierarchy.Cache")
425 | l2cache.addParams(l2cacheParams)
426 | if enableStats:
427 | l2cache.enableAllStatistics()
428 |
429 | l2cache_2_cpu = l2cache.setSubComponent("cpulink", "memHierarchy.MemLink")
430 |
431 | # CPU D-TLB
432 | dtlbWrapper = sst.Component(prefix+".dtlb", "mmu.tlb_wrapper")
433 | dtlb = dtlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
434 | dtlb.addParam("num_hardware_threads", num_threads_per_cpu)
435 | dtlb.addParams(tlbParams)
436 |
437 | # CPU I-TLB
438 | itlbWrapper = sst.Component(prefix+".itlb", "mmu.tlb_wrapper")
439 | itlbWrapper.addParam("exe",True)
440 | itlb = itlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
441 | itlb.addParam("num_hardware_threads", num_threads_per_cpu)
442 | itlb.addParams(tlbParams)
443 |
444 | # CPU (data) -> D-TLB
445 | link = sst.Link(prefix+".link_cpu_dtlb")
446 | link.connect( (dcache_if, "port", "25ps"), (dtlbWrapper, "cpu_if", "25ps") )
447 |
448 | # CPU (instruction) -> I-TLB
449 | link = sst.Link(prefix+".link_cpu_itlb")
450 | link.connect( (icache_if, "port", "25ps"), (itlbWrapper, "cpu_if", "25ps") )
451 |
452 | l1icache_2_cpu = l1icache.setSubComponent("cpulink", "memHierarchy.MemLink")
453 | l1icache_2_l2cache = l1icache.setSubComponent("memlink", "memHierarchy.MemLink")
454 |
455 | # D-TLB -> D-L1
456 | link = sst.Link(prefix+".link_l1cache")
457 | link.connect( (dtlbWrapper, "cache_if", "25ps"), (l1dcache_2_cpu, "port", "25ps") )
458 |
459 | # I-TLB -> I-L1
460 | link = sst.Link(prefix+".link_l1icache")
461 | link.connect( (itlbWrapper, "cache_if", "25ps"), (l1icache_2_cpu, "port", "25ps") )
462 |
463 | # L1 D-Cache to bus
464 | link = sst.Link(prefix + ".link_l1dcache_l2cache")
465 | link.connect( (l1dcache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_0", "1ns") )
466 |
467 | # L1 I-Cache to bus
468 | link = sst.Link(prefix + ".link_l1icache_l2cache")
469 | link.connect( (l1icache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_1", "1ns") )
470 |
471 | # BUS to L2 cache
472 | link = sst.Link(prefix+".link_bus_l2cache")
473 | link.connect( (cache_bus, "low_network_0", "1ns"), (l2cache_2_cpu, "port", "1ns") )
474 |
475 | return cpu, l2cache, dtlb, itlb
476 |
477 |
478 | def addParamsPrefix(prefix,params):
479 | #print( prefix )
480 | ret = {}
481 | for key, value in params.items():
482 | #print( key, value )
483 | ret[ prefix + "." + key] = value
484 |
485 | #print( ret )
486 | return ret
487 |
488 |
489 |
490 | class OS_Builder:
491 | def __init__(self):
492 | pass
493 |
494 | def build( self, numNodes, nodeId):
495 |
496 | self.prefix = 'node' + str(nodeId)
497 |
498 | self.nodeOS = sst.Component(self.prefix + ".os", "vanadis.VanadisNodeOS")
499 | self.nodeOS.addParam("node_id", nodeId)
500 | self.nodeOS.addParams(osParams)
501 | if enableStats:
502 | self.nodeOS.enableAllStatistics()
503 |
504 | processList = (
505 | ( 1, {
506 | "env_count" : 6,
507 | "env0" : "OMP_NUM_THREADS={}".format(num_cpu_per_node*num_threads_per_cpu),
508 | "env1" : "PMI_SIZE={}".format(num_node),
509 | "env2" : "PMI_RANK={}".format(nodeId),
510 | "env3" : "RDMA_NIC_NUM_POSTED_RECV={}".format(rdma_nic_num_posted_recv),
511 | "env4" : "RDMA_NIC_COMP_Q_SIZE={}".format(rdma_nic_comp_q_size),
512 | "env5" : "TZ=UTC",
513 | "exe" : full_exe_name,
514 | "arg0" : exe_name,
515 | } ),
516 | )
517 |
518 | processList[0][1].update(app_params)
519 |
520 | num=0
521 | for i,process in processList:
522 | for y in range(i):
523 | self.nodeOS.addParams( addParamsPrefix( "process" + str(num), process ) )
524 | num+=1
525 |
526 | self.mmu = self.nodeOS.setSubComponent( "mmu", "mmu.simpleMMU" )
527 |
528 | self.mmu.addParams(mmuParams)
529 |
530 | mem_if = self.nodeOS.setSubComponent( "mem_interface", "memHierarchy.standardInterface" )
531 |
532 | l1cache = sst.Component(self.prefix + ".node_os.l1cache", "memHierarchy.Cache")
533 | l1cache.addParams(osl1cacheParams)
534 |
535 | l1cache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
536 |
537 | link = sst.Link(self.prefix + ".link_os_l1cache")
538 | link.connect( (mem_if, "port", "25ps"), (l1cache_2_cpu, "port", "25ps") )
539 |
540 | return l1cache
541 |
542 | def connectCPU( self, core, cpu ):
543 | link = sst.Link(self.prefix + ".link_core" + str(core) + "_os")
544 | link.connect( (cpu, "os_link", "5ns"), (self.nodeOS, "core" + str(core), "5ns") )
545 |
546 | def connectTlb( self, core, name, tlblink ):
547 | linkName = self.prefix + ".link_mmu_core" + str(core) + "_" + name
548 | link = sst.Link( linkName )
549 | link.connect( (self.mmu, "core"+str(core)+ "." +name, "25ps"), (tlblink, "mmu", "25ps") )
550 |
551 | def connectNicTlb( self, name, niclink ):
552 | linkName = self.prefix + ".link_mmu_" + name
553 | link = sst.Link( linkName )
554 | link.connect( (self.mmu, name, "25ps"), (niclink, "mmu", "25ps") )
555 |
556 |
557 |
558 |
class rdmaNic_Builder:
    """Builds the RDMA NIC (rdmaNic.nic) of one node, with its DMA cache and DMA TLB."""
    def __init__(self,numNodes):
        # numNodes: total node count; the NIC needs it to address its peers.
        self.numNodes = numNodes

    def build( self, nodeId ):
        """Create the NIC for node `nodeId`.

        Returns (mmioIf, dmaCache, tlb, (netLink, "rtr_port", '10ns')):
        the MMIO memory interface, the DMA-side cache, the DMA TLB, and the
        network-link tuple used by the caller to attach the NIC to the
        inter-node network.
        """
        prefix = 'node' + str(nodeId)
        nic = sst.Component( prefix + ".nic", "rdmaNic.nic")
        nic.addParams(rdmaNiCParams)
        nic.addParam( 'nicId', nodeId )
        nic.addParam( 'pesPerNode', 1 )
        nic.addParam( 'numNodes', self.numNodes )
        if enableStats :
            nic.enableAllStatistics()


        # NIC DMA interface
        dmaIf = nic.setSubComponent("dma", "memHierarchy.standardInterface")

        # NIC MMIO interface
        mmioIf = nic.setSubComponent("mmio", "memHierarchy.standardInterface")

        # NIC DMA Cache
        dmaCache = sst.Component(prefix + ".nicDmaCache", "memHierarchy.Cache")
        dmaCache.addParams(rdmaCacheParams)

        # NIC DMA TLB; sized for every hardware thread on the node.
        tlbWrapper = sst.Component(prefix+".nicDmaTlb", "mmu.tlb_wrapper")
        tlb = tlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        tlb.addParam("num_hardware_threads", num_cpu_per_node*num_threads_per_cpu)
        tlb.addParams(tlbParams)

        # Cache to CPU interface
        dmaCacheToCpu = dmaCache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # NIC DMA -> TLB
        link = sst.Link(prefix+".link_cpu_dtlb")
        link.connect( (dmaIf, "port", "25ps"), (tlbWrapper, "cpu_if", "25ps") )

        # NIC DMA TLB -> cache
        link = sst.Link(prefix+".link_cpu_l1dcache")
        link.connect( (tlbWrapper, "cache_if", "25ps"), (dmaCacheToCpu, "port", "25ps") )

        # NIC internode interface (merlin link control toward the network router)
        netLink = nic.setSubComponent( "rtrLink", "merlin.linkcontrol" )
        netLink.addParams(rdmaLinkParams)

        return mmioIf, dmaCache, tlb, (netLink, "rtr_port", '10ns')
607 |
class memory_Builder:
    """Builds the per-node memory subsystem: an on-chip merlin router with a
    directory controller and a DRAM controller (simpleMem backend) behind it.
    Additional clients (caches, NIC) attach to the router via connect()."""
    def __init__(self):
        pass

    def build( self, nodeId, numPorts, group ):
        """Instantiate router, directory and memory controller for node
        `nodeId`. One port beyond `numPorts` is reserved for the directory,
        which is connected in coherence group `group`."""
        self.prefix = 'node' + str(nodeId)
        self.numPorts = numPorts + 1

        self.chiprtr = sst.Component(self.prefix + ".chiprtr", "merlin.hr_router")
        self.chiprtr.addParam("num_ports", self.numPorts)
        self.chiprtr.addParams(nodeRtrParams)
        self.chiprtr.setSubComponent("topology","merlin.singlerouter")

        if enableStats:
            self.chiprtr.enableAllStatistics()

        # Directory controller occupies the last (extra) router port.
        dirctrl = sst.Component(self.prefix + ".dirctrl", "memHierarchy.DirectoryController")
        dirctrl.addParams(dirCtrlParams)
        dirtoMemLink = dirctrl.setSubComponent("memlink", "memHierarchy.MemLink")
        self.connect( "Dirctrl", self.numPorts - 1, dirctrl, group, linkType="cpulink" )
        if enableStats:
            dirctrl.enableAllStatistics()

        memctrl = sst.Component(self.prefix + ".memory", "memHierarchy.MemController")
        memctrl.addParams(memCtrlParams)
        if enableStats:
            memctrl.enableAllStatistics()

        memToDir = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink")

        memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem")
        memory.addParams(memBackendParams)

        # Directory -> memory controller link.
        link = sst.Link(self.prefix + ".link_dir_mem")
        link.connect( (dirtoMemLink, "port", "25ps"), (memToDir, "port", "25ps") )

    def connect( self, name, port, comp, group=None, linkType="memlink" ):
        """Attach `comp` to router port `port` through a MemNIC placed in
        coherence group `group`. `name` is informational only."""
        # Note: `assert group` also rejects group=0, not just the None default.
        assert group
        assert port < self.numPorts

        memNIC = comp.setSubComponent(linkType, "memHierarchy.MemNIC")
        memNIC.addParam("group", group)
        memNIC.addParams(memNICParams)

        link = sst.Link(self.prefix + ".link_rtr" + str(port) )
        link.connect( (self.chiprtr, "port" + str(port), "25ps"), (memNIC, "port", "25ps") )
656 |
657 |
class Endpoint():
    """Endpoint factory handed to the merlin topology builder: build() is
    called once per network endpoint and assembles a full node (OS, CPUs,
    memory subsystem, RDMA NIC), returning the NIC's network-link tuple."""
    def __init__(self,numNodes):
        self.numNodes = numNodes

    def prepParams(self):
        # Required by the merlin endpoint interface; nothing to prepare here.
        pass

    def build(self, nodeId, extraKeys ):
        """Build node `nodeId` and return the NIC's (netLink, "rtr_port",
        '10ns') tuple for the topology to wire into the inter-node network.
        `extraKeys` is unused here but part of the expected interface."""
        prefix = 'node' + str(nodeId);

        cpuBuilder = CPU_Builder()
        memBuilder = memory_Builder()
        osBuilder = OS_Builder()

        # Router ports: OS L1 + NIC MMIO + NIC DMA + one per CPU L2.
        numPorts = 3 + num_cpu_per_node
        port = 0
        memBuilder.build(nodeId, numPorts, group=2 )

        # build the Vanadis OS, it returns the OS L1 cache
        osCache = osBuilder.build( self.numNodes, nodeId)

        # connect OS L1 to Memory
        memBuilder.connect( "OS_L1", port, osCache, group=1 )
        port += 1;

        # build the Vanadis CPU block, this returns
        # cpu, L2 cache, DTLB ITLB
        for i in range(num_cpu_per_node):
            cpu, L2, dtlb, itlb = cpuBuilder.build(nodeId, i)

            osBuilder.connectCPU( i, cpu )
            osBuilder.connectTlb( i, "dtlb", dtlb )
            osBuilder.connectTlb( i, "itlb", itlb )

            # connect CPU L2 to Memory
            memBuilder.connect( "CPU_L2", port, L2, group=1 )
            port += 1;

        nicBuilder = rdmaNic_Builder(self.numNodes)
        # build the Rdma NIC, this returns
        # MMIO link, DMA cache, DMA TLB
        mmioIf, dmaCache, dmaTlb, netLink = nicBuilder.build(nodeId)

        osBuilder.connectNicTlb( "nicTlb", dmaTlb )

        # connect the NIC MMIO to Memory
        #memBuilder.connect( "NIC_MMIO", port, mmioIf, 3, source="1", dest="2" )
        memBuilder.connect( "NIC_MMIO", port, mmioIf, group=2 )
        port += 1;

        # connect the NIC DMA Cache to Memory
        #memBuilder.connect( "NIC_DMA", port, dmaCache, 1, dest="2" )
        memBuilder.connect( "NIC_DMA", port, dmaCache, group=1 )
        port += 1;
        return netLink
714 |
ep = Endpoint( num_node )

# Every network endpoint uses the same Endpoint builder instance.
def setNode( nodeId ):
    return ep;

# Forward the network parameters into merlin's global parameter table.
for p in networkParams:
    sst.merlin._params[p] = networkParams[p]

# Select the inter-node topology; anything unrecognized falls back to
# a single-router ("simple") network.
if network_topology == "torus":
    topo = topoTorus()
elif network_topology == "fattree":
    topo = topoFatTree()
else:
    topo = topoSimple()

topo.bundleEndpoints = False
topo.prepParams()
topo.setEndPointFunc( setNode )
topo.build()
734 |
--------------------------------------------------------------------------------
/demo/sst/instruction-level-simulation/scale_up.py:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
7 | import sys
8 | import argparse
9 | import sst
10 | import os
11 | from sst.merlin import *
12 |
13 |
# Command-line options; when run under sst, arguments after "--" reach this script.
parser = argparse.ArgumentParser(
    prog=f'sst {__file__} --',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)


parser.add_argument("--num_threads_per_cpu", type=int, help="Number of hardware threads per cpu", default=2)
parser.add_argument("--num_cpu_per_node", type=int, help="Number of cpu per node", default=2)
parser.add_argument("--exe", type=str, help="Binary to run", default="../software/riscv64/mha_OMP_16")
parser.add_argument("--app_args", type=str, help="Arguments of the application", default="64 128 8")
parser.add_argument("--stats", type=str, help="write statistics, argument changes the filename", nargs="?", const="-")
args = parser.parse_args()
25 |
26 |
# --stats enables statistic collection; an argument ending in ".csv" sends
# the statistics to that file, anything else goes to the console.
enableStats = bool(args.stats)
if enableStats:
    sst.setStatisticLoadLevel(10)
    fname = args.stats
    if fname.endswith(".csv"):
        sst.setStatisticOutput("sst.statOutputCSV", {"filepath": fname, "separator": ";"})
    else:
        sst.setStatisticOutput("sst.statOutputConsole")
41 |
# Pull parsed command-line options into module-level configuration.
num_threads_per_cpu = args.num_threads_per_cpu
num_cpu_per_node = args.num_cpu_per_node
app_args = args.app_args
full_exe_name = args.exe
os_verbosity = 0

cpu_clock = "3GHz"

coherence_protocol = "MESI"
cache_line_size = 64

l2cache_size = 1 * 1024**2  # 1MiB
page_size = 4096
memsize = 2 * 1024**3  # 2GiB
physMemSize = str(memsize) + " B"


# Basename of the executable; used as arg0 of the simulated process.
exe_name = full_exe_name.split("/")[-1]

# Geometry shared by every simpleTLB instance.
tlbParams = {
    "hitLatency": 3,
    "num_tlb_entries_per_thread": 64,
    "tlb_set_size": 4,
    "minVirtAddr": 0x1000,
    "maxVirtAddr": memsize,
}
68 |
# On-chip (intra-node) merlin router parameters.
nodeRtrParams = {
    "xbar_bw": "57.6GB/s",
    "link_bw": "28.8GB/s",
    "input_buf_size": "40KB",
    "output_buf_size": "40KB",
    "flit_size": "72B",
    "id": "0",
    "topology": "merlin.singlerouter",
}

# DRAM bandwidth = memCtrl.clock * request width * max_requests_per_cycle = 25.6 GB/s
memCtrlParams = {
    "clock": "1.6GHz",
    "backend.mem_size": physMemSize,
    "backing": "malloc",
    "initBacking": 1,
    "addr_range_start": 0x0,
    "addr_range_end": memsize - 1,
    "backendConvertor.request_width": 16,
}

memBackendParams = {
    "mem_size": physMemSize,
    "access_time": "20ns",
    "max_requests_per_cycle": 1,
    "request_width": 16,
}

memNICParams = {
    "min_packet_size": "72B",
    "network_bw": "28.8GB/s",
    "network_input_buffer_size": "4KiB",
    "network_output_buffer_size": "4KiB",
}

# OS related params
osParams = {
    "dbgLevel": os_verbosity,
    "cores": num_cpu_per_node,
    "hardwareThreadCount": num_threads_per_cpu,
    "page_size": page_size,
    "physMemSize": physMemSize,
    "useMMU": True,
}

# Private L1 used by the node OS component.
osl1cacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "32 KiB",
    "L1": "1",
}

mmuParams = {
    "num_cores": num_cpu_per_node,
    "num_threads": num_threads_per_cpu,
    "page_size": page_size,
    "useNicTlb": False,
}
131 |
132 |
# Vanadis out-of-order core configuration; register files scale with the
# number of hardware threads.
vanadis_cpu_type = "vanadis.VanadisCPU"
cpuParams = {
    "clock": cpu_clock,
    "hardware_threads": num_threads_per_cpu,
    "physical_fp_registers": 168 * num_threads_per_cpu,
    "physical_integer_registers": 180 * num_threads_per_cpu,
    "integer_arith_units": 2,
    "integer_arith_cycles": 2,
    "integer_div_units": 1,
    "integer_div_cycles": 20,
    "fp_arith_cycles": 3,
    "fp_arith_units": 2,
    "fp_div_units": 2,
    "fp_div_cycles": 20,
    "branch_units": 1,
    "branch_unit_cycles": 2,
    "reorder_slots": 128,
    "decodes_per_cycle": 4,
    "issues_per_cycle": 4,
    "retires_per_cycle": 4,
}

branchPredParams = {
    "branch_entries": 64,
}

decoderParams = {
    "loader_mode": 1,
    "uop_cache_entries": 1536,
    "predecode_cache_entries": 4,
}

lsqParams = {
    "max_stores": 16,
    "max_loads": 32,
}


# L1 data cache: 64 KiB, 8-way, next-block prefetcher reaching 2 lines ahead.
l1dcacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "64 KiB",
    "prefetcher": "cassini.NextBlockPrefetcher",
    "prefetcher.reach": 2,
    "L1": "1",
}

# L1 instruction cache: 32 KiB, 8-way, prefetching 1 line ahead.
l1icacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "32 KiB",
    "prefetcher": "cassini.NextBlockPrefetcher",
    "prefetcher.reach": 1,
    "L1": "1",
}

# Unified per-core L2.
l2cacheParams = {
    "access_latency_cycles": 8,
    "max_requests_per_cycle": 2,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 16,
    "cache_line_size": cache_line_size,
    "cache_size": str(l2cache_size) + "B",
    "mshr_latency_cycles": 3,
}

busParams = {
    "bus_frequency": cpu_clock,
}
212 |
# Directory controller parameters. Its entry cache is sized to track every
# line the combined per-CPU L2s can hold.
dirCtrlParams = {
    "max_requests_per_cycle" : 2,
    "coherence_protocol" : coherence_protocol,
    # Use floor division: Python 3 "/" yields a float (e.g. 32768.0), but
    # this parameter is an integral entry count.
    "entry_cache_size" : l2cache_size*num_cpu_per_node//cache_line_size,
    "cache_line_size" : cache_line_size,
    "addr_range_start" : 0x0,
    "addr_range_end" : memsize - 1
}
221 |
# Build the process argv parameters consumed by VanadisNodeOS:
# "argc" plus "arg1".."argN" (arg0, the executable name, is added elsewhere).
app_params = {}
if app_args != "":
    tokens = app_args.split(" ")
    # +1 because the executable name occupies arg0
    app_params["argc"] = len(tokens) + 1
    for idx, token in enumerate(tokens, start=1):
        app_params["arg" + str(idx)] = token
else:
    app_params["argc"] = 1
236 |
class CPU_Builder:
    """Builds one Vanadis CPU core with its private L1I/L1D caches, TLBs,
    cache bus and L2."""

    def __init__(self):
        pass

    def build( self, nodeId, cpuId ):
        """Create CPU `cpuId` of node `nodeId`.

        Returns (cpu, l2cache, dtlb, itlb); the caller attaches the L2 to the
        node memory network and the TLBs to the node MMU.
        """
        prefix = 'node' + str(nodeId) + '.cpu' + str( cpuId )
        cpu = sst.Component(prefix, vanadis_cpu_type)
        cpu.addParams( cpuParams )
        cpu.addParam( "core_id", cpuId )
        cpu.addParam( "node_id", nodeId )
        # NOTE(review): CPU statistics are enabled unconditionally, unlike the
        # other components which honor the --stats flag — confirm intended.
        cpu.enableAllStatistics()

        # CPU.decoder: one RISC-V decoder (with OS handler and branch unit)
        # per hardware thread.
        for n in range(num_threads_per_cpu):
            decode = cpu.setSubComponent( "decoder"+str(n), "vanadis.VanadisRISCV64Decoder" )
            decode.addParams( decoderParams )

            if enableStats:
                decode.enableAllStatistics()

            # CPU.decoder.osHandler
            os_hdlr = decode.setSubComponent( "os_handler", "vanadis.VanadisRISCV64OSHandler" )

            # CPU.decoder.branch_pred
            branch_pred = decode.setSubComponent( "branch_unit", "vanadis.VanadisBasicBranchUnit" )
            branch_pred.addParams( branchPredParams )

            if enableStats:
                branch_pred.enableAllStatistics()


        # CPU.lsq: the core's load/store queue
        cpu_lsq = cpu.setSubComponent( "lsq", "vanadis.VanadisBasicLoadStoreQueue" )
        cpu_lsq.addParams(lsqParams)
        if enableStats:
            cpu_lsq.enableAllStatistics()


        # Memory interfaces: instruction fetch on the CPU, data on the LSQ.
        icache_if = cpu.setSubComponent( "mem_interface_inst", "memHierarchy.standardInterface" )
        icache_if.addParam("coreId",cpuId)

        dcache_if = cpu_lsq.setSubComponent( "memory_interface", "memHierarchy.standardInterface" )
        dcache_if.addParam("coreId",cpuId)

        # L1 D-Cache
        l1cache = sst.Component(prefix + ".l1dcache", "memHierarchy.Cache")
        l1cache.addParams( l1dcacheParams )
        if enableStats:
            l1cache.enableAllStatistics()

        l1dcache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
        l1dcache_2_l2cache = l1cache.setSubComponent("memlink", "memHierarchy.MemLink")

        # L1 I-Cache
        l1icache = sst.Component(prefix + ".l1icache", "memHierarchy.Cache")
        l1icache.addParams(l1icacheParams)
        if enableStats:
            l1icache.enableAllStatistics()

        # Bus joining both L1s to the shared L2
        cache_bus = sst.Component(prefix + ".bus", "memHierarchy.Bus")
        cache_bus.addParams(busParams)
        if enableStats:
            cache_bus.enableAllStatistics()

        # L2 Cache
        l2cache = sst.Component(prefix + ".l2cache", "memHierarchy.Cache")
        l2cache.addParams(l2cacheParams)
        if enableStats:
            l2cache.enableAllStatistics()

        l2cache_2_cpu = l2cache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # CPU D-TLB
        dtlbWrapper = sst.Component(prefix+".dtlb", "mmu.tlb_wrapper")
        dtlb = dtlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        dtlb.addParam("num_hardware_threads", num_threads_per_cpu)
        dtlb.addParams(tlbParams)

        # CPU I-TLB ("exe" marks it as serving instruction fetches)
        itlbWrapper = sst.Component(prefix+".itlb", "mmu.tlb_wrapper")
        itlbWrapper.addParam("exe",True)
        itlb = itlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        itlb.addParam("num_hardware_threads", num_threads_per_cpu)
        itlb.addParams(tlbParams)

        # CPU (data) -> D-TLB
        link = sst.Link(prefix+".link_cpu_dtlb")
        link.connect( (dcache_if, "port", "25ps"), (dtlbWrapper, "cpu_if", "25ps") )

        # CPU (instruction) -> I-TLB
        link = sst.Link(prefix+".link_cpu_itlb")
        link.connect( (icache_if, "port", "25ps"), (itlbWrapper, "cpu_if", "25ps") )

        l1icache_2_cpu = l1icache.setSubComponent("cpulink", "memHierarchy.MemLink")
        l1icache_2_l2cache = l1icache.setSubComponent("memlink", "memHierarchy.MemLink")

        # D-TLB -> D-L1
        link = sst.Link(prefix+".link_l1cache")
        link.connect( (dtlbWrapper, "cache_if", "25ps"), (l1dcache_2_cpu, "port", "25ps") )

        # I-TLB -> I-L1
        link = sst.Link(prefix+".link_l1icache")
        link.connect( (itlbWrapper, "cache_if", "25ps"), (l1icache_2_cpu, "port", "25ps") )

        # L1 D-Cache to bus
        link = sst.Link(prefix + ".link_l1dcache_l2cache")
        link.connect( (l1dcache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_0", "1ns") )

        # L1 I-Cache to bus
        link = sst.Link(prefix + ".link_l1icache_l2cache")
        link.connect( (l1icache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_1", "1ns") )

        # BUS to L2 cache
        link = sst.Link(prefix+".link_bus_l2cache")
        link.connect( (cache_bus, "low_network_0", "1ns"), (l2cache_2_cpu, "port", "1ns") )

        return cpu, l2cache, dtlb, itlb
356 |
357 |
def addParamsPrefix(prefix,params):
    """Return a copy of `params` with every key prefixed by "<prefix>."."""
    return { prefix + "." + key: value for key, value in params.items() }
367 |
368 |
369 |
class OS_Builder:
    """Builds the VanadisNodeOS for a node: the OS component, its MMU, the
    simulated process table and the OS's private L1 cache."""
    def __init__(self):
        pass

    def build( self, numNodes, nodeId):
        """Create the node OS for node `nodeId` and return its L1 cache
        (which the caller attaches to the node memory network).
        `numNodes` is accepted for interface parity but unused here."""
        self.prefix = 'node' + str(nodeId)

        self.nodeOS = sst.Component(self.prefix + ".os", "vanadis.VanadisNodeOS")
        self.nodeOS.addParam("node_id", nodeId)
        self.nodeOS.addParams(osParams)
        if enableStats:
            self.nodeOS.enableAllStatistics()

        # One simulated process: (instance count, parameter dict).
        # OMP_NUM_THREADS matches the node's total hardware thread count.
        processList = (
            ( 1, {
                "env_count" : 3,
                "env0" : "OMP_NUM_THREADS={}".format(num_cpu_per_node*num_threads_per_cpu),
                "env1" : "TZ=UTC",
                "env2" : "MV2_ENABLE_AFFINITY=0",
                "exe" : full_exe_name,
                "arg0" : exe_name,
            } ),
        )

        # Merge the command-line argv parameters (argc/argN) into the process.
        processList[0][1].update(app_params)

        num=0
        for i,process in processList:
            for y in range(i):
                self.nodeOS.addParams( addParamsPrefix( "process" + str(num), process ) )
                num+=1

        self.mmu = self.nodeOS.setSubComponent( "mmu", "mmu.simpleMMU" )

        self.mmu.addParams(mmuParams)

        mem_if = self.nodeOS.setSubComponent( "mem_interface", "memHierarchy.standardInterface" )

        l1cache = sst.Component(self.prefix + ".node_os.l1cache", "memHierarchy.Cache")
        l1cache.addParams(osl1cacheParams)

        l1cache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # OS memory interface -> OS L1.
        link = sst.Link(self.prefix + ".link_os_l1cache")
        link.connect( (mem_if, "port", "25ps"), (l1cache_2_cpu, "port", "25ps") )

        return l1cache

    def connectCPU( self, core, cpu ):
        """Connect CPU `core`'s OS link to the node OS."""
        link = sst.Link(self.prefix + ".link_core" + str(core) + "_os")
        link.connect( (cpu, "os_link", "5ns"), (self.nodeOS, "core" + str(core), "5ns") )

    def connectTlb( self, core, name, tlblink ):
        """Connect the MMU port for core `core`'s TLB `name` ("dtlb"/"itlb")."""
        linkName = self.prefix + ".link_mmu_core" + str(core) + "_" + name
        link = sst.Link( linkName )
        link.connect( (self.mmu, "core"+str(core)+ "." +name, "25ps"), (tlblink, "mmu", "25ps") )
427 |
class memory_Builder:
    """Builds the per-node memory subsystem: an on-chip merlin router with a
    directory controller and a DRAM controller (simpleMem backend) behind it.
    Additional clients (OS L1, CPU L2s) attach to the router via connect()."""
    def __init__(self):
        pass

    def build( self, nodeId, numPorts, group ):
        """Instantiate router, directory and memory controller for node
        `nodeId`. One port beyond `numPorts` is reserved for the directory,
        which is connected in coherence group `group`."""
        self.prefix = 'node' + str(nodeId)
        self.numPorts = numPorts + 1

        self.chiprtr = sst.Component(self.prefix + ".chiprtr", "merlin.hr_router")
        self.chiprtr.addParam("num_ports", self.numPorts)
        self.chiprtr.addParams(nodeRtrParams)
        self.chiprtr.setSubComponent("topology","merlin.singlerouter")

        if enableStats:
            self.chiprtr.enableAllStatistics()

        # Directory controller occupies the last (extra) router port.
        dirctrl = sst.Component(self.prefix + ".dirctrl", "memHierarchy.DirectoryController")
        dirctrl.addParams(dirCtrlParams)
        dirtoMemLink = dirctrl.setSubComponent("memlink", "memHierarchy.MemLink")
        self.connect( "Dirctrl", self.numPorts -1 , dirctrl, group, linkType="cpulink" )
        if enableStats:
            dirctrl.enableAllStatistics()

        memctrl = sst.Component(self.prefix + ".memory", "memHierarchy.MemController")
        memctrl.addParams(memCtrlParams)
        if enableStats:
            memctrl.enableAllStatistics()

        memToDir = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink")

        memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem")
        memory.addParams(memBackendParams)

        # Directory -> memory controller link.
        link = sst.Link(self.prefix + ".link_dir_mem")
        link.connect( (dirtoMemLink, "port", "25ps"), (memToDir, "port", "25ps") )

    def connect( self, name, port, comp, group=None, linkType="memlink" ):
        """Attach `comp` to router port `port` through a MemNIC placed in
        coherence group `group`. `name` is informational only."""
        # Note: `assert group` also rejects group=0, not just the None default.
        assert group
        assert port < self.numPorts

        memNIC = comp.setSubComponent(linkType, "memHierarchy.MemNIC")
        memNIC.addParam("group", group)
        memNIC.addParams(memNICParams)

        link = sst.Link(self.prefix + ".link_rtr" + str(port) )
        link.connect( (self.chiprtr, "port" + str(port), "25ps"), (memNIC, "port", "25ps") )
476 |
477 |
class node_Builder():
    """Assembles one complete simulated node: memory network, node OS and
    the requested number of Vanadis CPUs."""
    def __init__(self):
        pass

    def prepParams(self):
        # Kept for interface parity with merlin-style builders; no-op.
        pass

    def build(self, nodeId, extraKeys ):
        """Build node `nodeId`. `extraKeys` and the local `prefix` are
        currently unused."""
        prefix = 'node' + str(nodeId);

        cpuBuilder = CPU_Builder()
        memBuilder = memory_Builder()
        osBuilder = OS_Builder()

        # Router ports: one for the OS L1 plus one per CPU L2.
        numPorts = 1 + num_cpu_per_node
        port = 0
        memBuilder.build(nodeId, numPorts, group=2 )

        # build the Vanadis OS, it returns the OS L1 cache
        osCache = osBuilder.build( 1, nodeId)

        # connect OS L1 to Memory
        #memBuilder.connect( "OS_L1", port, osCache, 1, dest="2" )
        memBuilder.connect( "OS_L1", port, osCache, group=1 )
        port += 1;

        # build the Vanadis CPU block, this returns
        # cpu, L2 cache, DTLB ITLB
        for i in range(num_cpu_per_node):
            cpu, L2, dtlb, itlb = cpuBuilder.build(nodeId, i)

            osBuilder.connectCPU( i, cpu )
            osBuilder.connectTlb( i, "dtlb", dtlb )
            osBuilder.connectTlb( i, "itlb", itlb )

            # connect CPU L2 to Memory
            #memBuilder.connect( "CPU_L2", port, L2, 1, dest="2,3" )
            memBuilder.connect( "CPU_L2", port, L2, group=1 )
            port += 1;
518 |
nodeBuilder = node_Builder()

# Scale-up simulates a single node (node 0).
nodeBuilder.build(0,{})
522 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": ".",
3 | "architectures": [
4 | "LlamaForCausalLM"
5 | ],
6 | "attention_bias": false,
7 | "attention_dropout": 0.0,
8 | "bos_token_id": 128000,
9 | "eos_token_id": [
10 | 128001,
11 | 128008,
12 | 128009
13 | ],
14 | "hidden_act": "silu",
15 | "hidden_size": 16384,
16 | "initializer_range": 0.02,
17 | "intermediate_size": 53248,
18 | "max_position_embeddings": 131072,
19 | "mlp_bias": false,
20 | "model_type": "llama",
21 | "num_attention_heads": 128,
22 | "num_hidden_layers": 126,
23 | "num_key_value_heads": 8,
24 | "pretraining_tp": 1,
25 | "rms_norm_eps": 1e-05,
26 | "rope_scaling": {
27 | "factor": 8.0,
28 | "high_freq_factor": 4.0,
29 | "low_freq_factor": 1.0,
30 | "original_max_position_embeddings": 8192,
31 | "rope_type": "llama3"
32 | },
33 | "rope_theta": 500000.0,
34 | "tie_word_embeddings": false,
35 | "torch_dtype": "bfloat16",
36 | "transformers_version": "4.43.3",
37 | "use_cache": true,
38 | "vocab_size": 128256
39 | }
40 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/small_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "LlamaForCausalLM"
4 | ],
5 | "attention_bias": false,
6 | "attention_dropout": 0.0,
7 | "bos_token_id": 128000,
8 | "eos_token_id": 128001,
9 | "head_dim": 64,
10 | "hidden_act": "silu",
11 | "hidden_size": 2048,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 8192,
14 | "max_position_embeddings": 131072,
15 | "mlp_bias": false,
16 | "model_type": "llama",
17 | "num_attention_heads": 32,
18 | "num_hidden_layers": 16,
19 | "num_key_value_heads": 8,
20 | "pretraining_tp": 1,
21 | "rms_norm_eps": 1e-05,
22 | "rope_scaling": {
23 | "factor": 32.0,
24 | "high_freq_factor": 4.0,
25 | "low_freq_factor": 1.0,
26 | "original_max_position_embeddings": 8192,
27 | "rope_type": "llama3"
28 | },
29 | "rope_theta": 500000.0,
30 | "tie_word_embeddings": true,
31 | "torch_dtype": "bfloat16",
32 | "transformers_version": "4.45.0.dev0",
33 | "use_cache": true,
34 | "vocab_size": 128256
35 | }
36 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/training_llm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import sst
4 | import os
5 | import math
6 |
7 | from sst.merlin.base import *
8 | from sst.merlin.endpoint import *
9 | from sst.merlin.interface import *
10 | from sst.merlin.topology import *
11 | from sst.ember import *
12 |
13 |
# Link/router parameters shared by the NICs and the network fabric.
params = {
    # in GB/s
    "link_bw": 128,

    "input_buf_size": "4MB",
    "output_buf_size": "4MB",

    "flit_size": "256B",

    "link_lat": "10ns",
    "input_latency": "10ns",
    "output_latency": "10ns",
    "host_link_latency": "100ns",

    "num_vns": 2,
    "width": 2,

    "xbar_arb": "merlin.xbar_arb_lru",
}

# Default LLM configuration: the small Llama config shipped next to this script.
llm_config_default = os.path.join(os.path.dirname(os.path.realpath(__file__)), "small_config.json")
35 |
# Command-line options; when run under sst, arguments after "--" reach this script.
parser = argparse.ArgumentParser(
    prog=f'sst {__file__} --',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--tp", type=int, help="Tensor Parallelism level", default=8)
# Fix: the help strings of --pp and --dp were swapped. --pp is the pipeline
# degree (maps to dragonfly num_groups below) and --dp the data-parallel
# degree (maps to routers_per_group); the 3D motif also receives pp=args.pp
# and dp=args.dp directly.
parser.add_argument("--pp", type=int, help="Pipeline Parallelism level", default=1)
parser.add_argument("--dp", type=int, help="Data Parallelism level", default=1)
parser.add_argument("--batch_size", type=int, help="Number of sequence processed in parallel", default=32)
parser.add_argument("--sequence_len", type=int, help="Number of token per sequence", default=8192)
parser.add_argument("--n_batch", type=int, help="Number of batches", default=128)
parser.add_argument("--llm_config", type=str, help="Configuration file of the Large Language Model", default=llm_config_default)
# NOTE(review): type=int with a float default (78e12) keeps the default as a
# float while command-line values must be plain integer literals ("78e12"
# would be rejected by int()). Left unchanged for interface compatibility.
parser.add_argument("--peak_flop", type=int, help="Peak flop throughput per end point for the targeted data type", default=78e12)
# NOTE(review): "draw_bw" presumably means DRAM bandwidth; the flag name is
# kept as-is so existing invocations keep working.
parser.add_argument("--draw_bw", type=int, help="DRAM bandwidth per end point", default=1555e9)
parser.add_argument("--verbose", type=int, help="Set verbosity", default=0)
parser.add_argument("--log", type=str, help="Enable motif logger", action='store', nargs='?', const="logger")
parser.add_argument("--stats", type=str, help="write statistics, argument changes the filename", nargs="?", const="-")
parser.add_argument("--topology", type=str, help="Network topology", default="single",
                    choices=["single", "dragonfly", "fattree"] )
args = parser.parse_args()
54 |
assert os.path.exists(args.llm_config), "LLM config file does not exist!"

# Total MPI ranks is the product of the three parallelism degrees.
num_ranks = args.tp * args.pp * args.dp
topology = args.topology.lower()

# --stats enables statistic collection; an argument ending in ".csv" sends
# the statistics to that file, anything else goes to the console.
enableStats = bool(args.stats)
if enableStats:
    sst.setStatisticLoadLevel(10)

    stat_params = {"type":"sst.AccumulatorStatistic"}

    fname = args.stats
    if fname.endswith(".csv"):
        sst.setStatisticOutput("sst.statOutputCSV", {"filepath": fname, "separator": ";"})
    else:
        sst.setStatisticOutput("sst.statOutputConsole")
76 |
77 |
# Network topology definition start
PlatformDefinition.setCurrentPlatform("firefly-defaults")

### set up the endpoint
networkif = ReorderLinkControl()
networkif.link_bw = str(params["link_bw"]) + "GB/s"
networkif.input_buf_size = params["input_buf_size"]
networkif.output_buf_size = params["output_buf_size"]

# Either exactly one parallelism dimension is active (1D) or all three (3D).
assert (num_ranks == args.tp or num_ranks == args.pp or num_ranks == args.dp or
        (args.dp > 1 and args.pp > 1 and args.tp > 1)), "Only 1D and 3D parallelism are supported"

ep = EmberMPIJob(0,num_ranks)

ep.ember.verbose = 0

ep.network_interface = networkif
ep.addMotif("Init")

# Select the Ember motif matching the active parallelism scheme; each motif
# models one training strategy of the LLM described by --llm_config.
if args.tp > 1 and args.pp == 1 and args.dp == 1:
    ep.addMotif(f"LLMTensorParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.pp > 1 and args.tp == 1 and args.dp == 1:
    ep.addMotif(f"LLMPipelineParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.dp > 1 and args.tp == 1 and args.pp == 1:
    ep.addMotif(f"LLMDataParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.dp > 1 and args.tp > 1 and args.pp > 1:
    ep.addMotif(f"LLM3DParallelism tp={args.tp} pp={args.pp} dp={args.dp} batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

ep.addMotif("Fini")
ep.nic.nic2host_lat= params["host_link_latency"]

if args.log:
    ep.enableMotifLog(args.log)

# Topology selection; for dragonfly/fattree the ranks of one tensor-parallel
# group share a router (rank_per_router = args.tp).
if topology == "single":
    topo = topoSingle()
    topo.num_ports = num_ranks
    rank_per_router = num_ranks

elif topology == "dragonfly":
    rank_per_router = args.tp
    topo = topoDragonFly()
    topo.hosts_per_router = rank_per_router

    topo.routers_per_group = args.dp
    topo.num_groups = args.pp

    topo.intergroup_links = params["width"]
    topo.algorithm = "minimal"

elif topology == "fattree":
    rank_per_router = args.tp
    topo = topoFatTree()
    fattree_shape = f"1,1:{args.pp},{args.pp}:{args.dp},{args.dp}:{rank_per_router}"
    topo.host_link_latency = params["host_link_latency"]
    topo.shape = fattree_shape
else:
    print(topology, " unknown!")
    sys.exit()
140 |
# Set up the routers
router = hr_router()
router.link_bw = str(params["link_bw"]) + "GB/s"
router.flit_size = params["flit_size"]
# Crossbar sized so every local rank can drive its full link bandwidth at once.
router.xbar_bw = str(params["link_bw"]*rank_per_router) + "GB/s"
router.input_latency = params["input_latency"]
router.output_latency = params["output_latency"]
router.input_buf_size = params["input_buf_size"]
router.output_buf_size = params["output_buf_size"]
router.num_vns = params["num_vns"]
router.xbar_arb = params["xbar_arb"]


### Setup the topology
topo.link_latency = params["link_lat"]
topo.router = router

system = System()
system.setTopology(topo)
# Place ranks onto nodes in consecutive ("linear") order.
system.allocateNodes(ep,"linear")

if enableStats:
    networkif.enableAllStatistics(stat_params)
    router.enableAllStatistics(stat_params)

system.build()
167 |
--------------------------------------------------------------------------------
/demo/sst/software/.gitignore:
--------------------------------------------------------------------------------
1 | x86
2 | test_gemm.c
3 | test_gemm_t.c
4 | test_softmax.c
5 |
--------------------------------------------------------------------------------
/demo/sst/software/Makefile:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
# Architectures to build; each has its own <arch>.make fragment.
ARCH_LIST= x86 riscv64

.PHONY: all clean
# Forward the requested goal (all/clean) to every per-arch makefile.
# Use $(MAKE) instead of bare "make" so flags such as -j/-n and the chosen
# make binary propagate to the sub-makes (GNU Make recursive convention).
all clean:
	@for arch in $(ARCH_LIST) ; \
	do \
		$(MAKE) -f $$arch.make $@ || exit 1; \
	done
15 |
--------------------------------------------------------------------------------
/demo/sst/software/check_mpi.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #define WORLD MPI_COMM_WORLD
15 |
16 | #define CHECK_RES
17 |
18 |
19 | //FP32
20 | //#define DATATYPE 0
21 | //FP64
22 | //#define DATATYPE 1
23 | //I8
24 | //#define DATATYPE 2
25 | //I16
26 | //#define DATATYPE 3
27 | //I32
28 | #define DATATYPE 4
29 |
30 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
31 |
32 |
33 | #if DATATYPE == 0
34 | typedef float data_t;
35 | MPI_Datatype mpi_data_type = MPI_FLOAT;
36 | static data_type_e data_type = FP32;
37 | #define TYPE_IS_FP
38 | #elif DATATYPE == 1
39 | typedef double data_t;
40 | MPI_Datatype mpi_data_type = MPI_DOUBLE;
41 | static data_type_e data_type = FP64;
42 | #define TYPE_IS_FP
43 | #elif DATATYPE == 2
44 | typedef int8_t data_t;
45 | MPI_Datatype mpi_data_type = MPI_INT8_T;
46 | static data_type_e data_type = I8;
47 | #define TYPE_IS_INT
48 | #elif DATATYPE == 3
49 | typedef int16_t data_t;
50 | MPI_Datatype mpi_data_type = MPI_INT16_T;
51 | static data_type_e data_type = I16;
52 | #define TYPE_IS_INT
53 | #elif DATATYPE == 4
54 | typedef int32_t data_t;
55 | MPI_Datatype mpi_data_type = MPI_INT32_T;
56 | static data_type_e data_type = I32;
57 | #define TYPE_IS_INT
58 | #else
59 | #error Unsupported choice setting
60 | #endif
61 |
62 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
63 |
64 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
65 | int stride_0, int stride_1, int stride_2);
66 |
67 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
68 | int stride_0, int stride_1, int stride_2);
69 |
70 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
71 |
72 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
73 |
74 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
75 |
76 | #ifdef CHECK_RES
77 |
78 | void gemm_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
79 | int stride_0, int stride_1, int stride_2);
80 |
81 | void gemm_t_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
82 | int stride_0, int stride_1, int stride_2);
83 |
84 | void scale_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
85 |
86 | void add_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
87 |
88 | void softmax_ref(void * dst, void * src, data_type_e data_type, int m, int n);
89 |
90 | #if defined(TYPE_IS_INT)
    | // Integer comparison used by the CHECK_RES pass: nonzero iff rhs != lhs
    | // (returns the signed difference; rhs/lhs are promoted to int first).
    | // NOTE(review): the subtraction could overflow for extreme int32 inputs;
    | // the tensors here are initialised from a small range, so this is benign -- confirm.
91 | static int cmp(data_t rhs, data_t lhs) {
92 | return (rhs) - (lhs);
93 | }
    | // Report one mismatch between the reference result and the MPI result.
94 | static void print_diff(int count, int pos, data_t ref, data_t res) {
95 | printf("Difference %d at %d ref = %d mpi = %d\n", count, pos, ref, res);
96 | }
97 | #elif defined(TYPE_IS_FP)
98 | static int cmp(data_t rhs, data_t lhs) {
99 | int ret = 0;
100 | data_t diff = roundf(rhs) - roundf(lhs);
101 | ret = diff > 1.0 ? 1 : 0;
102 | return ret;
103 | }
104 | static void print_diff(int count, int pos, data_t ref, data_t res) {
105 | printf("Difference %d at %d ref = %.4f mpi = %.4f\n", count, pos, ref, res);
106 | }
107 | #else
108 | #error Unsupported choice setting
109 | #endif
110 |
111 | #endif // CHECK_RES
112 |
113 | int main(int argc, char ** argv) {
114 | const int root = 0;
115 | int n_ranks, rank;
116 | MPI_Request emb_req, Qw_req, Kw_req, Vw_req, attn_w_req;
117 |
118 | MPI_Init(&argc, &argv);
119 | MPI_Comm_size(WORLD, &n_ranks);
120 | MPI_Comm_rank(WORLD, &rank);
121 |
122 | MPI_Datatype col, col_type;
123 |
124 | if(argc != 4) {
125 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
126 | exit(EXIT_FAILURE);
127 | }
128 |
129 | struct timespec start, end;
130 | double time_elapsed_s;
131 | int dmodel = atoi(argv[1]);
132 | int h = atoi(argv[3]);
133 | int S = atoi(argv[2]);
134 | int dk = dmodel/h;
135 | int dv = dmodel/h;
136 |
137 | if((dmodel%n_ranks) != 0) {
138 | fprintf(stderr, "Error: dmodel must be a multiple of the number of ranks (dmodel: %d, rank: %d)\n", dmodel, rank);
139 | exit(EXIT_FAILURE);
140 | }
141 |
142 | fprintf(stdout, "Model n_ranks: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
143 |
144 |
145 | MPI_Type_vector(dmodel, dmodel/n_ranks, dmodel, mpi_data_type, &col);
146 | MPI_Type_commit(&col);
147 | MPI_Type_create_resized(col, 0, dmodel/n_ranks*sizeof(data_t), &col_type);
148 | MPI_Type_commit(&col_type);
149 |
150 |
151 | assert(n_ranks <= h && (h % n_ranks) == 0);
152 |
153 | #ifdef CHECK_RES
154 | data_t * embeddings_ref = NULL;
155 | #endif
156 |
157 | data_t *embeddings = NULL;
158 | data_t * Qw = NULL;
159 | data_t * Kw = NULL;
160 | data_t * Vw = NULL;
161 |
162 | data_t * Qw_heads = NULL;
163 | data_t * Kw_heads = NULL;
164 | data_t * Vw_heads = NULL;
165 |
166 | data_t * Q = NULL;
167 | data_t * K = NULL;
168 | data_t * V = NULL;
169 |
170 | data_t * KQ = NULL;
171 | data_t * softmax_out = NULL;
172 |
173 | data_t * QKV = NULL;
174 |
175 | data_t * ATTNw = NULL;
176 | data_t * ATTNout = NULL;
177 |
178 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
179 |
180 | srand(time(NULL));
181 |
182 | clock_gettime(CLOCK_MONOTONIC, &start);
183 |
184 | embeddings = calloc(dmodel*S, sizeof(data_t));
185 |
186 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
187 |
188 | if(rank == root) {
189 | init_random_tensor(embeddings, data_type, dmodel*S);
190 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
191 | #ifdef CHECK_RES
192 | embeddings_ref = calloc(S*dmodel,sizeof(data_t));
193 | memcpy(embeddings_ref, embeddings, S*dmodel*sizeof(data_t));
194 | #endif
195 | }
196 |
197 | MPI_Ibcast(embeddings, dmodel*S, mpi_data_type, root, WORLD, &emb_req);
198 | MPI_Ibcast(ATTNw, dmodel*dmodel, mpi_data_type, root, WORLD, &attn_w_req);
199 |
200 | if(rank == root) {
201 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
202 | init_random_tensor(Qw, data_type, dmodel*dmodel);
203 |
204 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
205 | init_random_tensor(Kw, data_type, dmodel*dmodel);
206 |
207 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
208 | init_random_tensor(Vw, data_type, dmodel*dmodel);
209 | }
210 |
211 | Qw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
212 | Kw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
213 | Vw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
214 |
215 | MPI_Iscatter(Qw, 1, col_type, Qw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Qw_req);
216 | MPI_Iscatter(Kw, 1, col_type, Kw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Kw_req);
217 | MPI_Iscatter(Vw, 1, col_type, Vw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Vw_req);
218 |
219 | Q = calloc(S*dmodel/n_ranks, sizeof(data_t));
220 | memset(Q, 0, S*dmodel*sizeof(data_t)/n_ranks);
221 |
222 | K = calloc(S*dmodel/n_ranks, sizeof(data_t));
223 | memset(K, 0, S*dmodel*sizeof(data_t)/n_ranks);
224 |
225 | V = calloc(S*dmodel/n_ranks, sizeof(data_t));
226 | memset(V, 0, S*dmodel*sizeof(data_t)/n_ranks);
227 |
228 | KQ = calloc(h/n_ranks*S*S, sizeof(data_t));
229 | memset(KQ, 0, h/n_ranks*S*S*sizeof(data_t));
230 |
231 | softmax_out = calloc(h/n_ranks*S*S, sizeof(data_t));
232 | memset(softmax_out, 0, h/n_ranks*S*S*sizeof(data_t));
233 |
234 | QKV = calloc(S/n_ranks*dmodel, sizeof(data_t));
235 | memset(QKV, 0, S/n_ranks*dmodel*sizeof(data_t));
236 |
237 | ATTNout = calloc(S*dmodel, sizeof(data_t));
238 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
239 |
240 | MPI_Wait(&emb_req, MPI_STATUS_IGNORE);
241 | MPI_Wait(&Qw_req, MPI_STATUS_IGNORE);
242 | MPI_Wait(&Kw_req, MPI_STATUS_IGNORE);
243 | MPI_Wait(&Vw_req, MPI_STATUS_IGNORE);
244 | MPI_Wait(&attn_w_req, MPI_STATUS_IGNORE);
245 |
246 | clock_gettime(CLOCK_MONOTONIC, &end);
247 |
248 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
249 | printf("[rank: %d] Init time: %.2f ms\n", rank, time_elapsed_s * 1000);
250 |
251 | clock_gettime(CLOCK_MONOTONIC, &start);
252 | /* MHA */
253 |
254 | gemm(Q, embeddings, Qw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
255 | gemm(K, embeddings, Kw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
256 | gemm(V, embeddings, Vw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
257 |
258 | for(int i = 0; i < h/n_ranks; i++) {
259 | gemm_t(&KQ[S*S*i], &Q[dmodel/h*i], K, data_type, S, S, dmodel/h, S, dmodel/n_ranks, dmodel/n_ranks);
260 | }
261 |
262 | scale(KQ, KQ, ((void*)&scale_f), data_type, h/n_ranks*S, S);
263 |
264 | softmax(softmax_out, KQ, data_type, h/n_ranks*S, S);
265 |
266 | for(int i = 0; i < h/n_ranks; i++) {
267 | gemm(&QKV[dmodel/h*i], &softmax_out[S*S*i], &V[dmodel/h*i], data_type, S, dmodel/h, S, dmodel/n_ranks, S, dmodel/n_ranks);
268 | }
269 |
270 | gemm(ATTNout, QKV, ATTNw, data_type, S, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel);
271 |
272 | add(&ATTNout[S/n_ranks*rank*dmodel], &ATTNout[S/n_ranks*rank*dmodel], &embeddings[S/n_ranks*rank*dmodel], data_type, S/n_ranks, dmodel);
273 |
274 |
275 | MPI_Allreduce(ATTNout, embeddings, S*dmodel, mpi_data_type, MPI_SUM, WORLD);
276 |
277 | clock_gettime(CLOCK_MONOTONIC, &end);
278 |
279 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
280 | printf("[rank: %d] MHA execution time: %.2f ms\n", rank, time_elapsed_s * 1000);
281 |
282 | #ifdef CHECK_RES
283 | if(rank == root) {
284 | data_t * Q_ref = calloc(S*dmodel, sizeof(data_t));
285 | assert(Q_ref);
286 | memset(Q_ref, 0, S*dmodel*sizeof(data_t));
287 |
288 | data_t * K_ref = calloc(S*dmodel, sizeof(data_t));
289 | assert(K_ref);
290 | memset(K_ref, 0, S*dmodel*sizeof(data_t));
291 |
292 | data_t * V_ref = calloc(S*dmodel, sizeof(data_t));
293 | assert(V_ref);
294 | memset(V_ref, 0, S*dmodel*sizeof(data_t));
295 |
296 | data_t * KQ_ref = calloc(S*S*h, sizeof(data_t));
297 | assert(KQ_ref);
298 | memset(KQ_ref, 0, h*S*S*sizeof(data_t));
299 |
300 | data_t * softmax_out_ref = calloc(h*S*S, sizeof(data_t));
301 | assert(softmax_out_ref);
302 | memset(softmax_out_ref, 0, h*S*S*sizeof(data_t));
303 |
304 | data_t * QKV_ref = calloc(S*dmodel, sizeof(data_t));
305 | assert(QKV_ref);
306 | memset(QKV_ref, 0, S*dmodel*sizeof(data_t));
307 |
308 | data_t * ATTNout_ref = calloc(S*dmodel, sizeof(data_t));
309 | assert(ATTNout_ref);
310 | memset(ATTNout_ref, 0, S*dmodel*sizeof(data_t));
311 |
312 | gemm_ref(Q_ref, embeddings_ref, Qw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
313 | gemm_ref(K_ref, embeddings_ref, Kw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
314 | gemm_ref(V_ref, embeddings_ref, Vw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
315 |
316 | for(int i = 0; i < h; i++) {
317 | gemm_t_ref(&KQ_ref[S*S*i], &Q_ref[dmodel/h*i], K_ref, data_type, S, S, dmodel/h, S, dmodel, dmodel);
318 | }
319 |
320 | scale_ref(KQ_ref, KQ_ref, ((void*)&scale_f), data_type, S*h, S);
321 |
322 | softmax_ref(softmax_out_ref, KQ_ref, data_type, S*h, S);
323 |
324 | for(int i = 0; i < h; i++) {
325 | gemm_ref(&QKV_ref[dmodel/h*i], &softmax_out_ref[S*S*i], &V_ref[dmodel/h*i], data_type, S, dmodel/h, S, dmodel, S, dmodel);
326 | }
327 |
328 | gemm_ref(ATTNout_ref, QKV_ref, ATTNw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
329 |
330 | add_ref(embeddings_ref, embeddings_ref, ATTNout_ref, data_type, S, dmodel);
331 |
332 | int count = 0;
333 | for(int i = 0; i < S*dmodel; i++)
334 | if(cmp(embeddings_ref[i], embeddings[i]) != 0)
335 | print_diff(count++, i, embeddings_ref[i], embeddings[i]);
336 |
337 | free(Q_ref);
338 | free(K_ref);
339 | free(V_ref);
340 | free(KQ_ref);
341 | free(softmax_out_ref);
342 | free(QKV_ref);
343 | free(embeddings_ref);
344 | free(ATTNout_ref);
345 | }
346 | #endif
347 |
348 | if(rank == root) {
349 | free(Qw);
350 | free(Kw);
351 | free(Vw);
352 | }
353 |
354 | free(embeddings);
355 | free(Qw_heads);
356 | free(Kw_heads);
357 | free(Vw_heads);
358 | free(Q);
359 | free(K);
360 | free(V);
361 | free(KQ);
362 | free(softmax_out);
363 | free(QKV);
364 | free(ATTNw);
365 | free(ATTNout);
366 |
367 | MPI_Finalize();
368 | return 0;
369 | }
370 |
371 | static size_t get_element_size(data_type_e type) {
372 | size_t size;
373 | switch (type) {
374 | case I8:
375 | size = sizeof(uint8_t);
376 | break;
377 | case I16:
378 | size = sizeof(uint16_t);
379 | break;
380 | case I32:
381 | case FP32:
382 | size = sizeof(uint32_t);
383 | break;
384 | case FP64:
385 | size = sizeof(uint64_t);
386 | break;
387 | default:
388 | size = -1;
389 | break;
390 | }
391 |
392 | assert(size > 0 && "data type unknown");
393 |
394 | return size;
395 | }
396 |
397 |
398 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
399 | const data_t range = 10;
400 | #pragma omp parallel for shared (tensor)
401 | for(int i = 0; i < nmemb; i++)
402 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
403 | }
404 |
405 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
406 | assert(tensor);
407 | init_random_tensor_impl(((data_t*)tensor), nmemb);
408 | }
409 |
410 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
411 | const int bsize = 64;
412 | int ii0, ii1, ii2;
413 | int i0, i1, i2;
414 | data_t pp;
415 | #pragma omp parallel for shared (dst, src1, src2) private(i0,i1,i2,ii0,ii1,ii2,pp) collapse(3)
416 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
478 |
479 | sum = 0.0;
480 | for(j = 0; j < n; j++) {
481 | const data_t e = expf(src[i*n+j] - max);
482 | sum += e;
483 | dst[i*n+j] = e;
484 | }
485 |
486 | for(j = 0; j < n; j++) {
487 | dst[i*n+j] *= sum;
488 | }
489 | }
490 | }
491 |
492 |
    | // Type-erased entry points for the OpenMP kernels: validate pointers, then
    | // dispatch to the data_t implementations above. The data_type argument is
    | // unused -- the concrete element type is fixed at compile time via DATATYPE.
    | // Strides are row strides (in elements) for dst/src1/src2 respectively.
493 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
494 | assert(dst);
495 | assert(src1);
496 | assert(src2);
497 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
498 | }
499 |
    | // GEMM with src2 accessed transposed (dst += src1 * src2^T).
500 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
501 | assert(dst);
502 | assert(src1);
503 | assert(src2);
504 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
505 | }
506 |
    | // Elementwise scale; src2 points to a single scalar factor.
507 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
508 | assert(dst);
509 | assert(src1);
510 | assert(src2);
511 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
512 | }
513 |
    | // Elementwise addition of two m x n buffers.
514 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
515 | assert(dst);
516 | assert(src1);
517 | assert(src2);
518 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
519 | }
520 |
521 |
    | // Row-wise softmax over m rows of length n.
522 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
523 | assert(dst);
524 | assert(src);
525 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
526 | }
527 |
528 | #ifdef CHECK_RES
529 |
530 | static void gemm_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
531 | int i0 = 0, i1 = 0, i2 = 0;
532 | for (i0 = 0; i0 < m; ++i0) {
533 | for (i1 = 0; i1 < n; ++i1) {
534 | for (i2 = 0; i2 < k; ++i2) {
535 | dst[i0*(stride_0)+i1] += src1[i0*(stride_1)+i2] * src2[i2*stride_2+i1];
536 | }
537 | }
538 | }
539 | }
540 |
541 | static void gemm_t_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
542 | int i0 = 0, i1 = 0, i2 = 0;
543 | for (i0 = 0; i0 < m; ++i0) {
544 | for (i1 = 0; i1 < n; ++i1) {
545 | for (i2 = 0; i2 < k; ++i2) {
546 | dst[i0*stride_0+i1] += src1[i0*(stride_1)+i2] * src2[i1*stride_2+i2];
547 | }
548 | }
549 | }
550 | }
551 |
552 | static void scale_ref_impl(data_t * dst, const data_t * src1, const data_t src2, int m, int n) {
553 | int i, j;
554 | for(i = 0; i < m*n; i++)
555 | dst[i] = src1[i] * src2;
556 | }
557 |
558 | static void add_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n) {
559 | int i, j;
560 | for(i = 0; i < m*n; i++)
561 | dst[i] = src1[i] + src2[i];
562 | }
563 |
    | // Reference row-wise "softmax" over m rows of length n, mirroring the
    | // parallel softmax_impl so CHECK_RES compares like against like.
    | // NOTE(review): max is seeded with FLT_MIN (smallest positive float), not
    | // -FLT_MAX, so an all-negative row keeps max == FLT_MIN -- confirm intended.
    | // NOTE(review): the final loop MULTIPLIES by sum; textbook softmax divides
    | // (dst /= sum). The parallel implementation does the same, so the ref-vs-MPI
    | // comparison still matches, but the result is not a normalised softmax.
564 | static void softmax_ref_impl(data_t * dst, const data_t * src, int m, int n) {
565 | int i, j;
566 | data_t max, sum;
567 | for(i = 0; i < m; i++) {
568 | max = FLT_MIN;
569 | for(j = 0; j < n; j++)
570 | max = (max > src[i*n+j]) ? max : src[i*n+j];
571 |
572 | sum = 0.0;
573 | for(j = 0; j < n; j++) {
574 | const data_t e = expf(src[i*n+j] - max); // subtract row max for numerical stability
575 | sum += e;
576 | dst[i*n+j] = e;
577 | }
578 |
579 | for(j = 0; j < n; j++) {
580 | dst[i*n+j] *= sum;
581 | }
582 | }
583 | }
584 |
585 |
    | // Type-erased entry points for the serial reference kernels: validate
    | // pointers, then dispatch to the data_t implementations above. data_type is
    | // unused -- the concrete element type is fixed at compile time via DATATYPE.
586 | void gemm_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
587 | assert(dst);
588 | assert(src1);
589 | assert(src2);
590 | gemm_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
591 | }
592 |
    | // Reference GEMM with src2 accessed transposed.
593 | void gemm_t_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
594 | assert(dst);
595 | assert(src1);
596 | assert(src2);
597 | gemm_t_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
598 | }
599 |
    | // Reference elementwise scale; src2 points to a single scalar factor.
600 | void scale_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
601 | assert(dst);
602 | assert(src1);
603 | assert(src2);
604 | scale_ref_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
605 | }
606 |
    | // Reference elementwise addition.
607 | void add_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
608 | assert(dst);
609 | assert(src1);
610 | assert(src2);
611 | add_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
612 | }
613 |
614 |
    | // Reference row-wise softmax.
615 | void softmax_ref(void * dst, void * src, data_type_e data_type, int m, int n) {
616 | assert(dst);
617 | assert(src);
618 | softmax_ref_impl(((data_t*)dst), ((data_t*)src), m, n);
619 | }
620 |
621 | #endif
622 |
--------------------------------------------------------------------------------
/demo/sst/software/gemm_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #ifndef TILE_SIZE
15 | #define TILE_SIZE 16
16 | #endif
17 |
18 | //FP32
19 | //#define DATATYPE 0
20 | //FP64
21 | //#define DATATYPE 1
22 | //I8
23 | //#define DATATYPE 2
24 | //I16
25 | //#define DATATYPE 3
26 | //I32
27 | #define DATATYPE 4
28 |
29 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | static data_type_e data_type = FP32;
34 | #define DATA_MIN FLT_MIN
35 | #define TYPE_IS_FP
36 | #elif DATATYPE == 1
37 | typedef double data_t;
38 | static data_type_e data_type = FP64;
39 | #define DATA_MIN DBL_MIN
40 | #define TYPE_IS_FP
41 | #elif DATATYPE == 2
42 | typedef int8_t data_t;
43 | static data_type_e data_type = I8;
44 | #define DATA_MIN CHAR_MIN
45 | #define TYPE_IS_INT
46 | #elif DATATYPE == 3
47 | typedef int16_t data_t;
48 | static data_type_e data_type = I16;
49 | #define DATA_MIN SHRT_MIN
50 | #define TYPE_IS_INT
51 | #elif DATATYPE == 4
52 | typedef int32_t data_t;
53 | static data_type_e data_type = I32;
54 | #define DATA_MIN INT_MIN
55 | #define TYPE_IS_INT
56 | #else
57 | #error Unsupported choice setting
58 | #endif
59 |
60 |
61 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
62 |
63 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
64 | int stride_0, int stride_1, int stride_2);
65 | int main(int argc, char ** argv) {
66 |
67 | if(argc != 4) {
68 | fprintf(stderr, "Usage: %s M N K\n", argv[0]);
69 | exit(EXIT_FAILURE);
70 | }
71 |
72 | struct timespec start, end;
73 | double time_elapsed_s;
74 | int m = atoi(argv[1]);
75 | int n = atoi(argv[3]);
76 | int k = atoi(argv[2]);
77 |
78 | fprintf(stdout, "M: %d, N: %d, K: %d\n", m, n, k);
79 |
80 | data_t * A = NULL;
81 | data_t * B = NULL;
82 | data_t * C = NULL;
83 |
84 | srand(time(NULL));
85 |
86 | clock_gettime(CLOCK_MONOTONIC, &start);
87 |
88 | A = calloc(m*k, sizeof(data_t));
89 | B = calloc(k*n, sizeof(data_t));
90 | C = calloc(m*n, sizeof(data_t));
91 |
92 |
93 | init_random_tensor(A, data_type, m*k);
94 | init_random_tensor(B, data_type, n*k);
95 |
96 | clock_gettime(CLOCK_MONOTONIC, &end);
97 |
98 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
99 | printf("Init time: %.2f ms\n", time_elapsed_s * 1000);
100 |
101 | clock_gettime(CLOCK_MONOTONIC, &start);
102 | /* MHA */
103 |
104 | gemm(C, A, B, data_type, 1, m, n, k, n, k, n);
105 |
106 | clock_gettime(CLOCK_MONOTONIC, &end);
107 |
108 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
109 | const uint64_t flop_count = m*n*k*2;
110 |
111 | printf("Execution time: %.2f ms flop count: %lu\n", time_elapsed_s * 1000, flop_count);
112 |
113 | free(A);
114 | free(B);
115 | free(C);
116 |
117 | return 0;
118 | }
119 |
120 | static size_t get_element_size(data_type_e type) {
121 | size_t size;
122 | switch (type) {
123 | case I8:
124 | size = sizeof(uint8_t);
125 | break;
126 | case I16:
127 | size = sizeof(uint16_t);
128 | break;
129 | case I32:
130 | case FP32:
131 | size = sizeof(uint32_t);
132 | break;
133 | case FP64:
134 | size = sizeof(uint64_t);
135 | break;
136 | default:
137 | size = -1;
138 | break;
139 | }
140 |
141 | assert(size > 0 && "data type unknown");
142 |
143 | return size;
144 | }
145 |
146 |
147 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
148 | const data_t range = 10;
149 | #pragma omp parallel for shared (tensor)
150 | for(int i = 0; i < nmemb; i++)
151 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
152 | }
153 |
154 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
155 | assert(tensor);
156 | init_random_tensor_impl(((data_t*)tensor), nmemb);
157 | }
158 |
159 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
160 | int stride_0, int stride_1, int stride_2) {
161 | const int bsize = TILE_SIZE;
162 | int ii0, ii1, ii2;
163 | int i0, i1, i2;
164 | int h;
165 | int start_head, stop_head;
166 | data_t pp;
167 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
168 | for (h=0; h < heads; h++) {
169 | for (ii0 = 0; ii0
2 | #include
3 |
4 | int main(int argc, char ** argv) {
5 |
6 | int n_ranks, rid;
7 |
8 | printf("Initializing MPI\n");
9 |
10 | MPI_Init(&argc, &argv);
11 | MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &rid);
13 |
14 | printf("Hello from process %d out of %d\n", rid, n_ranks);
15 |
16 | MPI_Finalize();
17 |
18 | return 0;
19 | }
20 |
--------------------------------------------------------------------------------
/demo/sst/software/hello_MPI_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 |
6 | int main(int argc, char ** argv) {
7 |
8 | int n_ranks, rid;
9 | int n_threads = 0, tid = 0;
10 |
11 |
12 | printf("Initializing MPI\n");
13 |
14 | MPI_Init(&argc, &argv);
15 | MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
16 | MPI_Comm_rank(MPI_COMM_WORLD, &rid);
17 |
18 | printf("[rank %d] Entering OMP section\n", rid);
19 |
20 | #pragma omp parallel private(tid, n_threads)
21 | {
22 | n_threads = omp_get_num_threads();
23 | tid = omp_get_thread_num();
24 | printf("Hello from thread %d out of %d from process %d out of %d\n",
25 | tid, n_threads, rid, n_ranks);
26 | }
27 |
28 | MPI_Finalize();
29 |
30 | return 0;
31 | }
32 |
--------------------------------------------------------------------------------
/demo/sst/software/mha_MPI_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
14 |
15 | #define WORLD MPI_COMM_WORLD
16 |
17 | //FP32
18 | //#define DATATYPE 0
19 | //FP64
20 | //#define DATATYPE 1
21 | //I8
22 | //#define DATATYPE 2
23 | //I16
24 | //#define DATATYPE 3
25 | //I32
26 | #define DATATYPE 4
27 |
28 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
29 |
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | MPI_Datatype mpi_data_type = MPI_FLOAT;
34 | static data_type_e data_type = FP32;
35 | #define DATA_MIN FLT_MIN
36 | #define TYPE_IS_FP
37 | #elif DATATYPE == 1
38 | typedef double data_t;
39 | MPI_Datatype mpi_data_type = MPI_DOUBLE;
40 | static data_type_e data_type = FP64;
41 | #define DATA_MIN DBL_MIN
42 | #define TYPE_IS_FP
43 | #elif DATATYPE == 2
44 | typedef int8_t data_t;
45 | MPI_Datatype mpi_data_type = MPI_INT8_T;
46 | static data_type_e data_type = I8;
47 | #define DATA_MIN CHAR_MIN
48 | #define TYPE_IS_INT
49 | #elif DATATYPE == 3
50 | typedef int16_t data_t;
51 | MPI_Datatype mpi_data_type = MPI_INT16_T;
52 | static data_type_e data_type = I16;
53 | #define DATA_MIN SHRT_MIN
54 | #define TYPE_IS_INT
55 | #elif DATATYPE == 4
56 | typedef int32_t data_t;
57 | MPI_Datatype mpi_data_type = MPI_INT32_T;
58 | static data_type_e data_type = I32;
59 | #define DATA_MIN INT_MIN
60 | #define TYPE_IS_INT
61 | #else
62 | #error Unsupported choice setting
63 | #endif
64 |
65 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
66 |
67 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
68 | int stride_0, int stride_1, int stride_2);
69 |
70 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
71 | int stride_0, int stride_1, int stride_2);
72 |
73 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
74 |
75 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
76 |
77 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
78 |
79 | int main(int argc, char ** argv) {
80 | const int root = 0;
81 | int n_ranks, rank;
82 |
83 | MPI_Init(&argc, &argv);
84 | MPI_Comm_size(WORLD, &n_ranks);
85 | MPI_Comm_rank(WORLD, &rank);
86 |
87 | MPI_Datatype col, col_type;
88 |
89 | if(argc != 4) {
90 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
91 | exit(EXIT_FAILURE);
92 | }
93 |
94 | struct timespec start, end;
95 | double time_elapsed_s;
96 | int dmodel = atoi(argv[1]);
97 | int h = atoi(argv[3]);
98 | int S = atoi(argv[2]);
99 | int dk = dmodel/h;
100 | int dv = dmodel/h;
101 |
102 | if((dmodel%n_ranks) != 0) {
103 | fprintf(stderr, "Error: dmodel must be a multiple of the number of ranks (dmodel: %d, rank: %d)\n", dmodel, rank);
104 | exit(EXIT_FAILURE);
105 | }
106 |
107 | if(n_ranks > h || (h % n_ranks) != 0) {
108 | fprintf(stderr, "Error: the number of heads must be a multiple of the number of MPI \
109 | ranks and the number of heads must be equal or greater than the \
110 | number of ranks. (heads = %d, n_ranks=%s\n", h, n_ranks);
111 | exit(EXIT_FAILURE);
112 | }
113 |
114 | fprintf(stdout, "Model dim: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
115 |
116 | MPI_Type_vector(dmodel, dmodel/n_ranks, dmodel, mpi_data_type, &col);
117 | MPI_Type_commit(&col);
118 | MPI_Type_create_resized(col, 0, dmodel/n_ranks*sizeof(data_t), &col_type);
119 | MPI_Type_commit(&col_type);
120 |
121 | data_t *embeddings = NULL;
122 | data_t * Qw = NULL;
123 | data_t * Kw = NULL;
124 | data_t * Vw = NULL;
125 |
126 | data_t * Qw_heads = NULL;
127 | data_t * Kw_heads = NULL;
128 | data_t * Vw_heads = NULL;
129 |
130 | data_t * Q = NULL;
131 | data_t * K = NULL;
132 | data_t * V = NULL;
133 |
134 | data_t * KQ = NULL;
135 | data_t * softmax_out = NULL;
136 |
137 | data_t * QKV = NULL;
138 |
139 | data_t * ATTNw = NULL;
140 | data_t * ATTNout = NULL;
141 |
142 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
143 |
144 | srand(time(NULL));
145 |
146 | clock_gettime(CLOCK_MONOTONIC, &start);
147 |
148 | embeddings = calloc(dmodel*S, sizeof(data_t));
149 |
150 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
151 |
152 | if(rank == root) {
153 | init_random_tensor(embeddings, data_type, dmodel*S);
154 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
155 | }
156 |
157 | MPI_Bcast(embeddings, dmodel*S, mpi_data_type, root, WORLD);
158 | MPI_Bcast(ATTNw, dmodel*dmodel, mpi_data_type, root, WORLD);
159 |
160 | if(rank == root) {
161 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
162 | init_random_tensor(Qw, data_type, dmodel*dmodel);
163 |
164 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
165 | init_random_tensor(Kw, data_type, dmodel*dmodel);
166 |
167 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
168 | init_random_tensor(Vw, data_type, dmodel*dmodel);
169 | }
170 |
171 | Qw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
172 | Kw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
173 | Vw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
174 |
175 | MPI_Scatter(Qw, 1, col_type, Qw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
176 | MPI_Scatter(Kw, 1, col_type, Kw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
177 | MPI_Scatter(Vw, 1, col_type, Vw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
178 |
179 | Q = calloc(S*dmodel/n_ranks, sizeof(data_t));
180 | memset(Q, 0, S*dmodel*sizeof(data_t)/n_ranks);
181 |
182 | K = calloc(S*dmodel/n_ranks, sizeof(data_t));
183 | memset(K, 0, S*dmodel*sizeof(data_t)/n_ranks);
184 |
185 | V = calloc(S*dmodel/n_ranks, sizeof(data_t));
186 | memset(V, 0, S*dmodel*sizeof(data_t)/n_ranks);
187 |
188 | KQ = calloc(h/n_ranks*S*S, sizeof(data_t));
189 | memset(KQ, 0, h/n_ranks*S*S*sizeof(data_t));
190 |
191 | softmax_out = calloc(h/n_ranks*S*S, sizeof(data_t));
192 | memset(softmax_out, 0, h/n_ranks*S*S*sizeof(data_t));
193 |
194 | QKV = calloc(S/n_ranks*dmodel, sizeof(data_t));
195 | memset(QKV, 0, S/n_ranks*dmodel*sizeof(data_t));
196 |
197 | ATTNout = calloc(S*dmodel, sizeof(data_t));
198 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
199 |
200 | clock_gettime(CLOCK_MONOTONIC, &end);
201 |
202 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
203 | printf("[rank: %d] Init time: %.2f ms\n", rank, time_elapsed_s * 1000);
204 |
205 | clock_gettime(CLOCK_MONOTONIC, &start);
206 | /* MHA */
207 |
208 | gemm(Q, embeddings, Qw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
209 | gemm(K, embeddings, Kw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
210 | gemm(V, embeddings, Vw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
211 |
212 | gemm_t(KQ, Q, K, data_type, h/n_ranks, S, S, dmodel/h, S, dmodel/n_ranks, dmodel/n_ranks);
213 |
214 | scale(KQ, KQ, ((void*)&scale_f), data_type, h/n_ranks*S, S);
215 |
216 | softmax(softmax_out, KQ, data_type, h/n_ranks*S, S);
217 |
218 | gemm(QKV, softmax_out, V, data_type, h/n_ranks, S, dmodel/h, S, dmodel/n_ranks, S, dmodel/n_ranks);
219 |
220 | gemm(ATTNout, QKV, ATTNw, data_type, 1, S, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel);
221 |
222 | add(&ATTNout[S/n_ranks*rank*dmodel], &ATTNout[S/n_ranks*rank*dmodel], &embeddings[S/n_ranks*rank*dmodel], data_type, S/n_ranks, dmodel);
223 |
224 | MPI_Allreduce(ATTNout, embeddings, S*dmodel, mpi_data_type, MPI_SUM, WORLD);
225 |
226 | clock_gettime(CLOCK_MONOTONIC, &end);
227 |
228 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
229 | const uint64_t flop_count = S/n_ranks * (dmodel * (8*dmodel + 4*S + 1) + 8*h*S) + S*dmodel;
230 |
231 | printf("[rank: %d] MHA execution time: %.2f ms flop count per rank: %lu\n", rank, time_elapsed_s * 1000, flop_count);
232 |
233 | if(rank == root) {
234 | free(Qw);
235 | free(Kw);
236 | free(Vw);
237 | }
238 |
239 | free(embeddings);
240 | free(Qw_heads);
241 | free(Kw_heads);
242 | free(Vw_heads);
243 | free(Q);
244 | free(K);
245 | free(V);
246 | free(KQ);
247 | free(softmax_out);
248 | free(QKV);
249 | free(ATTNw);
250 | free(ATTNout);
251 |
252 | MPI_Finalize();
253 |
254 | return 0;
255 | }
256 |
257 | static size_t get_element_size(data_type_e type) {
258 | size_t size;
259 | switch (type) {
260 | case I8:
261 | size = sizeof(uint8_t);
262 | break;
263 | case I16:
264 | size = sizeof(uint16_t);
265 | break;
266 | case I32:
267 | case FP32:
268 | size = sizeof(uint32_t);
269 | break;
270 | case FP64:
271 | size = sizeof(uint64_t);
272 | break;
273 | default:
274 | size = -1;
275 | break;
276 | }
277 |
278 | assert(size > 0 && "data type unknown");
279 |
280 | return size;
281 | }
282 |
283 |
284 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
285 | const data_t range = 10;
286 | #pragma omp parallel for shared (tensor)
287 | for(int i = 0; i < nmemb; i++)
288 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
289 | }
290 |
291 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
292 | assert(tensor);
293 | init_random_tensor_impl(((data_t*)tensor), nmemb);
294 | }
295 |
296 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
297 | int stride_0, int stride_1, int stride_2) {
298 | const int bsize = MIN(32,k);
299 | int ii0, ii1, ii2;
300 | int i0, i1, i2;
301 | int h;
302 | int start_head, stop_head;
303 | data_t pp;
304 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
305 | for (h=0; h < heads; h++) {
306 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
373 |
374 | sum = 0.0;
375 | for(j = 0; j < n; j++) {
376 | const data_t e = expf(src[i*n+j] - max);
377 | sum += e;
378 | dst[i*n+j] = e;
379 | }
380 |
381 | for(j = 0; j < n; j++) {
382 | dst[i*n+j] *= sum;
383 | }
384 | }
385 | }
386 |
387 |
388 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
389 | int stride_0, int stride_1, int stride_2) {
390 | assert(dst);
391 | assert(src1);
392 | assert(src2);
393 | assert(heads*n == stride_0);
394 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
395 | }
396 |
397 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
398 | int stride_0, int stride_1, int stride_2) {
399 | assert(dst);
400 | assert(src1);
401 | assert(src2);
402 | assert(heads*k == stride_1);
403 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
404 | }
405 |
406 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
407 | assert(dst);
408 | assert(src1);
409 | assert(src2);
410 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
411 | }
412 |
413 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
414 | assert(dst);
415 | assert(src1);
416 | assert(src2);
417 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
418 | }
419 |
420 |
421 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
422 | assert(dst);
423 | assert(src);
424 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
425 | }
426 |
--------------------------------------------------------------------------------
/demo/sst/software/mha_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #ifndef TILE_SIZE
15 | #define TILE_SIZE 16
16 | #endif
17 |
18 | //FP32
19 | //#define DATATYPE 0
20 | //FP64
21 | //#define DATATYPE 1
22 | //I8
23 | //#define DATATYPE 2
24 | //I16
25 | //#define DATATYPE 3
26 | //I32
27 | #define DATATYPE 4
28 |
29 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | static data_type_e data_type = FP32;
34 | #define DATA_MIN FLT_MIN
35 | #define TYPE_IS_FP
36 | #elif DATATYPE == 1
37 | typedef double data_t;
38 | static data_type_e data_type = FP64;
39 | #define DATA_MIN DBL_MIN
40 | #define TYPE_IS_FP
41 | #elif DATATYPE == 2
42 | typedef int8_t data_t;
43 | static data_type_e data_type = I8;
44 | #define DATA_MIN CHAR_MIN
45 | #define TYPE_IS_INT
46 | #elif DATATYPE == 3
47 | typedef int16_t data_t;
48 | static data_type_e data_type = I16;
49 | #define DATA_MIN SHRT_MIN
50 | #define TYPE_IS_INT
51 | #elif DATATYPE == 4
52 | typedef int32_t data_t;
53 | static data_type_e data_type = I32;
54 | #define DATA_MIN INT_MIN
55 | #define TYPE_IS_INT
56 | #else
57 | #error Unsupported choice setting
58 | #endif
59 |
60 |
61 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
62 |
63 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
64 | int stride_0, int stride_1, int stride_2);
65 |
66 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
67 | int stride_0, int stride_1, int stride_2);
68 |
69 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
70 |
71 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
72 |
73 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
74 |
75 | int main(int argc, char ** argv) {
76 |
77 | if(argc != 4) {
78 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
79 | exit(EXIT_FAILURE);
80 | }
81 |
82 | struct timespec start, end;
83 | double time_elapsed_s;
84 | int dmodel = atoi(argv[1]);
85 | int h = atoi(argv[3]);
86 | int S = atoi(argv[2]);
87 | int dk = dmodel/h;
88 | int dv = dmodel/h;
89 |
90 | fprintf(stdout, "Model n_ranks: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
91 |
92 | data_t *embeddings = NULL;
93 | data_t * Qw = NULL;
94 | data_t * Kw = NULL;
95 | data_t * Vw = NULL;
96 |
97 | data_t * Qw_heads = NULL;
98 | data_t * Kw_heads = NULL;
99 | data_t * Vw_heads = NULL;
100 |
101 | data_t * Q = NULL;
102 | data_t * K = NULL;
103 | data_t * V = NULL;
104 |
105 | data_t * KQ = NULL;
106 | data_t * softmax_out = NULL;
107 |
108 | data_t * QKV = NULL;
109 |
110 | data_t * ATTNw = NULL;
111 | data_t * ATTNout = NULL;
112 |
113 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
114 |
115 | srand(time(NULL));
116 |
117 | clock_gettime(CLOCK_MONOTONIC, &start);
118 |
119 | embeddings = calloc(dmodel*S, sizeof(data_t));
120 |
121 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
122 |
123 | init_random_tensor(embeddings, data_type, dmodel*S);
124 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
125 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
126 | init_random_tensor(Qw, data_type, dmodel*dmodel);
127 |
128 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
129 | init_random_tensor(Kw, data_type, dmodel*dmodel);
130 |
131 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
132 | init_random_tensor(Vw, data_type, dmodel*dmodel);
133 |
134 | Q = calloc(S*dmodel, sizeof(data_t));
135 | memset(Q, 0, S*dmodel*sizeof(data_t));
136 |
137 | K = calloc(S*dmodel, sizeof(data_t));
138 | memset(K, 0, S*dmodel*sizeof(data_t));
139 |
140 | V = calloc(S*dmodel, sizeof(data_t));
141 | memset(V, 0, S*dmodel*sizeof(data_t));
142 |
143 | KQ = calloc(h*S*S, sizeof(data_t));
144 | memset(KQ, 0, h*S*S*sizeof(data_t));
145 |
146 | softmax_out = calloc(h*S*S, sizeof(data_t));
147 | memset(softmax_out, 0, h*S*S*sizeof(data_t));
148 |
149 | QKV = calloc(S*dmodel, sizeof(data_t));
150 | memset(QKV, 0, S*dmodel*sizeof(data_t));
151 |
152 | ATTNout = calloc(S*dmodel, sizeof(data_t));
153 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
154 |
155 | clock_gettime(CLOCK_MONOTONIC, &end);
156 |
157 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
158 | printf("Init time: %.2f ms\n", time_elapsed_s * 1000);
159 |
160 | clock_gettime(CLOCK_MONOTONIC, &start);
161 | /* MHA */
162 |
163 | gemm(Q, embeddings, Qw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
164 | gemm(K, embeddings, Kw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
165 | gemm(V, embeddings, Vw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
166 |
167 | gemm_t(KQ, Q, K, data_type, h, S, S, dmodel/h, S, dmodel, dmodel);
168 |
169 | scale(KQ, KQ, ((void*)&scale_f), data_type, h*S, S);
170 |
171 | softmax(softmax_out, KQ, data_type, h*S, S);
172 |
173 |
174 | gemm(QKV, softmax_out, V, data_type, h, S, dmodel/h, S, dmodel, S, dmodel);
175 |
176 | gemm(ATTNout, QKV, ATTNw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
177 |
178 | add(embeddings, ATTNout, embeddings, data_type, S, dmodel);
179 |
180 |
181 | clock_gettime(CLOCK_MONOTONIC, &end);
182 |
183 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
184 | const uint64_t flop_count = S * (dmodel * (8*dmodel + 4*S + 1) + 8*h*S);
185 |
186 | printf("MHA execution time: %.2f ms flop count: %lu\n", time_elapsed_s * 1000, flop_count);
187 |
188 | free(Qw);
189 | free(Kw);
190 | free(Vw);
191 |
192 | free(embeddings);
193 | free(Qw_heads);
194 | free(Kw_heads);
195 | free(Vw_heads);
196 | free(Q);
197 | free(K);
198 | free(V);
199 | free(KQ);
200 | free(softmax_out);
201 | free(QKV);
202 | free(ATTNw);
203 | free(ATTNout);
204 |
205 | return 0;
206 | }
207 |
208 | static size_t get_element_size(data_type_e type) {
209 | size_t size;
210 | switch (type) {
211 | case I8:
212 | size = sizeof(uint8_t);
213 | break;
214 | case I16:
215 | size = sizeof(uint16_t);
216 | break;
217 | case I32:
218 | case FP32:
219 | size = sizeof(uint32_t);
220 | break;
221 | case FP64:
222 | size = sizeof(uint64_t);
223 | break;
224 | default:
225 | size = -1;
226 | break;
227 | }
228 |
229 | assert(size > 0 && "data type unknown");
230 |
231 | return size;
232 | }
233 |
234 |
235 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
236 | const data_t range = 10;
237 | #pragma omp parallel for shared (tensor)
238 | for(int i = 0; i < nmemb; i++)
239 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
240 | }
241 |
242 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
243 | assert(tensor);
244 | init_random_tensor_impl(((data_t*)tensor), nmemb);
245 | }
246 |
247 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
248 | int stride_0, int stride_1, int stride_2) {
249 | const int bsize = TILE_SIZE;
250 | int ii0, ii1, ii2;
251 | int i0, i1, i2;
252 | int h;
253 | int start_head, stop_head;
254 | data_t pp;
255 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
256 | for (h=0; h < heads; h++) {
257 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
324 |
325 | sum = 0.0;
326 | for(j = 0; j < n; j++) {
327 | const data_t e = expf(src[i*n+j] - max);
328 | sum += e;
329 | dst[i*n+j] = e;
330 | }
331 |
332 | for(j = 0; j < n; j++) {
333 | dst[i*n+j] *= sum;
334 | }
335 | }
336 | }
337 |
338 |
339 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
340 | int stride_0, int stride_1, int stride_2) {
341 | assert(dst);
342 | assert(src1);
343 | assert(src2);
344 | assert(heads*n == stride_0);
345 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
346 | }
347 |
348 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
349 | int stride_0, int stride_1, int stride_2) {
350 | assert(dst);
351 | assert(src1);
352 | assert(src2);
353 | assert(heads*k == stride_1);
354 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
355 | }
356 |
357 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
358 | assert(dst);
359 | assert(src1);
360 | assert(src2);
361 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
362 | }
363 |
364 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
365 | assert(dst);
366 | assert(src1);
367 | assert(src2);
368 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
369 | }
370 |
371 |
372 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
373 | assert(dst);
374 | assert(src);
375 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
376 | }
377 |
--------------------------------------------------------------------------------
/demo/sst/software/riscv64.make:
--------------------------------------------------------------------------------
# Cross-compilation makefile for the riscv64 (musl) SST/Vanadis targets.
# Requires RV64_GNU_INSTALL (gcc toolchain) and MVAPICH2_INSTALL_DIR (mpicc).
ARCH= riscv64
CC=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-gcc
MPICC=$(MVAPICH2_INSTALL_DIR)/bin/mpicc
CFLAGS=-O3 -fopenmp
LDFLAGS=-static -lm

# One mha_OMP binary is built per tile size.
TILE_SIZES=8 16 32 64
MHA_OMP_BINS=$(addprefix $(ARCH)/mha_OMP_,$(TILE_SIZES))

.PHONY: all clean
all : $(MHA_OMP_BINS) $(ARCH)/mha_MPI_OMP $(ARCH)/hello_MPI_OMP

# Pattern rule: mha_OMP_<N> is mha_OMP.c compiled with -DTILE_SIZE=<N>.
$(ARCH)/mha_OMP_% : mha_OMP.c
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -DTILE_SIZE=$* -o $@ $^ $(LDFLAGS)

$(ARCH)/mha_MPI_OMP : mha_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

$(ARCH)/hello_MPI_OMP : hello_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(ARCH)
38 |
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/gemm_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/gemm_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/hello_MPI:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/hello_MPI
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/hello_MPI_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/hello_MPI_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_MPI_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_MPI_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_16:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_16
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_32:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_32
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_64
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_8:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_8
--------------------------------------------------------------------------------
/demo/sst/software/x86.make:
--------------------------------------------------------------------------------
# Native x86 build configuration for the tutorial software.
ARCH=x86
CFLAGS=-O3 -fopenmp
LDFLAGS=-lm

# The host compiler and the MPI compiler wrapper must be supplied by the
# caller, e.g.:  make -f x86.make CC=gcc MPICC=mpicc
# NOTE(review): make predefines CC (default "cc"), so this ifndef guard may
# never trigger in practice — confirm whether it should use `origin`.
ifndef CC
$(error CC is not set)
endif

ifndef MPICC
$(error MPICC is not set)
endif

.PHONY: all clean
all : $(ARCH)/mha_OMP $(ARCH)/mha_MPI_OMP $(ARCH)/check_mpi $(ARCH)/hello_MPI_OMP

# OpenMP-only multi-head-attention benchmark.
$(ARCH)/mha_OMP : mha_OMP.c
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# Hybrid MPI+OpenMP multi-head-attention benchmark.
$(ARCH)/mha_MPI_OMP : mha_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# Presumably a sanity check for the MPI installation — see check_mpi.c.
$(ARCH)/check_mpi : check_mpi.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

$(ARCH)/hello_MPI_OMP : hello_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(ARCH)
34 |
--------------------------------------------------------------------------------
/docs/512px-LOGO-IMEC_black.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/512px-LOGO-IMEC_black.svg.png
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
## This source code is licensed under the MIT license found in the
## LICENSE file in the root directory of this source tree.
##
## Copyright (c) 2025 IMEC. All rights reserved.
## ******************************************************************************
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'Scalable System Simulations Tutorial'
copyright = '2024-2025, imec vzw'
author = 'imec vzw'
release = '0.1'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# sphinx_rtd_theme supplies the "Read the Docs" HTML theme;
# sphinxcontrib.bibtex enables BibTeX citations from references.bib.
extensions = ['sphinx_rtd_theme', 'sphinxcontrib.bibtex']

templates_path = ['_templates']
exclude_patterns = []
# Bibliography database consumed by sphinxcontrib-bibtex.
bibtex_bibfiles = ['references.bib']



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

# html_theme = 'alabaster'
html_theme = "sphinx_rtd_theme"
html_static_path = ['_static']
html_logo = "512px-LOGO-IMEC_black.svg.png"
html_theme_options = {
    'logo_only': False,
    # NOTE(review): 'display_version' was removed in sphinx_rtd_theme 3.x and
    # requirements.txt pins 3.0.2, so this option is likely ignored (and may
    # emit an "unsupported theme option" warning) — confirm against the
    # theme's changelog.
    'display_version': False,
}
# Extra stylesheet served from _static (e.g. sizing for inline SVG figures).
html_css_files = ['custom_svg.css']
# Number figures/tables/code blocks so :numref: references work.
numfig = True
43 |
--------------------------------------------------------------------------------
/docs/gem5.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../demo/gem5/README.rst
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. vlsid_2025 documentation master file, created by
2 | sphinx-quickstart on Fri Dec 6 11:47:00 2024.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | A tutorial on scalable system simulations for RISC-V architectures and performance analysis for machine learning workloads
7 | ##########################################################################################################################
8 | This tutorial aims to provide a comprehensive introduction to computer system simulations
9 | and performance analysis, focusing on the integration and application of consensual open-source frameworks
10 | like Structural Simulation Toolkit (SST), Gem5, and Multi-Level Intermediate Representation (MLIR).
11 | Participants will gain hands-on experience in conducting an architectural design exploration with a pragmatic approach
12 | where the simulation framework used is chosen based on a trade-off between fidelity and scalability requirements. By the end of the tutorial, participants will be equipped with the skills necessary to conduct in-depth performance analysis and optimize complex systems using state-of-the-art tools.
13 |
14 |
15 |
16 | .. toctree::
17 | :maxdepth: 3
18 | :caption: Contents:
19 |
20 |
21 | gem5
22 | sst
23 | About us - CSA, imec
24 |
25 |
--------------------------------------------------------------------------------
/docs/references.bib:
--------------------------------------------------------------------------------
1 | @article{DBLP:journals/corr/VaswaniSPUJGKP17,
2 | author = {Ashish Vaswani and
3 | Noam Shazeer and
4 | Niki Parmar and
5 | Jakob Uszkoreit and
6 | Llion Jones and
7 | Aidan N. Gomez and
8 | Lukasz Kaiser and
9 | Illia Polosukhin},
10 | title = {Attention Is All You Need},
11 | journal = {CoRR},
12 | volume = {abs/1706.03762},
13 | year = {2017},
14 | url = {http://arxiv.org/abs/1706.03762},
15 | eprinttype = {arXiv},
16 | eprint = {1706.03762},
17 | timestamp = {Sat, 23 Jan 2021 01:20:40 +0100},
18 | biburl = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
19 | bibsource = {dblp computer science bibliography, https://dblp.org}
20 | }
21 |
--------------------------------------------------------------------------------
/docs/slides/2025_05_ISPASS_Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/slides/2025_05_ISPASS_Presentation.pdf
--------------------------------------------------------------------------------
/docs/slides/VLSID25_Tutorial_Slides_imec_CSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/slides/VLSID25_Tutorial_Slides_imec_CSA.pdf
--------------------------------------------------------------------------------
/docs/sst.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../external/INSTALL.rst
2 | .. include:: ../demo/sst/README.rst
3 |
--------------------------------------------------------------------------------
/external/.gitignore:
--------------------------------------------------------------------------------
1 | mvapich2-2.3.7-1
2 |
--------------------------------------------------------------------------------
/external/INSTALL.rst:
--------------------------------------------------------------------------------
1 | .. _Installation instructions:
2 |
3 | Installation instructions for scale-out system simulation
4 | ==========================================================
5 |
6 | To run the demo on your side you need to install at least SST.
7 | The rv64 binaries are already compiled. However, if you want to compile new applications you must install the mpi compiler as described below.
8 |
9 | Run the following command to download the required sub-modules:
10 |
11 | .. code:: bash
12 |
   git submodule update --init
14 |
15 | Install instructions for SST
16 | ----------------------------
17 |
You must first install **SST-core**. To do this, run the following commands in a terminal:
19 |
20 | .. code:: bash
21 |
22 | cd sst/sst-core
23 | export SST_CORE_HOME=$(pwd)/install
24 | ./autogen.sh
25 | mkdir build
26 | cd build
27 | ../configure --prefix=$SST_CORE_HOME
28 | make -j all
29 | make install
30 | export PATH=$SST_CORE_HOME/bin:$PATH
31 | cd ../../../
32 |
33 | Then, you can install **SST-elements** as follow:
34 |
35 | .. code:: bash
36 |
37 | cd sst/sst-elements
38 | git apply ../../demo/sst/docker/sst-elements.patch
39 | export SST_ELEMENTS_HOME=$(pwd)/install
40 | ./autogen.sh
41 | mkdir build
42 | cd build
43 | ../configure --prefix=$SST_ELEMENTS_HOME --with-sst-core=$SST_CORE_HOME
44 | make -j all
45 | make install
46 | cd ../../../
47 |
48 | Install instructions for rv64 mpi compiler
49 | ------------------------------------------
50 | The first step is to install **riscv64-unknown-linux-musl-gcc**. To do this, run the following commands in a terminal:
51 |
52 | .. code:: bash
53 |
54 | cd riscv-gnu-toolchain
55 | export RV64_GNU_INSTALL=$(pwd)/install
56 | CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" ./configure --prefix=$RV64_GNU_INSTALL --disable-multilib --with-languages=c,c++
57 | make -j8 musl
58 |
59 | Then, you must build the RDMA library
60 |
61 | .. code:: bash
62 |
63 | cd sst/libRDMA
64 | make
65 |
66 | Finally, you can build and install **mpicc** as follow:
67 |
68 | .. code:: bash
69 |
70 | export RDMA_NIC_DIR=$(realpath ./sst/sst-elements/src/sst/elements/rdmaNic)
71 | export RDMA_LIB_DIR=$(realpath ./sst/libRDMA/riscv64/)
72 |
73 | tar xzvf mvapich2-2.3.7-1.tar.gz
74 | ulimit -n 4096
75 | patch --directory=mvapich2-2.3.7-1/ -p1 < mvapich2-2.3.7-1.patch
76 |
77 | cd mvapich2-2.3.7-1/
78 | ./autogen.sh
79 |
80 | mkdir install
81 | mkdir build
82 |
83 | export MVAPICH2_INSTALL_DIR=$(pwd)/install
84 |
85 | cd build
86 |
87 | ../configure \
88 | --prefix=${MVAPICH2_INSTALL_DIR} \
89 | --enable-fortran=no \
90 | --with-device=ch3:rdma \
91 | --enable-romio=no \
92 | --enable-hybrid=no \
93 | --enable-shared=no \
94 | --enable-static=yes \
95 | --with-pmi=vanadis \
96 | --with-pm=none \
97 | --enable-threads=single \
98 | --enable-rsh=yes \
99 | --host=riscv64-unknown-linux-musl \
100 | CC=${RV64_GNU_INSTALL}/bin/riscv64-unknown-linux-musl-gcc \
101 | CFLAGS="-I${RDMA_NIC_DIR}/tests/app/rdma/include -I${RDMA_NIC_DIR} -fPIC" \
102 | CXX=${RV64_GNU_INSTALL}/bin/riscv64-unknown-linux-musl-g++ \
103 | CXXFLAGS="-I${RDMA_NIC_DIR}/tests/app/rdma/include -I${RDMA_NIC_DIR} -fPIC" \
104 | LDFLAGS="-L${RDMA_LIB_DIR}" \
105 | LIBS=-lrdma
106 |
107 | make -j8 install
108 |
--------------------------------------------------------------------------------
/external/mvapich2-2.3.7-1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/external/mvapich2-2.3.7-1.tar.gz
--------------------------------------------------------------------------------
/external/sst/libRDMA/Makefile:
--------------------------------------------------------------------------------
# Build librdma.a (static RDMA helper library) for riscv64 against the
# rdmaNic headers shipped with sst-elements.
ARCH = riscv64
ADDR_TYPE ?= uint64_t
PRIxBITS ?= PRIx64
PRIuBITS ?= PRIu64

AR=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-ar
CC=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-gcc

RDMADIR=$(abspath ../sst-elements/src/sst/elements/rdmaNic/)
APPDIR=$(RDMADIR)/tests/app/rdma/

CFLAGS=-I$(APPDIR)/include -I$(RDMADIR) -Wattributes -Wall -DADDR_TYPE=$(ADDR_TYPE) -DPRIxBITS=$(PRIxBITS) -DPRIuBITS=$(PRIuBITS)
# (The unused LIBS=-lrdma -Lriscv64 variable was removed: nothing in this
# makefile links an executable.)

OBJS=riscv64/base.o riscv64/rdma.o


.PHONY: all clean librdma.a

all: riscv64/librdma.a

# Backward-compatible alias so `make librdma.a` keeps working.
librdma.a: riscv64/librdma.a

# BUG FIX: the archive is written to riscv64/librdma.a, so the target must be
# that path. The old rule was named plain `librdma.a`, which never matched the
# file it produced and therefore re-ran the archiver on every invocation.
riscv64/librdma.a: ${OBJS}
	$(AR) rcs $@ $^

riscv64/rdma.o: $(APPDIR)/src/rdma.c $(APPDIR)/include/rdma.h $(RDMADIR)/rdmaNicHostInterface.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

riscv64/base.o: $(APPDIR)/src/base.c $(APPDIR)/include/base.h $(RDMADIR)/rdmaNicHostInterface.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -rf riscv64
34 |
35 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==8.1.3
2 | sphinx_rtd_theme==3.0.2
3 | sphinxcontrib.bibtex==2.6.3
4 |
--------------------------------------------------------------------------------