├── .github
└── workflows
│ └── deploy-docs.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── Makefile
├── README.md
├── demo
├── gem5
│ ├── .gitignore
│ ├── README.rst
│ ├── convert_onnx_model.py
│ ├── docker
│ │ ├── Dockerfile
│ │ └── docker-compose.yaml
│ ├── iree
│ │ ├── Makefile
│ │ └── toolchain.generic.cmake
│ └── vlsid-riscv-fs.py
└── sst
│ ├── README.rst
│ ├── docker
│ ├── Dockerfile
│ ├── docker-compose.yaml
│ └── sst-elements.patch
│ ├── instruction-level-simulation
│ ├── .gitignore
│ ├── scale_out.py
│ └── scale_up.py
│ ├── packet-level-simulation
│ ├── large_config.json
│ ├── small_config.json
│ └── training_llm.py
│ └── software
│ ├── .gitignore
│ ├── Makefile
│ ├── check_mpi.c
│ ├── gemm_OMP.c
│ ├── hello_MPI.c
│ ├── hello_MPI_OMP.c
│ ├── mha_MPI_OMP.c
│ ├── mha_OMP.c
│ ├── riscv64.make
│ ├── riscv64
│ ├── gemm_OMP
│ ├── hello_MPI
│ ├── hello_MPI_OMP
│ ├── mha_MPI_OMP
│ ├── mha_OMP_16
│ ├── mha_OMP_32
│ ├── mha_OMP_64
│ └── mha_OMP_8
│ └── x86.make
├── docs
├── 512px-LOGO-IMEC_black.svg.png
├── conf.py
├── gem5.rst
├── images
│ ├── gem5
│ │ ├── gem5-system.svg
│ │ └── mnist-8.svg
│ ├── sst
│ │ ├── core.svg
│ │ ├── cpu.svg
│ │ ├── mha.svg
│ │ ├── mha_mpi.svg
│ │ ├── node.svg
│ │ └── system.svg
│ └── transformer
│ │ ├── 3d_parallelism_1.svg
│ │ ├── 3d_parallelism_2.svg
│ │ ├── data_parallelism.svg
│ │ ├── pipeline_parallelism_1.svg
│ │ ├── pipeline_parallelism_2.svg
│ │ ├── tensor_parallelism.svg
│ │ └── transformer_arch.svg
├── index.rst
├── references.bib
├── slides
│ ├── 2025_05_ISPASS_Presentation.pdf
│ └── VLSID25_Tutorial_Slides_imec_CSA.pdf
└── sst.rst
├── external
├── .gitignore
├── INSTALL.rst
├── mvapich2-2.3.7-1.patch
├── mvapich2-2.3.7-1.tar.gz
└── sst
│ └── libRDMA
│ └── Makefile
└── requirements.txt
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy Docs
2 |
3 | on:
4 | push:
5 | branches:
6 | - main # Run workflow on pushes to the main branch
7 |
8 | jobs:
9 | build-deploy:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | # Step 1: Check out the repository
14 | - name: Checkout repository
15 | uses: actions/checkout@v3
16 |
17 | # Step 2: Set up Python
18 | - name: Set up Python
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: '3.10' # Adjust to your desired Python version
22 |
23 | # Step 3: Install dependencies
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install -r requirements.txt
28 |
29 | # Step 4: Build the HTML documentation
30 | - name: Build Sphinx documentation
31 | run: |
32 | make html
33 |
34 | # Step 5: Deploy to GitHub Pages (docs branch)
35 | - name: Deploy to GitHub Pages
36 | uses: peaceiris/actions-gh-pages@v4
37 | with:
38 | github_token: ${{ secrets.GITHUB_TOKEN }}
39 | publish_dir: build/html
40 | publish_branch: docs # The branch where GitHub Pages will serve from
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | build
35 | imec_tut_2025
36 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/sst/sst-elements"]
2 | path = external/sst/sst-elements
3 | url = https://github.com/sstsimulator/sst-elements.git
4 | branch = v14.1.0_Final
5 | [submodule "external/sst/sst-core"]
6 | path = external/sst/sst-core
7 | url = https://github.com/sstsimulator/sst-core.git
8 | branch = v14.1.0_Final
9 | [submodule "external/riscv-gnu-toolchain"]
10 | path = external/riscv-gnu-toolchain
11 | url = https://github.com/riscv-collab/riscv-gnu-toolchain.git
12 | branch = 2024.11.22
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 imec, Belgium
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
7 | # Minimal makefile for Sphinx documentation
8 | #
9 |
10 | # You can set these variables from the command line, and also
11 | # from the environment for the first two.
12 | SPHINXOPTS ?=
13 | SPHINXBUILD ?= sphinx-build
14 | SOURCEDIR = docs
15 | BUILDDIR = build
16 |
17 | # Put it first so that "make" without argument is like "make help".
18 | help:
19 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
21 | .PHONY: help Makefile
22 |
23 | # Catch-all target: route all unknown targets to Sphinx using the new
24 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
25 | %: Makefile
26 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A tutorial on Scalable System Simulations
2 | *Looking at RISC-V architectures and performance analysis for machine learning workloads*
3 |
4 | Dive into the world of system-level simulations!
5 | - Explore RISC-V modelling and workload representation using gem5+MLIR.
6 | - Learn to scale your system simulations effortlessly with the power of SST.
7 |
8 | This tutorial bridges cutting-edge open-source tools and techniques to empower your hardware-software co-design journey. This tutorial has been presented
9 | at [International Conference on VLSI Design 2025](https://vlsid.org/).
10 |
11 | If you use this repository, please cite it as follows:
12 |
13 | ```bibtex
14 | @misc{sim-learning-tutorial,
15 | author = {Erwan Lenormand and Tommaso Marinelli and Debjyoti Bhattacharjee},
16 | title = {A tutorial on Scalable System Simulations},
17 | year = {2025},
18 | version = {v1.0.0},
19 | howpublished = {Presented at the International Conference on VLSI Design 2025},
20 | note = {\url{https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/} Accessed: 2025-01-02}
21 | }
22 | ```
23 |
24 | ### Getting Started
25 | Follow the documentation [online](https://csa-infra.github.io/RISCV-Scalable-Simulation-tutorial/index.html) or build the documentation yourself.
26 | Minimum required version of Python is 3.10. The tutorial has been tested on Linux-based systems.
27 |
28 | ```
29 | python3 -m venv imec_tut_2025
30 | source imec_tut_2025/bin/activate
31 | pip3 install -r requirements.txt
32 | make html
33 | ```
34 | ### Need help?
35 | If you need help or clarification regarding any part of the tutorial, file an [issue](https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/issues/new) in the repository.
36 |
--------------------------------------------------------------------------------
/demo/gem5/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | !README.md
4 | !convert_onnx_model.py
5 | !docker/
6 | !docker/*
7 | !images
8 | !iree/
9 | !iree/Makefile
10 | !iree/toolchain.generic.cmake
11 | !vlsid-riscv-fs.py
12 |
--------------------------------------------------------------------------------
/demo/gem5/README.rst:
--------------------------------------------------------------------------------
1 | Application-oriented system modeling and optimization
2 | *****************************************************
3 |
4 | *i.e. how to lower an AI/ML model to simulated RISC-V hardware for system-level
5 | exploration*
6 |
7 | The goal of this tutorial is to introduce the attendees to architectural
8 | simulation targeting machine learning workloads. The main tool we will be
9 | using to model a sample RISC-V system and run applications on top is
10 | \ `gem5 <https://www.gem5.org>`__\ . The ML benchmarks are derived from
11 | ONNX files, translated into machine-optimized code and executed through a
12 | lightweight runtime. This process is carried out with the help of the
13 | \ `IREE <https://iree.dev>`__\ workflow.
14 |
15 | Prerequisites
16 | -------------
17 |
18 | - A Linux-based x86-64 system (native or WSL2/VM)
19 | - Docker or Podman
20 |
21 | Containerized environment
22 | ~~~~~~~~~~~~~~~~~~~~~~~~~
23 |
24 | .. note::
25 | The container is executed in privileged mode to
26 | allow mounting the disk image as a loop device. If you don’t like this,
27 | remove the corresponding option from ``docker-compose.yaml``.
28 |
29 | Dealing with all the software dependencies that this setup needs can be
30 | complicated. For this reason, a container file has been provided, which
31 | allows to generate a virtual environment with all the dependencies
32 | installed. Assuming that Docker is present in your system, you can prepare
33 | the environment this way:
34 |
35 | ::
36 |
37 | git clone https://github.com/CSA-infra/RISCV-Scalable-Simulation-tutorial.git vlsid-csa-tutorial
38 | cd vlsid-csa-tutorial/demo/gem5/docker
39 | docker compose up -d
40 |
41 | If it doesn’t work, try ``docker-compose`` instead.
42 |
43 | To enter the container:
44 |
45 | ::
46 |
47 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash
48 |
49 | If you stop the container (e.g. reboot), you can easily return back to
50 | it with:
51 |
52 | ::
53 |
54 | docker start docker_vlsid-iree-gem5_1
55 | docker exec -it docker_vlsid-iree-gem5_1 /bin/bash
56 |
57 | Finally, if you want to destroy the container, you can do it with:
58 |
59 | ::
60 |
61 | cd vlsid-csa-tutorial/demo/gem5/docker
62 | docker compose down
63 |
64 | The working directory inside the container is ``/opt/vlsid-iree-gem5``.
65 | We will assume that every command is executed from that folder.
66 |
67 | Environment Setup
68 | -----------------
69 |
70 | Part 1: Prepare benchmark
71 | ~~~~~~~~~~~~~~~~~~~~~~~~~
72 |
73 | The IREE workflow is used to first convert a ML model to a supported
74 | intermediate representation, then compile and optimize the model for a
75 | target architecture. The output of the process is a Virtual Machine
76 | FlatBuffer (VMFB) file that can be run by the IREE runtime.
77 |
78 | A simple MNIST image classification model will be used as example, but
79 | the process is generalizable to other models too. The file format for the
80 | model is ONNX. Note that IREE also supports other formats (e.g. TF/TFLite);
81 | it is possible to convert them to MLIR using the right importers.
82 |
83 | .. figure:: images/gem5/mnist-8.svg
84 | :align: center
85 |
86 | Visual representation of the MNIST model
87 |
88 | - Download ONNX model
89 |
90 | ::
91 |
92 | wget https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mnist/model/mnist-8.onnx -O mnist-8-orig.onnx
93 |
94 | - `Upgrade ONNX
95 | opset <https://iree.dev/guides/ml-frameworks/onnx/#troubleshooting>`__
96 |
97 | ::
98 |
99 | ./convert_onnx_model.py mnist-8-orig.onnx mnist-8.onnx
100 |
101 | - Use IREE to convert ONNX file to MLIR Torch ONNX dialect
102 |
103 | ::
104 |
105 | iree-import-onnx mnist-8.onnx > mnist-8.mlir
106 |
107 | - Compile MLIR model to VMFB
108 |
109 | ::
110 |
111 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c mnist-8.mlir -o mnist-8.vmfb
112 |
113 | Part 2: Compile IREE run module
114 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
115 |
116 | The IREE run module allows the execution of a compiled module using the
117 | IREE runtime. This module has to be added to the final disk image
118 | together with the benchmarks, since we don’t want to pull the entire
119 | IREE distribution.
120 |
121 | Even if pre-built binaries are available, as of now they are not
122 | compiled for any RISC-V architecture. Thus, we will have to compile this
123 | module from source. A Makefile has been provided to simplify the
124 | process.
125 |
126 | ::
127 |
128 | make -C iree
129 |
130 | Part 3: Compile m5 utility
131 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
132 |
133 | The m5 utility is used to send pseudo-instructions to the simulator.
134 | This allows a number of operations, like checkpointing, resetting
135 | statistics, etc. We want to include this utility in our final image.
136 | Note that you will need the cross-compiler employed in the previous step to
137 | generate the binary.
138 |
139 | - Get the gem5 simulator
140 |
141 | ::
142 |
143 | git clone https://github.com/gem5/gem5.git -b v24.1
144 |
145 | - Compile the m5 utility
146 |
147 | ::
148 |
149 | export PATH=$PATH:$(realpath toolchain-riscv64/bin)
150 | scons riscv.CROSS_COMPILE=riscv64-buildroot-linux-musl- -C gem5/util/m5 build/riscv/out/m5
151 |
152 | Part 4: Prepare RISC-V disk image
153 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
154 |
155 | .. warning::
156 | If using Podman or rootless Docker, these steps must be done
157 | outside the container, as they typically require sudo permissions.
158 | Pay attention when executing each command!
159 |
160 | The last part of the setup consists in packing the benchmarks and IREE
161 | runtime into a disk image. For this task, we will use a pre-built
162 | minimal image from the gem5 community and modify it.
163 |
164 | - Get and extract `base
165 | image <https://storage.googleapis.com/dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz>`__
166 |
167 | ::
168 |
169 | wget https://storage.googleapis.com/dist.gem5.org/dist/develop/images/riscv/busybox/riscv-disk.img.gz
170 | gzip -d riscv-disk.img.gz
171 | cp riscv-disk.img vlsid-disk.img
172 |
173 | - Mount image (execute with sudo if outside the container)
174 |
175 | ::
176 |
177 | mkdir /tmp/rootfs
178 | mount vlsid-disk.img /tmp/rootfs
179 |
180 | - Copy benchmark (execute with sudo if outside the container)
181 |
182 | ::
183 |
184 | cp mnist-8.vmfb /tmp/rootfs/root/
185 |
186 | - Copy IREE run module (execute with sudo if outside the container)
187 |
188 | ::
189 |
190 | cp iree/iree-build-riscv64/install/bin/iree-run-module /tmp/rootfs/bin/
191 |
192 | - Copy m5 utility (execute with sudo if outside the container)
193 |
194 | ::
195 |
196 | cp gem5/util/m5/build/riscv/out/m5 /tmp/rootfs/sbin/
197 |
198 | - Unmount image (execute with sudo if outside the container)
199 |
200 | ::
201 |
202 | umount /tmp/rootfs
203 |
204 | Machine Learning Workload Execution
205 | -----------------------------------
206 |
207 | At this point, we are ready to run the experiment. A gem5 configuration
208 | file is present in this directory, which is derived from the
209 | ``riscv-fs.py`` sample script of gem5. The main difference is that
210 | instead of using the default disk image it will pick the one that we
211 | have just generated.
212 |
213 | The script defines a simple RISC-V system comprising a processor, a two-level
214 | cache hierarchy, a main memory and a generic board with some basic devices
215 | (UART controller, RNG, disk interface, etc.). An auto-generated diagram of the
216 | simulated system is presented below. You may need to zoom in to find out about
217 | all the individual components and connections.
218 |
219 | .. figure:: images/gem5/gem5-system.svg
220 | :align: center
221 |
222 | Composition of the simulated system
223 |
224 | - Compile gem5
225 |
226 | .. note::
227 | This step will take a while.
228 |
229 | ::
230 |
231 | scons build/RISCV/gem5.opt -C gem5 -j$(nproc)
232 |
233 | - Compile m5term
234 |
235 | ::
236 |
237 | make -C gem5/util/term
238 |
239 | - Run the script
240 |
241 | .. note::
242 | This step will take a while. We will speed up following
243 | executions through checkpointing.
244 |
245 | ::
246 |
247 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py
248 |
249 | While the simulation is running, its output is not immediately visible,
250 | as it is redirected to a separate console. To view it, open another
251 | terminal and use the m5term utility.
252 |
253 | ::
254 |
255 | ./gem5/util/term/m5term 3456
256 |
257 | The boot process is going to take several minutes. After that, you will
258 | see a login shell. Enter user “root” and password “root” to proceed.
259 | After login, you can launch your IREE benchmark. This is the command to
260 | execute for MNIST:
261 |
262 | ::
263 |
264 | iree-run-module --module=/root/mnist-8.vmfb --device=local-task --input="1x1x28x28xf32=0"
265 |
266 | For simplicity we are assuming an input tensor filled with zeros. You
267 | should see this output after some time:
268 |
269 | ::
270 |
271 | EXEC @CNTKGraph
272 | result[0]: hal.buffer_view
273 | 1x10xf32=[-0.044856 0.00779166 0.0681008 0.0299937 -0.12641 0.140219 -0.0552849 -0.0493838 0.0843221 -0.0545404]
274 |
275 | Congratulations! You are ready to go!
276 |
277 | Extra: Checkpoints
278 | ------------------
279 |
280 | You will have noticed that booting the Linux kernel and reaching the
281 | login shell takes several minutes, even with a minimal image like the
282 | one we are using. We want to avoid waiting so long for each one of the
283 | experiments. One of the commonly used techniques to deal with these
284 | situations is checkpointing: we can “take a picture” of the system at a
285 | certain moment of time and start other simulations from that point.
286 | Technically speaking, this requires saving the main memory content and
287 | the processors context. Cache content is not saved, but since we will
288 | execute our benchmarks from scratch this is not a big deal.
289 |
290 | In order to dump a checkpoint, after entering the shell in the simulated
291 | environment type this command:
292 |
293 | ::
294 |
295 | m5 checkpoint
296 |
297 | After terminating the simulation, you will see that in the output folder
298 | (e.g. ``m5out``) a folder named ``cpt.<tick>`` has appeared. This
299 | contains the checkpoint we have just dumped. We strongly suggest to move
300 | this folder outside the ``m5out`` directory.
301 |
302 | ::
303 |
304 | mv m5out/cpt.<tick> checkpoint
305 |
306 | From now on, it will be possible to execute a simulation starting from
307 | this checkpoint. It is sufficient to add an argument to the gem5
308 | command, specifying the position of the folder containing the checkpoint
309 | files:
310 |
311 | ::
312 |
313 | ./build/RISCV/gem5.opt vlsid-riscv-fs.py --restore-from checkpoint
314 |
315 | This way, you will be immediately dropped to the shell. Huge
316 | improvement!
317 |
318 | Experimental Studies
319 | --------------------
320 |
321 | Now that you are able to run complete simulations, it is time to explore
322 | a few knobs and analyze their impact on the system performance.
323 |
324 | Part 1: Change CPU model
325 | ~~~~~~~~~~~~~~~~~~~~~~~~
326 |
327 | The gem5 simulator supports different `CPU
328 | models <https://www.gem5.org/documentation/general_docs/cpu_models/>`__.
329 | By default, the script runs with an *atomic* CPU, which implies atomic
330 | accesses to the memory system with fixed latencies. This model is fast
331 | and simple, but inaccurate.
332 |
333 | The first task is to replace the CPU type with a more detailed one.
334 | There are three possible choices:
335 |
336 | - **TimingSimpleCPU:** simple timing CPU, 1-stage pipeline
337 | - **MinorCPU:** in-order CPU, 4-stages pipeline
338 | - **O3CPU:** out-of-order CPU, 7-stages pipeline
339 |
340 | These CPU models are highly configurable, but for this experiment it is
341 | fine to stick with the default parameters set.
342 |
343 | To implement such change, open the ``vlsid-riscv-fs.py`` script and
344 | change ``CPUTypes.ATOMIC`` (line 78) to ``CPUTypes.TIMING``,
345 | ``CPUTypes.MINOR`` and ``CPUTypes.O3``. After each execution, have a
346 | look at the ``stats.txt`` file in the output folder (default:
347 | ``m5out``). In particular, look at how these statistics change:
348 |
349 | ::
350 |
351 | simSeconds -> Simulated system execution time
352 | hostSeconds -> Host system simulation time
353 | board.processor.cores.core.ipc -> IPC of simulated CPU
354 | board.memory.mem_ctrl.dram.bwTotal::total -> DRAM memory bandwidth
355 |
356 | **Tip 1:** Wrap your benchmark execution around the commands “m5
357 | resetstats” and “m5 exit”, to make sure that the statistics only reflect
358 | the benchmark execution and not the system boot or idle time. E.g.:
359 |
360 | ::
361 |
362 | m5 resetstats && iree-run-module [...] && m5 exit
363 |
364 | **Tip 2:** You can specify different output folders for each experiment.
365 | E.g.:
366 |
367 | ::
368 |
369 | gem5.opt -d ./experiment1 vlsid-riscv-fs.py
370 |
371 | Part 2: Change cache hierarchy
372 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
373 |
374 | The cache configuration can have a significant impact on the system
375 | performance, depending on the data locality and access patterns of the
376 | executed applications. This is one of the knobs we can easily change in
377 | the ``vlsid-riscv-fs.py`` configuration file (line 70).
378 |
379 | The second task consists in performing the experiments after applying
380 | the following modifications (one by one):
381 |
382 | - Decrease L1I (instruction cache) and L1D (data cache) size from 32 kB
383 | to 8 kB
384 | - Increase L2 (last-level cache) size from 512 kB to 2 MB
385 |
386 | Use MinorCPU or O3CPU. Compare the output statistic with the baseline
387 | configuration, to check if there is a change in performance and how
388 | appreciable that is. You can also have a look at cache-specific metrics,
389 | e.g. the miss rates:
390 |
391 | ::
392 |
393 | board.cache_hierarchy.l1d-cache-0.overallMissRate::total
394 | board.cache_hierarchy.l1i-cache-0.overallMissRate::total
395 | board.cache_hierarchy.l2-cache-0.overallMissRate::total
396 |
397 | Part 3: Vectorization
398 | ~~~~~~~~~~~~~~~~~~~~~
399 |
400 | The RISC-V architecture we are simulating supports the RVV vector
401 | extension v1.0. This means that the IREE compiler can optimize the
402 | application by enabling SIMD support. The default VLEN for the simulated
403 | hardware is of 256 bits.
404 |
405 | For this step, we will need to recompile the benchmark and add it to the
406 | disk image. The following command will create an RVV-enabled benchmark:
407 |
408 | ::
409 |
410 | iree-compile --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-triple=riscv64 --iree-llvmcpu-target-cpu-features=+m,+a,+f,+d,+c,+v,+zvl256b -riscv-v-vector-bits-min=256 -riscv-v-fixed-length-vector-lmul-max=8 mnist-8.mlir -o mnist-8-v.vmfb
412 |
413 | Execute this new version of the benchmark and compare the output with
414 | the non-vectorized version. You should notice an improvement of the
415 | performance.
416 |
417 | **Note:** Like other microarchitectural parameters, the latencies of the
418 | vector units are not calibrated on any specific design, and default
419 | values are used. Do not expect fully realistic numbers.
420 |
421 | Part 4: New benchmarks
422 | ~~~~~~~~~~~~~~~~~~~~~~
423 |
424 | .. warning::
425 | The execution time can be much higher for more complex
426 | benchmarks, even in atomic mode. We suggest you to try out these
427 | tests after the tutorial, keeping the simulations as background tasks
428 | until they complete.
429 |
430 | Now that you know how to run the full workflow, you can try out new
431 | benchmarks. Bear in mind that not all the models are supported with the
432 | current version of IREE, and compatibility issues may arise when
433 | compiling. We will provide you with a few examples that are guaranteed
434 | to succeed.
435 |
436 | ::
437 |
438 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/classification/mobilenet/model/mobilenetv2-10.onnx
439 | https://github.com/onnx/models/raw/refs/heads/main/validated/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx
440 |
441 | The launch commands for these models are:
442 |
443 | ::
444 |
445 | iree-run-module --module=/root/mobilenetv2-10.vmfb --device=local-task --input="1x1x672x672xf32=0"
446 | iree-run-module --module=/root/super-resolution-10.vmfb --device=local-task --input="1x1x224x224xf32=0"
447 |
448 | **Tip:** If you want to store multiple models in your image, or models
449 | that exceed the image capacity, you may run out of space. You can resize
450 | the image to a bigger size (e.g. 150 MB) with the following commands:
451 |
452 | ::
453 |
454 | e2fsck -f vlsid-disk.img
455 | resize2fs vlsid-disk.img 150M
456 |
--------------------------------------------------------------------------------
/demo/gem5/convert_onnx_model.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python3
2 | # Script derived and adapted from this source:
3 | # https://iree.dev/guides/ml-frameworks/onnx/#troubleshooting
4 |
5 | import argparse
6 | import onnx
7 |
8 | parser = argparse.ArgumentParser("ONNX Version Converter")
9 | parser.add_argument("input", type=str, help="Input ONNX file")
10 | parser.add_argument("output", type=str, help="Output ONNX file")
11 | args = parser.parse_args()
12 |
13 | original_model = onnx.load_model(args.input)
14 | converted_model = onnx.version_converter.convert_version(original_model, 17)
15 | onnx.save(converted_model, args.output)
16 |
--------------------------------------------------------------------------------
/demo/gem5/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 The Regents of the University of California
2 | # Copyright (c) 2024 imec v.z.w.
3 | # All Rights Reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met: redistributions of source code must retain the above copyright
8 | # notice, this list of conditions and the following disclaimer;
9 | # redistributions in binary form must reproduce the above copyright
10 | # notice, this list of conditions and the following disclaimer in the
11 | # documentation and/or other materials provided with the distribution;
12 | # neither the name of the copyright holders nor the names of its
13 | # contributors may be used to endorse or promote products derived from
14 | # this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | FROM --platform=${BUILDPLATFORM} ubuntu:24.04
29 |
30 | ENV DEBIAN_FRONTEND=noninteractive
31 | RUN apt -y update && apt -y upgrade && apt -y install \
32 | build-essential \
33 | scons \
34 | python3-dev \
35 | git \
36 | pre-commit \
37 | zlib1g \
38 | zlib1g-dev \
39 | libprotobuf-dev \
40 | protobuf-compiler \
41 | libprotoc-dev \
42 | libgoogle-perftools-dev \
43 | libboost-all-dev \
44 | libhdf5-serial-dev \
45 | python3-pip \
46 | python3-pydot \
47 | python3-venv \
48 | python3-tk \
49 | mypy \
50 | m4 \
51 | libcapstone-dev \
52 | libpng-dev \
53 | libelf-dev \
54 | pkg-config \
55 | wget \
56 | cmake \
57 | doxygen
58 |
59 | RUN python3 -m pip install --break-system-packages \
60 | tensorflow \
61 | iree-base-runtime==3.4.0 \
62 | iree-base-compiler==3.4.0 \
63 | matplotlib \
64 | onnx \
65 | pandas-stubs
66 |
67 | # pre-commit, as installed via apt in 24.04, attempts to create a cache
68 | # directory at "${HOME}/.cache/pre-commit". If running docker with non-root,
69 | # the HOME directory is set to "/". Since non-root users do not have permission
70 | # to write to this directory, an error is returned when pre-commit is executed.
71 | # pre-commit's default cache directory can be changed via the `XDG_CACHE_HOME`
72 | # enivoronment variable. Here we set it to "/tmp". With this pre-commit will
73 | # create a "/tmp/pre-commit" directory to use for caching. "/tmp" was chosen
74 | # as it's a directory any user can access and write to. Given this only stores
75 | # caching information, the "/tmp" directory being wiped is not a concern.
76 | ENV XDG_CACHE_HOME=/tmp/
77 |
--------------------------------------------------------------------------------
/demo/gem5/docker/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 imec v.z.w.
2 | # All Rights Reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are
6 | # met: redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer;
8 | # redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution;
11 | # neither the name of the copyright holders nor the names of its
12 | # contributors may be used to endorse or promote products derived from
13 | # this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | services:
28 | vlsid-iree-gem5:
29 | privileged: true
30 | stdin_open: true
31 | tty: true
32 | build:
33 | context: .
34 | dockerfile: Dockerfile
35 | volumes:
36 | - ..:/opt/vlsid-iree-gem5
37 |
--------------------------------------------------------------------------------
/demo/gem5/iree/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 imec v.z.w.
2 | # All Rights Reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are
6 | # met: redistributions of source code must retain the above copyright
7 | # notice, this list of conditions and the following disclaimer;
8 | # redistributions in binary form must reproduce the above copyright
9 | # notice, this list of conditions and the following disclaimer in the
10 | # documentation and/or other materials provided with the distribution;
11 | # neither the name of the copyright holders nor the names of its
12 | # contributors may be used to endorse or promote products derived from
13 | # this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
27 | IREE_GIT_TAG=v3.4.0
28 | TOOLCHAIN_ROOT=..
29 | TOOLCHAIN_URL=https://toolchains.bootlin.com/downloads/releases/toolchains/riscv64-lp64d/tarballs/riscv64-lp64d--musl--stable-2024.05-1.tar.xz
30 |
31 | IREE_BUILD_OPTS=\
32 | -DIREE_BUILD_COMPILER=OFF \
33 | -DIREE_BUILD_TESTS=OFF \
34 | -DIREE_BUILD_SAMPLES=OFF \
35 | -DIREE_BUILD_DOCS=OFF \
36 | -DIREE_HAL_DRIVER_VULKAN=OFF
37 |
38 | toolchan_root_abspath=$(shell realpath $(TOOLCHAIN_ROOT))
39 | toolchain_path=$(toolchan_root_abspath)/toolchain-riscv64
40 | sysroot_path=$(toolchain_path)/riscv64-buildroot-linux-musl/sysroot
41 |
42 | default: iree-build-riscv64
43 |
44 | iree-dist:
45 | git clone --depth 1 --branch $(IREE_GIT_TAG) https://github.com/iree-org/iree.git iree-dist
46 | cd iree-dist && git submodule update --init \
47 | third_party/benchmark \
48 | third_party/cpuinfo \
49 | third_party/flatcc \
50 | third_party/googletest \
51 | third_party/hip-build-deps \
52 | third_party/musl \
53 | third_party/spirv_cross \
54 | third_party/tracy \
55 | third_party/vulkan_headers \
56 | third_party/webgpu-headers \
57 | third_party/hsa-runtime-headers
58 |
59 | iree-build-native: iree-dist
60 | cd $< && cmake -G "Unix Makefiles" \
61 | $(IREE_BUILD_OPTS) \
62 | -DCMAKE_INSTALL_PREFIX=../$@/install \
63 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \
64 | . -B ../$@
65 | cd $< && cmake --build ../$@ --target install
66 |
67 | orig_dir=$(patsubst %.tar.gz,%,$(patsubst %.tar.bz2,%,$(patsubst %.tar.xz,%,$(lastword $(subst /, ,$(TOOLCHAIN_URL))))))
68 |
69 | .PHONY: toolchain-riscv64
70 | toolchain-riscv64: $(toolchain_path)
71 | $(toolchain_path):
72 | wget $(TOOLCHAIN_URL) -O $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2
73 | tar -xvf $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2 -C $(TOOLCHAIN_ROOT)
74 | mv $(TOOLCHAIN_ROOT)/$(orig_dir) $(toolchain_path)
75 | $(toolchain_path)/relocate-sdk.sh
76 | rm $(TOOLCHAIN_ROOT)/toolchain-riscv64.tar.bz2
77 |
78 | iree-build-riscv64: iree-dist iree-build-native $(toolchain_path)
79 | cd $< && cmake -G "Unix Makefiles" \
80 | $(IREE_BUILD_OPTS) \
81 | -DIREE_ENABLE_CPUINFO=OFF \
82 | -DCMAKE_INSTALL_PREFIX=../$@/install \
83 | -DIREE_HOST_BIN_DIR=../iree-build-native/install/bin \
84 | -DCMAKE_TOOLCHAIN_FILE="../toolchain.generic.cmake" \
85 | -DTOOLCHAIN_TARGET=riscv64 \
86 | -DTOOLCHAIN_PATH=$(toolchain_path) \
87 | -DTOOLCHAIN_PREFIX=riscv64-buildroot-linux-musl- \
88 | . -B ../$@
89 | cd $< && cmake --build ../$@ --target install
90 |
91 | .PHONY: clean
92 | clean:
93 | rm -rf iree-dist iree-build-*
94 |
95 | .PHONY: distclean
96 | distclean: clean
97 | rm -rf $(toolchain_path)
98 |
--------------------------------------------------------------------------------
/demo/gem5/iree/toolchain.generic.cmake:
--------------------------------------------------------------------------------
1 | # Script derived and adapted from this source:
2 | # https://kubasejdak.com/how-to-cross-compile-for-embedded-with-cmake-like-a-champ
3 |
4 | set(CMAKE_SYSTEM_NAME Generic)
5 | set(CMAKE_SYSTEM_PROCESSOR ${TOOLCHAIN_TARGET})
6 |
7 | # Without that flag CMake is not able to pass test compilation check
8 | set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
9 |
10 | set(CMAKE_AR ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ar${CMAKE_EXECUTABLE_SUFFIX})
11 | set(CMAKE_ASM_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX})
12 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}gcc${CMAKE_EXECUTABLE_SUFFIX})
13 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}g++${CMAKE_EXECUTABLE_SUFFIX})
14 | set(CMAKE_LINKER ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ld${CMAKE_EXECUTABLE_SUFFIX})
15 | set(CMAKE_OBJCOPY ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}objcopy${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
16 | set(CMAKE_RANLIB ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}ranlib${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
17 | set(CMAKE_SIZE ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}size${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
18 | set(CMAKE_STRIP ${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}strip${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
19 |
20 | set(CMAKE_C_FLAGS "-static -Wno-psabi -fdata-sections -ffunction-sections -Wl,--gc-sections" CACHE INTERNAL "")
21 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -fno-exceptions" CACHE INTERNAL "")
22 |
23 | set(CMAKE_C_FLAGS_DEBUG "-Os -g" CACHE INTERNAL "")
24 | set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG" CACHE INTERNAL "")
25 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" CACHE INTERNAL "")
26 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}" CACHE INTERNAL "")
27 |
28 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
29 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
30 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
31 |
--------------------------------------------------------------------------------
/demo/gem5/vlsid-riscv-fs.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 The Regents of the University of California
2 | # Copyright (c) 2024 imec v.z.w.
3 | # All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met: redistributions of source code must retain the above copyright
8 | # notice, this list of conditions and the following disclaimer;
9 | # redistributions in binary form must reproduce the above copyright
10 | # notice, this list of conditions and the following disclaimer in the
11 | # documentation and/or other materials provided with the distribution;
12 | # neither the name of the copyright holders nor the names of its
13 | # contributors may be used to endorse or promote products derived from
14 | # this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | """
29 | This example runs a simple linux boot. It uses the 'riscv-disk-img' resource.
30 | It is built with the sources in `src/riscv-fs` in [gem5 resources](
31 | https://github.com/gem5/gem5-resources).
32 |
33 | Characteristics
34 | ---------------
35 |
36 | * Runs exclusively on the RISC-V ISA with the classic caches
37 | * Assumes that the kernel is compiled into the bootloader
38 | * Automatically generates the DTB file
39 | * Will boot but requires a user to login using `m5term` (username: `root`,
40 | password: `root`)
41 | """
42 |
43 | import argparse
44 | from pathlib import Path
45 |
46 | from gem5.components.boards.riscv_board import RiscvBoard
47 | from gem5.components.cachehierarchies.classic.private_l1_private_l2_walk_cache_hierarchy import (
48 | PrivateL1PrivateL2WalkCacheHierarchy,
49 | )
50 | from gem5.components.memory import SingleChannelDDR3_1600
51 | from gem5.components.processors.cpu_types import CPUTypes
52 | from gem5.components.processors.simple_processor import SimpleProcessor
53 | from gem5.isas import ISA
54 | from gem5.resources.resource import DiskImageResource, obtain_resource
55 | from gem5.simulate.simulator import Simulator
56 | from gem5.utils.requires import requires
57 |
58 | # Run a check to ensure the right version of gem5 is being used.
59 | requires(isa_required=ISA.RISCV)
60 |
61 | # Instantiate argument parser
62 | parser = argparse.ArgumentParser()
63 | parser.add_argument("--restore-from", type=Path, help="Checkpoint directory")
64 | parser.add_argument(
65 | "--disk-image",
66 | type=Path,
67 | help="Disk image path",
68 | default="vlsid-disk.img"
69 | )
70 | args = parser.parse_args()
71 |
72 | # Setup the cache hierarchy.
73 | # For classic, PrivateL1PrivateL2 and NoCache have been tested.
74 | # For Ruby, MESI_Two_Level and MI_example have been tested.
75 | cache_hierarchy = PrivateL1PrivateL2WalkCacheHierarchy(
76 | l1d_size="32KiB", l1i_size="32KiB", l2_size="512KiB"
77 | )
78 |
79 | # Setup the system memory.
80 | memory = SingleChannelDDR3_1600()
81 |
82 | # Setup a single core Processor.
83 | processor = SimpleProcessor(
84 | cpu_type=CPUTypes.ATOMIC, isa=ISA.RISCV, num_cores=1
85 | )
86 |
87 | # Setup the board.
88 | board = RiscvBoard(
89 | clk_freq="1GHz",
90 | processor=processor,
91 | memory=memory,
92 | cache_hierarchy=cache_hierarchy,
93 | )
94 |
95 | # Set the Full System workload.
96 | board.set_kernel_disk_workload(
97 | kernel=obtain_resource("riscv-linux-6.6.33-kernel",
98 | resource_version="1.0.0"),
99 | bootloader=obtain_resource("riscv-bootloader-opensbi-1.3.1",
100 | resource_version="1.0.0"),
101 | disk_image=DiskImageResource(args.disk_image),
102 | checkpoint=args.restore_from,
103 | )
104 |
105 | simulator = Simulator(board=board)
106 | print("Beginning simulation!")
107 | # Note: This simulation will never stop. You can access the terminal upon boot
108 | # using m5term (`./util/term`): `./m5term localhost <port>`. Note the `<port>`
109 | # value is obtained from the gem5 terminal stdout. Look out for
110 | # "system.platform.terminal: Listening for connections on port <port>".
111 | simulator.run()
112 |
--------------------------------------------------------------------------------
/demo/sst/README.rst:
--------------------------------------------------------------------------------
1 | Scale-out system simulation with SST
2 | ************************************
3 |
4 | **How to perform a scale-out system simulation with instruction-level simulation and packet-level simulation?**
5 | The goal of the second part of this tutorial is to introduce the Structural Simulation
6 | Toolkit (SST) framework, which makes it possible to simulate a scale-out system.
7 |
8 | Instruction-level simulation
9 | ============================
10 |
11 | Environment Setup
12 | -----------------
13 |
14 | To run the SST experiments you need to install SST. Please refer to `Installation instructions`_.
15 |
16 |
17 | System under exploration
18 | ------------------------
19 | .. _cpu figure:
20 |
21 | .. figure:: images/sst/cpu.svg
22 | :width: 400
23 | :align: center
24 |
25 | Microarchitecture of a cpu core.
26 |
27 |
28 | The system under exploration is made up of multi-threaded RISC-V CPU cores. As illustrated
29 | in Figure :numref:`cpu figure`, a CPU core is attached to an L1 data cache and an L1
30 | instruction cache. The two caches are interconnected to a second level of cache (L2 cache)
31 | with a memory bus. The core itself is composed of one decoder for each thread, one branch
32 | unit and one dispatch unit, one register file for floating point numbers and another one
33 | for integers, a load store unit (or load store queue), multiple ALU and multiple FPU. The
34 | core is attached to each cache through a TLB and a memory interface. TLBs are managed by
35 | the operating system.
36 |
37 |
38 | .. _node figure:
39 |
40 | .. figure:: images/sst/node.svg
41 | :width: 600
42 | :align: center
43 |
44 | Microarchitecture of a compute node.
45 |
46 | As shown in Figure :numref:`node figure`, the RISC-V cores are integrated into a compute
47 | node. The number of cores per node is configurable from the script. The set of L2 caches
48 | are federated with a directory which maintains coherency in the node. The L2s and the
49 | directory are interconnected through a NoC. The directory is attached to a DRAM
50 | controller. In addition, a node integrates a component that emulates an operating system.
51 | The latter manages the virtual memory and is attached to every CPU core to provide the
52 | minimal service required to run applications.
53 |
54 | .. _system figure:
55 |
56 | .. figure:: images/sst/system.svg
57 | :width: 800
58 | :align: center
59 | :alt: Scale-out system microarchitecture
60 |
61 | Microarchitecture of a multi-node system.
62 |
63 | Multiple nodes can be interconnected with a network to build a scale-out system, as illustrated
64 | in Figure :numref:`system figure`. Each node has an independent operating system and a
65 | private memory space. To allow communication between nodes, we can use the
66 | Message Passing Interface (MPI). To do that, each node additionally integrates a NIC. The
67 | latter is interconnected to the NoC.
68 |
69 | The inter-node network is built with pymerlin (a python script provided in SST-elements).
70 | Thanks to that script we can define different topologies easily (e.g., single router, fat
71 | tree, dragonfly, torus, mesh, etc.).
72 |
73 |
74 | Every component or sub-component is configurable; for instance, you can configure the
75 | latency of the ALU or the capacity of each cache. You can find more information on the
76 | parameters and their impact on the simulated system using **sst-info** command.
77 |
78 | .. list-table:: How to find the available parameters
79 | :widths: 25 50
80 | :header-rows: 1
81 |
82 | * - Command
83 | - Description
84 | * - sst-info vanadis
85 | - Parameters of the cpu core and the operating system
86 | * - sst-info mmu
87 | - Parameters of the TLB and MMU
88 | * - sst-info memHierarchy
89 | - Parameters of the cache, directory controller, DRAM, memory bus
90 | * - sst-info merlin
91 | - Parameters of the NoC and internode network components
92 | * - sst-info rdmaNic
93 | - Parameters of the NIC
94 |
95 |
96 | Workload under evaluation
97 | -------------------------
98 |
99 |
100 | The workload under evaluation is inspired by a Multi-head attention, one of the
101 | calculation layers of transformers :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.
102 |
103 | .. _OMP_MHA figure:
104 |
105 | .. figure:: images/sst/mha.svg
106 | :width: 600
107 | :align: center
108 | :alt: Multi-head attention block
109 |
110 | Illustration of the workload run on a single-node system.
111 |
112 | As shown in Figure :numref:`OMP_MHA figure`, the application multiplies an *Embeddings*
113 | matrix of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements with 3 matrices of
114 | D\ :sub:`model` x D\ :sub:`model` weights, producing 3 matrices of Seq\ :sub:`len`\ x D\ :sub:`model` \ elements,
115 | called Keys, Queries and Values. In fact, the weight matrices are divided into *heads*.
116 | Each head of the Queries matrix is multiplied with the corresponding transposed head of the Keys
117 | matrix, producing *QK* matrix. The latter is then scaled. Then the *softmax* of each row of
118 | the scaled *QK* is computed. Afterward, the result of the *softmax* is multiplied with
119 | Values matrix, producing *QKV* matrix. Finally, *QKV* is summed with the *Embeddings*
120 | matrix.
121 |
122 |
123 | .. _mha_OMP: ../../demo/sst/software/mha_OMP.c
124 |
125 | The corresponding code is implemented in **C** `mha_OMP`_, and is parallelized with **OpenMP**.
126 |
127 |
128 | Matrix-Matrix multiplication is the heaviest workload in this application. To minimize the
129 | data movement, a tiled GEMM is implemented. *TILE_SIZE* macro defines the dimension of the
130 | tiles.
131 |
132 | .. code-block:: C
133 | :linenos:
134 | :emphasize-lines: 1
135 |
136 | const int bsize = TILE_SIZE;
137 | int ii0, ii1, ii2;
138 | int i0, i1, i2;
139 | int h;
140 | int start_head, stop_head;
141 | data_t pp;
142 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
143 | for (h=0; h < heads; h++) {
144 | for (ii0 = 0; ii0 1
145 |
146 |
147 | nodeRtrParams = {
148 | "xbar_bw" : "57.6GB/s",
149 | "link_bw" : "28.8GB/s",
150 | "input_buf_size" : "40KB",
151 | "output_buf_size" : "40KB",
152 | "flit_size" : "72B",
153 | "id" : "0",
154 | "topology" : "merlin.singlerouter"
155 | }
156 |
157 | memCtrlParams = {
158 | "clock" : "1.6GHz",
159 | "backend.mem_size" : physMemSize,
160 | "backing" : "malloc",
161 | "initBacking" : 1,
162 | "addr_range_start" : 0x0,
163 | "addr_range_end" : memsize - 1,
164 | "backendConvertor.request_width" : 16
165 | }
166 |
167 | # DRAM bandwidth = memCtrl.clock * request width * max_requests_per_cycle = 25.6 GB/s
168 | memCtrlParams = {
169 | "clock" : "1.6GHz",
170 | "backend.mem_size" : physMemSize,
171 | "backing" : "malloc",
172 | "initBacking" : 1,
173 | "addr_range_start" : 0x0,
174 | "addr_range_end" : memsize - 1,
175 | "backendConvertor.request_width" : 16
176 | }
177 |
178 | memBackendParams = {
179 | "mem_size" : physMemSize,
180 | "access_time" : "20ns",
181 | "max_requests_per_cycle" : 1,
182 | "request_width" : 16
183 | }
184 |
185 | memNICParams = {
186 | "min_packet_size" : "72B",
187 | "network_bw" : "28.8GB/s",
188 | "network_input_buffer_size" : "4KiB",
189 | "network_output_buffer_size" : "4KiB"
190 | }
191 |
192 | # OS related params
193 | osParams = {
194 | "dbgLevel" : os_verbosity,
195 | "dbgMask" : 16,
196 | "cores" : num_cpu_per_node,
197 | "hardwareThreadCount" : num_threads_per_cpu,
198 | "page_size" : page_size,
199 | "physMemSize" : physMemSize,
200 | "useMMU" : True,
201 | }
202 |
203 | osl1cacheParams = {
204 | "access_latency_cycles" : 1,
205 | "cache_frequency" : cpu_clock,
206 | "replacement_policy" : "lru",
207 | "coherence_protocol" : coherence_protocol,
208 | "associativity" : 8,
209 | "cache_line_size" : cache_line_size,
210 | "cache_size" : "32 KiB",
211 | "L1" : "1",
212 | }
213 |
214 | mmuParams = {
215 | "num_cores": num_cpu_per_node,
216 | "num_threads": num_threads_per_cpu,
217 | "page_size": page_size,
218 | "useNicTlb": True,
219 | }
220 |
221 |
222 | vanadis_cpu_type = "vanadis.VanadisCPU"
223 | cpuParams = {
224 | "dbg_mask" : 16,
225 | "verbose" : 0,
226 | "clock" : cpu_clock,
227 | "hardware_threads": num_threads_per_cpu,
228 | "physical_fp_registers" : 168 * num_threads_per_cpu,
229 | "physical_integer_registers" : 180 * num_threads_per_cpu,
230 | "integer_arith_units" : 2,
231 | "integer_arith_cycles" : 2,
232 | "integer_div_units" : 1,
233 | "integer_div_cycles" : 20,
234 | "fp_arith_cycles" : 3,
235 | "fp_arith_units" : 2,
236 | "fp_div_units" : 2,
237 | "fp_div_cycles" : 20,
238 | "branch_units" : 1,
239 | "branch_unit_cycles" : 2,
240 | "reorder_slots" : 128,
241 | "decodes_per_cycle" : 4,
242 | "issues_per_cycle" : 4,
243 | "retires_per_cycle" : 4,
244 | }
245 |
246 | branchPredParams = {
247 | "branch_entries" : 64
248 | }
249 |
250 | decoderParams = {
251 | "loader_mode" : 1,
252 | "uop_cache_entries" : 1536,
253 | "predecode_cache_entries" : 4
254 | }
255 |
256 | lsqParams = {
257 | "max_stores" : 16,
258 | "max_loads" : 32,
259 | }
260 |
261 |
262 |
263 | l1dcacheParams = {
264 | "access_latency_cycles" : 1,
265 | "cache_frequency" : cpu_clock,
266 | "replacement_policy" : "lru",
267 | "coherence_protocol" : coherence_protocol,
268 | "associativity" : 8,
269 | "cache_line_size" : cache_line_size,
270 | "cache_size" : "64 KiB",
271 | "prefetcher" : "cassini.NextBlockPrefetcher",
272 | "prefetcher.reach" : 2,
273 | "L1" : "1",
274 | }
275 |
276 | l1icacheParams = {
277 | "access_latency_cycles" : 1,
278 | "cache_frequency" : cpu_clock,
279 | "replacement_policy" : "lru",
280 | "coherence_protocol" : coherence_protocol,
281 | "associativity" : 8,
282 | "cache_line_size" : cache_line_size,
283 | "cache_size" : "32 KiB",
284 | "prefetcher" : "cassini.NextBlockPrefetcher",
285 | "prefetcher.reach" : 1,
286 | "L1" : "1",
287 | }
288 |
289 | l2cacheParams = {
290 | "access_latency_cycles" : 8,
291 | "max_requests_per_cycle" : 2,
292 | "cache_frequency" : cpu_clock,
293 | "replacement_policy" : "lru",
294 | "coherence_protocol" : coherence_protocol,
295 | "associativity" : 16,
296 | "cache_line_size" : cache_line_size,
297 | "cache_size" : str(l2cache_size) + 'B',
298 | "mshr_latency_cycles": 3,
299 | }
300 |
301 | busParams = {
302 | "bus_frequency" : cpu_clock,
303 | }
304 |
305 | dirCtrlParams = {
306 | "max_requests_per_cycle" : 2,
307 | "coherence_protocol" : coherence_protocol,
308 | "entry_cache_size" : l2cache_size*num_cpu_per_node/cache_line_size,
309 | "cache_line_size" : cache_line_size,
310 | "addr_range_start" : 0x0,
311 | "addr_range_end" : memsize - 1
312 | }
313 |
314 |
315 | rdmaNiCParams = {
316 | "clock" : cpu_clock,
317 | "useDmaCache": "true",
318 | "maxPendingCmds" : rdma_nic_num_posted_recv,
319 | "maxMemReqs" : rdma_nic_comp_q_size,
320 | "maxCmdQSize" : rdma_nic_num_posted_recv,
321 | "cache_line_size" : cache_line_size,
322 | 'baseAddr': memsize,
323 | 'cmdQSize' : 64,
324 | }
325 |
326 |
327 | rdmaCacheParams = {
328 | "access_latency_cycles" : 2,
329 | "max_requests_per_cycle" : 1,
330 | "mshr_num_entries": 64,
331 | "cache_frequency" : cpu_clock,
332 | "replacement_policy" : "lru",
333 | "coherence_protocol" : coherence_protocol,
334 | "associativity" : 8,
335 | "cache_line_size" : cache_line_size,
336 | "cache_size" : "32 KiB",
337 | "L1" : "1",
338 | }
339 |
340 |
341 | app_params = {}
342 | if app_args != "":
343 | app_args_list = app_args.split(" ")
344 | # We have a plus 1 because the executable name is arg0
345 | app_args_count = len( app_args_list ) + 1
346 |
347 | app_params["argc"] = app_args_count
348 |
349 | arg_start = 1
350 | for next_arg in app_args_list:
351 | app_params["arg" + str(arg_start)] = next_arg
352 | arg_start = arg_start + 1
353 | else:
354 | app_params["argc"] = 1
355 |
356 | class CPU_Builder:
357 | def __init__(self):
358 | pass
359 |
360 | def build( self, nodeId, cpuId ):
361 |
362 | prefix = 'node' + str(nodeId) + '.cpu' + str( cpuId )
363 | cpu = sst.Component(prefix, vanadis_cpu_type)
364 | cpu.addParams( cpuParams )
365 | cpu.addParam( "core_id", cpuId )
366 | cpu.addParam( "node_id", nodeId )
367 | if enableStats:
368 | cpu.enableAllStatistics()
369 |
370 | # CPU.decoder
371 | for n in range(num_threads_per_cpu):
372 | decode = cpu.setSubComponent( "decoder"+str(n), "vanadis.VanadisRISCV64Decoder" )
373 | decode.addParams( decoderParams )
374 |
375 | if enableStats:
376 | decode.enableAllStatistics()
377 |
378 | # CPU.decoder.osHandler
379 | os_hdlr = decode.setSubComponent( "os_handler", "vanadis.VanadisRISCV64OSHandler" )
380 |
381 | # CPU.decoder.branch_pred
382 | branch_pred = decode.setSubComponent( "branch_unit", "vanadis.VanadisBasicBranchUnit" )
383 | branch_pred.addParams( branchPredParams )
384 |
385 | if enableStats:
386 | branch_pred.enableAllStatistics()
387 |
388 |
389 | # CPU.lsq
390 | cpu_lsq = cpu.setSubComponent( "lsq", "vanadis.VanadisBasicLoadStoreQueue" )
391 | cpu_lsq.addParams(lsqParams)
392 | if enableStats:
393 | cpu_lsq.enableAllStatistics()
394 |
395 |
396 | icache_if = cpu.setSubComponent( "mem_interface_inst", "memHierarchy.standardInterface" )
397 | icache_if.addParam("coreId",cpuId)
398 |
399 | dcache_if = cpu_lsq.setSubComponent( "memory_interface", "memHierarchy.standardInterface" )
400 | dcache_if.addParam("coreId",cpuId)
401 |
402 | # L1 D-Cache
403 | l1cache = sst.Component(prefix + ".l1dcache", "memHierarchy.Cache")
404 | l1cache.addParams( l1dcacheParams )
405 | if enableStats:
406 | l1cache.enableAllStatistics()
407 |
408 | l1dcache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
409 | l1dcache_2_l2cache = l1cache.setSubComponent("memlink", "memHierarchy.MemLink")
410 |
411 | # L1 I-Cache
412 | l1icache = sst.Component(prefix + ".l1icache", "memHierarchy.Cache")
413 | l1icache.addParams(l1icacheParams)
414 | if enableStats:
415 | l1icache.enableAllStatistics()
416 |
417 | # Bus
418 | cache_bus = sst.Component(prefix + ".bus", "memHierarchy.Bus")
419 | cache_bus.addParams(busParams)
420 | if enableStats:
421 | cache_bus.enableAllStatistics()
422 |
423 | # L2 D-Cache
424 | l2cache = sst.Component(prefix + ".l2cache", "memHierarchy.Cache")
425 | l2cache.addParams(l2cacheParams)
426 | if enableStats:
427 | l2cache.enableAllStatistics()
428 |
429 | l2cache_2_cpu = l2cache.setSubComponent("cpulink", "memHierarchy.MemLink")
430 |
431 | # CPU D-TLB
432 | dtlbWrapper = sst.Component(prefix+".dtlb", "mmu.tlb_wrapper")
433 | dtlb = dtlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
434 | dtlb.addParam("num_hardware_threads", num_threads_per_cpu)
435 | dtlb.addParams(tlbParams)
436 |
437 | # CPU I-TLB
438 | itlbWrapper = sst.Component(prefix+".itlb", "mmu.tlb_wrapper")
439 | itlbWrapper.addParam("exe",True)
440 | itlb = itlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
441 | itlb.addParam("num_hardware_threads", num_threads_per_cpu)
442 | itlb.addParams(tlbParams)
443 |
444 | # CPU (data) -> D-TLB
445 | link = sst.Link(prefix+".link_cpu_dtlb")
446 | link.connect( (dcache_if, "port", "25ps"), (dtlbWrapper, "cpu_if", "25ps") )
447 |
448 | # CPU (instruction) -> I-TLB
449 | link = sst.Link(prefix+".link_cpu_itlb")
450 | link.connect( (icache_if, "port", "25ps"), (itlbWrapper, "cpu_if", "25ps") )
451 |
452 | l1icache_2_cpu = l1icache.setSubComponent("cpulink", "memHierarchy.MemLink")
453 | l1icache_2_l2cache = l1icache.setSubComponent("memlink", "memHierarchy.MemLink")
454 |
455 | # D-TLB -> D-L1
456 | link = sst.Link(prefix+".link_l1cache")
457 | link.connect( (dtlbWrapper, "cache_if", "25ps"), (l1dcache_2_cpu, "port", "25ps") )
458 |
459 | # I-TLB -> I-L1
460 | link = sst.Link(prefix+".link_l1icache")
461 | link.connect( (itlbWrapper, "cache_if", "25ps"), (l1icache_2_cpu, "port", "25ps") )
462 |
463 | # L1 D-Cache to bus
464 | link = sst.Link(prefix + ".link_l1dcache_l2cache")
465 | link.connect( (l1dcache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_0", "1ns") )
466 |
467 | # L1 I-Cache to bus
468 | link = sst.Link(prefix + ".link_l1icache_l2cache")
469 | link.connect( (l1icache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_1", "1ns") )
470 |
471 | # BUS to L2 cache
472 | link = sst.Link(prefix+".link_bus_l2cache")
473 | link.connect( (cache_bus, "low_network_0", "1ns"), (l2cache_2_cpu, "port", "1ns") )
474 |
475 | return cpu, l2cache, dtlb, itlb
476 |
477 |
478 | def addParamsPrefix(prefix,params):
479 | #print( prefix )
480 | ret = {}
481 | for key, value in params.items():
482 | #print( key, value )
483 | ret[ prefix + "." + key] = value
484 |
485 | #print( ret )
486 | return ret
487 |
488 |
489 |
490 | class OS_Builder:
491 | def __init__(self):
492 | pass
493 |
494 | def build( self, numNodes, nodeId):
495 |
496 | self.prefix = 'node' + str(nodeId)
497 |
498 | self.nodeOS = sst.Component(self.prefix + ".os", "vanadis.VanadisNodeOS")
499 | self.nodeOS.addParam("node_id", nodeId)
500 | self.nodeOS.addParams(osParams)
501 | if enableStats:
502 | self.nodeOS.enableAllStatistics()
503 |
504 | processList = (
505 | ( 1, {
506 | "env_count" : 6,
507 | "env0" : "OMP_NUM_THREADS={}".format(num_cpu_per_node*num_threads_per_cpu),
508 | "env1" : "PMI_SIZE={}".format(num_node),
509 | "env2" : "PMI_RANK={}".format(nodeId),
510 | "env3" : "RDMA_NIC_NUM_POSTED_RECV={}".format(rdma_nic_num_posted_recv),
511 | "env4" : "RDMA_NIC_COMP_Q_SIZE={}".format(rdma_nic_comp_q_size),
512 | "env5" : "TZ=UTC",
513 | "exe" : full_exe_name,
514 | "arg0" : exe_name,
515 | } ),
516 | )
517 |
518 | processList[0][1].update(app_params)
519 |
520 | num=0
521 | for i,process in processList:
522 | for y in range(i):
523 | self.nodeOS.addParams( addParamsPrefix( "process" + str(num), process ) )
524 | num+=1
525 |
526 | self.mmu = self.nodeOS.setSubComponent( "mmu", "mmu.simpleMMU" )
527 |
528 | self.mmu.addParams(mmuParams)
529 |
530 | mem_if = self.nodeOS.setSubComponent( "mem_interface", "memHierarchy.standardInterface" )
531 |
532 | l1cache = sst.Component(self.prefix + ".node_os.l1cache", "memHierarchy.Cache")
533 | l1cache.addParams(osl1cacheParams)
534 |
535 | l1cache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
536 |
537 | link = sst.Link(self.prefix + ".link_os_l1cache")
538 | link.connect( (mem_if, "port", "25ps"), (l1cache_2_cpu, "port", "25ps") )
539 |
540 | return l1cache
541 |
542 | def connectCPU( self, core, cpu ):
543 | link = sst.Link(self.prefix + ".link_core" + str(core) + "_os")
544 | link.connect( (cpu, "os_link", "5ns"), (self.nodeOS, "core" + str(core), "5ns") )
545 |
546 | def connectTlb( self, core, name, tlblink ):
547 | linkName = self.prefix + ".link_mmu_core" + str(core) + "_" + name
548 | link = sst.Link( linkName )
549 | link.connect( (self.mmu, "core"+str(core)+ "." +name, "25ps"), (tlblink, "mmu", "25ps") )
550 |
551 | def connectNicTlb( self, name, niclink ):
552 | linkName = self.prefix + ".link_mmu_" + name
553 | link = sst.Link( linkName )
554 | link.connect( (self.mmu, name, "25ps"), (niclink, "mmu", "25ps") )
555 |
556 |
557 |
558 |
class rdmaNic_Builder:
    """Builds the RDMA NIC (rdmaNic.nic) of one node, with its DMA cache and DMA TLB."""
    def __init__(self,numNodes):
        # numNodes: total node count; the NIC needs it to address its peers.
        self.numNodes = numNodes

    def build( self, nodeId ):
        """Create the NIC for node `nodeId`.

        Returns (mmioIf, dmaCache, tlb, (netLink, "rtr_port", '10ns')):
        the MMIO memory interface, the DMA-side cache, the DMA TLB, and the
        network-link tuple used by the caller to attach the NIC to the
        inter-node network.
        """
        prefix = 'node' + str(nodeId)
        nic = sst.Component( prefix + ".nic", "rdmaNic.nic")
        nic.addParams(rdmaNiCParams)
        nic.addParam( 'nicId', nodeId )
        nic.addParam( 'pesPerNode', 1 )
        nic.addParam( 'numNodes', self.numNodes )
        if enableStats :
            nic.enableAllStatistics()


        # NIC DMA interface
        dmaIf = nic.setSubComponent("dma", "memHierarchy.standardInterface")

        # NIC MMIO interface
        mmioIf = nic.setSubComponent("mmio", "memHierarchy.standardInterface")

        # NIC DMA Cache
        dmaCache = sst.Component(prefix + ".nicDmaCache", "memHierarchy.Cache")
        dmaCache.addParams(rdmaCacheParams)

        # NIC DMA TLB; sized for every hardware thread on the node.
        tlbWrapper = sst.Component(prefix+".nicDmaTlb", "mmu.tlb_wrapper")
        tlb = tlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        tlb.addParam("num_hardware_threads", num_cpu_per_node*num_threads_per_cpu)
        tlb.addParams(tlbParams)

        # Cache to CPU interface
        dmaCacheToCpu = dmaCache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # NIC DMA -> TLB
        link = sst.Link(prefix+".link_cpu_dtlb")
        link.connect( (dmaIf, "port", "25ps"), (tlbWrapper, "cpu_if", "25ps") )

        # NIC DMA TLB -> cache
        link = sst.Link(prefix+".link_cpu_l1dcache")
        link.connect( (tlbWrapper, "cache_if", "25ps"), (dmaCacheToCpu, "port", "25ps") )

        # NIC internode interface (merlin link control toward the network router)
        netLink = nic.setSubComponent( "rtrLink", "merlin.linkcontrol" )
        netLink.addParams(rdmaLinkParams)

        return mmioIf, dmaCache, tlb, (netLink, "rtr_port", '10ns')
607 |
class memory_Builder:
    """Builds the per-node memory subsystem: an on-chip merlin router with a
    directory controller and a DRAM controller (simpleMem backend) behind it.
    Additional clients (caches, NIC) attach to the router via connect()."""
    def __init__(self):
        pass

    def build( self, nodeId, numPorts, group ):
        """Instantiate router, directory and memory controller for node
        `nodeId`. One port beyond `numPorts` is reserved for the directory,
        which is connected in coherence group `group`."""
        self.prefix = 'node' + str(nodeId)
        self.numPorts = numPorts + 1

        self.chiprtr = sst.Component(self.prefix + ".chiprtr", "merlin.hr_router")
        self.chiprtr.addParam("num_ports", self.numPorts)
        self.chiprtr.addParams(nodeRtrParams)
        self.chiprtr.setSubComponent("topology","merlin.singlerouter")

        if enableStats:
            self.chiprtr.enableAllStatistics()

        # Directory controller occupies the last (extra) router port.
        dirctrl = sst.Component(self.prefix + ".dirctrl", "memHierarchy.DirectoryController")
        dirctrl.addParams(dirCtrlParams)
        dirtoMemLink = dirctrl.setSubComponent("memlink", "memHierarchy.MemLink")
        self.connect( "Dirctrl", self.numPorts - 1, dirctrl, group, linkType="cpulink" )
        if enableStats:
            dirctrl.enableAllStatistics()

        memctrl = sst.Component(self.prefix + ".memory", "memHierarchy.MemController")
        memctrl.addParams(memCtrlParams)
        if enableStats:
            memctrl.enableAllStatistics()

        memToDir = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink")

        memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem")
        memory.addParams(memBackendParams)

        # Directory -> memory controller link.
        link = sst.Link(self.prefix + ".link_dir_mem")
        link.connect( (dirtoMemLink, "port", "25ps"), (memToDir, "port", "25ps") )

    def connect( self, name, port, comp, group=None, linkType="memlink" ):
        """Attach `comp` to router port `port` through a MemNIC placed in
        coherence group `group`. `name` is informational only."""
        # Note: `assert group` also rejects group=0, not just the None default.
        assert group
        assert port < self.numPorts

        memNIC = comp.setSubComponent(linkType, "memHierarchy.MemNIC")
        memNIC.addParam("group", group)
        memNIC.addParams(memNICParams)

        link = sst.Link(self.prefix + ".link_rtr" + str(port) )
        link.connect( (self.chiprtr, "port" + str(port), "25ps"), (memNIC, "port", "25ps") )
656 |
657 |
class Endpoint():
    """Endpoint factory handed to the merlin topology builder: build() is
    called once per network endpoint and assembles a full node (OS, CPUs,
    memory subsystem, RDMA NIC), returning the NIC's network-link tuple."""
    def __init__(self,numNodes):
        self.numNodes = numNodes

    def prepParams(self):
        # Required by the merlin endpoint interface; nothing to prepare here.
        pass

    def build(self, nodeId, extraKeys ):
        """Build node `nodeId` and return the NIC's (netLink, "rtr_port",
        '10ns') tuple for the topology to wire into the inter-node network.
        `extraKeys` is unused here but part of the expected interface."""
        prefix = 'node' + str(nodeId);

        cpuBuilder = CPU_Builder()
        memBuilder = memory_Builder()
        osBuilder = OS_Builder()

        # Router ports: OS L1 + NIC MMIO + NIC DMA + one per CPU L2.
        numPorts = 3 + num_cpu_per_node
        port = 0
        memBuilder.build(nodeId, numPorts, group=2 )

        # build the Vanadis OS, it returns the OS L1 cache
        osCache = osBuilder.build( self.numNodes, nodeId)

        # connect OS L1 to Memory
        memBuilder.connect( "OS_L1", port, osCache, group=1 )
        port += 1;

        # build the Vanadis CPU block, this returns
        # cpu, L2 cache, DTLB ITLB
        for i in range(num_cpu_per_node):
            cpu, L2, dtlb, itlb = cpuBuilder.build(nodeId, i)

            osBuilder.connectCPU( i, cpu )
            osBuilder.connectTlb( i, "dtlb", dtlb )
            osBuilder.connectTlb( i, "itlb", itlb )

            # connect CPU L2 to Memory
            memBuilder.connect( "CPU_L2", port, L2, group=1 )
            port += 1;

        nicBuilder = rdmaNic_Builder(self.numNodes)
        # build the Rdma NIC, this returns
        # MMIO link, DMA cache, DMA TLB
        mmioIf, dmaCache, dmaTlb, netLink = nicBuilder.build(nodeId)

        osBuilder.connectNicTlb( "nicTlb", dmaTlb )

        # connect the NIC MMIO to Memory
        #memBuilder.connect( "NIC_MMIO", port, mmioIf, 3, source="1", dest="2" )
        memBuilder.connect( "NIC_MMIO", port, mmioIf, group=2 )
        port += 1;

        # connect the NIC DMA Cache to Memory
        #memBuilder.connect( "NIC_DMA", port, dmaCache, 1, dest="2" )
        memBuilder.connect( "NIC_DMA", port, dmaCache, group=1 )
        port += 1;
        return netLink
714 |
ep = Endpoint( num_node )

# Every network endpoint uses the same Endpoint builder instance.
def setNode( nodeId ):
    return ep;

# Forward the network parameters into merlin's global parameter table.
for p in networkParams:
    sst.merlin._params[p] = networkParams[p]

# Select the inter-node topology; anything unrecognized falls back to
# a single-router ("simple") network.
if network_topology == "torus":
    topo = topoTorus()
elif network_topology == "fattree":
    topo = topoFatTree()
else:
    topo = topoSimple()

topo.bundleEndpoints = False
topo.prepParams()
topo.setEndPointFunc( setNode )
topo.build()
734 |
--------------------------------------------------------------------------------
/demo/sst/instruction-level-simulation/scale_up.py:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
7 | import sys
8 | import argparse
9 | import sst
10 | import os
11 | from sst.merlin import *
12 |
13 |
# Command-line options; when run under sst, arguments after "--" reach this script.
parser = argparse.ArgumentParser(
    prog=f'sst {__file__} --',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)


parser.add_argument("--num_threads_per_cpu", type=int, help="Number of hardware threads per cpu", default=2)
parser.add_argument("--num_cpu_per_node", type=int, help="Number of cpu per node", default=2)
parser.add_argument("--exe", type=str, help="Binary to run", default="../software/riscv64/mha_OMP_16")
parser.add_argument("--app_args", type=str, help="Arguments of the application", default="64 128 8")
parser.add_argument("--stats", type=str, help="write statistics, argument changes the filename", nargs="?", const="-")
args = parser.parse_args()
25 |
26 |
# --stats enables statistic collection; an argument ending in ".csv" sends
# the statistics to that file, anything else goes to the console.
enableStats = bool(args.stats)
if enableStats:
    sst.setStatisticLoadLevel(10)
    fname = args.stats
    if fname.endswith(".csv"):
        sst.setStatisticOutput("sst.statOutputCSV", {"filepath": fname, "separator": ";"})
    else:
        sst.setStatisticOutput("sst.statOutputConsole")
41 |
# Pull parsed command-line options into module-level configuration.
num_threads_per_cpu = args.num_threads_per_cpu
num_cpu_per_node = args.num_cpu_per_node
app_args = args.app_args
full_exe_name = args.exe
os_verbosity = 0

cpu_clock = "3GHz"

coherence_protocol = "MESI"
cache_line_size = 64

l2cache_size = 1 * 1024**2  # 1MiB
page_size = 4096
memsize = 2 * 1024**3  # 2GiB
physMemSize = str(memsize) + " B"


# Basename of the executable; used as arg0 of the simulated process.
exe_name = full_exe_name.split("/")[-1]

# Geometry shared by every simpleTLB instance.
tlbParams = {
    "hitLatency": 3,
    "num_tlb_entries_per_thread": 64,
    "tlb_set_size": 4,
    "minVirtAddr": 0x1000,
    "maxVirtAddr": memsize,
}
68 |
# On-chip (intra-node) merlin router parameters.
nodeRtrParams = {
    "xbar_bw": "57.6GB/s",
    "link_bw": "28.8GB/s",
    "input_buf_size": "40KB",
    "output_buf_size": "40KB",
    "flit_size": "72B",
    "id": "0",
    "topology": "merlin.singlerouter",
}

# DRAM bandwidth = memCtrl.clock * request width * max_requests_per_cycle = 25.6 GB/s
memCtrlParams = {
    "clock": "1.6GHz",
    "backend.mem_size": physMemSize,
    "backing": "malloc",
    "initBacking": 1,
    "addr_range_start": 0x0,
    "addr_range_end": memsize - 1,
    "backendConvertor.request_width": 16,
}

memBackendParams = {
    "mem_size": physMemSize,
    "access_time": "20ns",
    "max_requests_per_cycle": 1,
    "request_width": 16,
}

memNICParams = {
    "min_packet_size": "72B",
    "network_bw": "28.8GB/s",
    "network_input_buffer_size": "4KiB",
    "network_output_buffer_size": "4KiB",
}

# OS related params
osParams = {
    "dbgLevel": os_verbosity,
    "cores": num_cpu_per_node,
    "hardwareThreadCount": num_threads_per_cpu,
    "page_size": page_size,
    "physMemSize": physMemSize,
    "useMMU": True,
}

# Private L1 used by the node OS component.
osl1cacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "32 KiB",
    "L1": "1",
}

mmuParams = {
    "num_cores": num_cpu_per_node,
    "num_threads": num_threads_per_cpu,
    "page_size": page_size,
    "useNicTlb": False,
}
131 |
132 |
# Vanadis out-of-order core configuration; register files scale with the
# number of hardware threads.
vanadis_cpu_type = "vanadis.VanadisCPU"
cpuParams = {
    "clock": cpu_clock,
    "hardware_threads": num_threads_per_cpu,
    "physical_fp_registers": 168 * num_threads_per_cpu,
    "physical_integer_registers": 180 * num_threads_per_cpu,
    "integer_arith_units": 2,
    "integer_arith_cycles": 2,
    "integer_div_units": 1,
    "integer_div_cycles": 20,
    "fp_arith_cycles": 3,
    "fp_arith_units": 2,
    "fp_div_units": 2,
    "fp_div_cycles": 20,
    "branch_units": 1,
    "branch_unit_cycles": 2,
    "reorder_slots": 128,
    "decodes_per_cycle": 4,
    "issues_per_cycle": 4,
    "retires_per_cycle": 4,
}

branchPredParams = {
    "branch_entries": 64,
}

decoderParams = {
    "loader_mode": 1,
    "uop_cache_entries": 1536,
    "predecode_cache_entries": 4,
}

lsqParams = {
    "max_stores": 16,
    "max_loads": 32,
}


# L1 data cache: 64 KiB, 8-way, next-block prefetcher reaching 2 lines ahead.
l1dcacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "64 KiB",
    "prefetcher": "cassini.NextBlockPrefetcher",
    "prefetcher.reach": 2,
    "L1": "1",
}

# L1 instruction cache: 32 KiB, 8-way, prefetching 1 line ahead.
l1icacheParams = {
    "access_latency_cycles": 1,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 8,
    "cache_line_size": cache_line_size,
    "cache_size": "32 KiB",
    "prefetcher": "cassini.NextBlockPrefetcher",
    "prefetcher.reach": 1,
    "L1": "1",
}

# Unified per-core L2.
l2cacheParams = {
    "access_latency_cycles": 8,
    "max_requests_per_cycle": 2,
    "cache_frequency": cpu_clock,
    "replacement_policy": "lru",
    "coherence_protocol": coherence_protocol,
    "associativity": 16,
    "cache_line_size": cache_line_size,
    "cache_size": str(l2cache_size) + "B",
    "mshr_latency_cycles": 3,
}

busParams = {
    "bus_frequency": cpu_clock,
}
212 |
# Directory controller parameters. Its entry cache is sized to track every
# line the combined per-CPU L2s can hold.
dirCtrlParams = {
    "max_requests_per_cycle" : 2,
    "coherence_protocol" : coherence_protocol,
    # Use floor division: Python 3 "/" yields a float (e.g. 32768.0), but
    # this parameter is an integral entry count.
    "entry_cache_size" : l2cache_size*num_cpu_per_node//cache_line_size,
    "cache_line_size" : cache_line_size,
    "addr_range_start" : 0x0,
    "addr_range_end" : memsize - 1
}
221 |
# Build the process argv parameters consumed by VanadisNodeOS:
# "argc" plus "arg1".."argN" (arg0, the executable name, is added elsewhere).
app_params = {}
if app_args != "":
    tokens = app_args.split(" ")
    # +1 because the executable name occupies arg0
    app_params["argc"] = len(tokens) + 1
    for idx, token in enumerate(tokens, start=1):
        app_params["arg" + str(idx)] = token
else:
    app_params["argc"] = 1
236 |
class CPU_Builder:
    """Builds one Vanadis CPU core with its private L1I/L1D caches, TLBs,
    cache bus and L2."""

    def __init__(self):
        pass

    def build( self, nodeId, cpuId ):
        """Create CPU `cpuId` of node `nodeId`.

        Returns (cpu, l2cache, dtlb, itlb); the caller attaches the L2 to the
        node memory network and the TLBs to the node MMU.
        """
        prefix = 'node' + str(nodeId) + '.cpu' + str( cpuId )
        cpu = sst.Component(prefix, vanadis_cpu_type)
        cpu.addParams( cpuParams )
        cpu.addParam( "core_id", cpuId )
        cpu.addParam( "node_id", nodeId )
        # NOTE(review): CPU statistics are enabled unconditionally, unlike the
        # other components which honor the --stats flag — confirm intended.
        cpu.enableAllStatistics()

        # CPU.decoder: one RISC-V decoder (with OS handler and branch unit)
        # per hardware thread.
        for n in range(num_threads_per_cpu):
            decode = cpu.setSubComponent( "decoder"+str(n), "vanadis.VanadisRISCV64Decoder" )
            decode.addParams( decoderParams )

            if enableStats:
                decode.enableAllStatistics()

            # CPU.decoder.osHandler
            os_hdlr = decode.setSubComponent( "os_handler", "vanadis.VanadisRISCV64OSHandler" )

            # CPU.decoder.branch_pred
            branch_pred = decode.setSubComponent( "branch_unit", "vanadis.VanadisBasicBranchUnit" )
            branch_pred.addParams( branchPredParams )

            if enableStats:
                branch_pred.enableAllStatistics()


        # CPU.lsq: the core's load/store queue
        cpu_lsq = cpu.setSubComponent( "lsq", "vanadis.VanadisBasicLoadStoreQueue" )
        cpu_lsq.addParams(lsqParams)
        if enableStats:
            cpu_lsq.enableAllStatistics()


        # Memory interfaces: instruction fetch on the CPU, data on the LSQ.
        icache_if = cpu.setSubComponent( "mem_interface_inst", "memHierarchy.standardInterface" )
        icache_if.addParam("coreId",cpuId)

        dcache_if = cpu_lsq.setSubComponent( "memory_interface", "memHierarchy.standardInterface" )
        dcache_if.addParam("coreId",cpuId)

        # L1 D-Cache
        l1cache = sst.Component(prefix + ".l1dcache", "memHierarchy.Cache")
        l1cache.addParams( l1dcacheParams )
        if enableStats:
            l1cache.enableAllStatistics()

        l1dcache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")
        l1dcache_2_l2cache = l1cache.setSubComponent("memlink", "memHierarchy.MemLink")

        # L1 I-Cache
        l1icache = sst.Component(prefix + ".l1icache", "memHierarchy.Cache")
        l1icache.addParams(l1icacheParams)
        if enableStats:
            l1icache.enableAllStatistics()

        # Bus joining both L1s to the shared L2
        cache_bus = sst.Component(prefix + ".bus", "memHierarchy.Bus")
        cache_bus.addParams(busParams)
        if enableStats:
            cache_bus.enableAllStatistics()

        # L2 Cache
        l2cache = sst.Component(prefix + ".l2cache", "memHierarchy.Cache")
        l2cache.addParams(l2cacheParams)
        if enableStats:
            l2cache.enableAllStatistics()

        l2cache_2_cpu = l2cache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # CPU D-TLB
        dtlbWrapper = sst.Component(prefix+".dtlb", "mmu.tlb_wrapper")
        dtlb = dtlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        dtlb.addParam("num_hardware_threads", num_threads_per_cpu)
        dtlb.addParams(tlbParams)

        # CPU I-TLB ("exe" marks it as serving instruction fetches)
        itlbWrapper = sst.Component(prefix+".itlb", "mmu.tlb_wrapper")
        itlbWrapper.addParam("exe",True)
        itlb = itlbWrapper.setSubComponent("tlb", "mmu.simpleTLB" );
        itlb.addParam("num_hardware_threads", num_threads_per_cpu)
        itlb.addParams(tlbParams)

        # CPU (data) -> D-TLB
        link = sst.Link(prefix+".link_cpu_dtlb")
        link.connect( (dcache_if, "port", "25ps"), (dtlbWrapper, "cpu_if", "25ps") )

        # CPU (instruction) -> I-TLB
        link = sst.Link(prefix+".link_cpu_itlb")
        link.connect( (icache_if, "port", "25ps"), (itlbWrapper, "cpu_if", "25ps") )

        l1icache_2_cpu = l1icache.setSubComponent("cpulink", "memHierarchy.MemLink")
        l1icache_2_l2cache = l1icache.setSubComponent("memlink", "memHierarchy.MemLink")

        # D-TLB -> D-L1
        link = sst.Link(prefix+".link_l1cache")
        link.connect( (dtlbWrapper, "cache_if", "25ps"), (l1dcache_2_cpu, "port", "25ps") )

        # I-TLB -> I-L1
        link = sst.Link(prefix+".link_l1icache")
        link.connect( (itlbWrapper, "cache_if", "25ps"), (l1icache_2_cpu, "port", "25ps") )

        # L1 D-Cache to bus
        link = sst.Link(prefix + ".link_l1dcache_l2cache")
        link.connect( (l1dcache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_0", "1ns") )

        # L1 I-Cache to bus
        link = sst.Link(prefix + ".link_l1icache_l2cache")
        link.connect( (l1icache_2_l2cache, "port", "1ns"), (cache_bus, "high_network_1", "1ns") )

        # BUS to L2 cache
        link = sst.Link(prefix+".link_bus_l2cache")
        link.connect( (cache_bus, "low_network_0", "1ns"), (l2cache_2_cpu, "port", "1ns") )

        return cpu, l2cache, dtlb, itlb
356 |
357 |
def addParamsPrefix(prefix,params):
    """Return a copy of `params` with every key prefixed by "<prefix>."."""
    return { prefix + "." + key: value for key, value in params.items() }
367 |
368 |
369 |
class OS_Builder:
    """Builds the VanadisNodeOS for a node: the OS component, its MMU, the
    simulated process table and the OS's private L1 cache."""
    def __init__(self):
        pass

    def build( self, numNodes, nodeId):
        """Create the node OS for node `nodeId` and return its L1 cache
        (which the caller attaches to the node memory network).
        `numNodes` is accepted for interface parity but unused here."""
        self.prefix = 'node' + str(nodeId)

        self.nodeOS = sst.Component(self.prefix + ".os", "vanadis.VanadisNodeOS")
        self.nodeOS.addParam("node_id", nodeId)
        self.nodeOS.addParams(osParams)
        if enableStats:
            self.nodeOS.enableAllStatistics()

        # One simulated process: (instance count, parameter dict).
        # OMP_NUM_THREADS matches the node's total hardware thread count.
        processList = (
            ( 1, {
                "env_count" : 3,
                "env0" : "OMP_NUM_THREADS={}".format(num_cpu_per_node*num_threads_per_cpu),
                "env1" : "TZ=UTC",
                "env2" : "MV2_ENABLE_AFFINITY=0",
                "exe" : full_exe_name,
                "arg0" : exe_name,
            } ),
        )

        # Merge the command-line argv parameters (argc/argN) into the process.
        processList[0][1].update(app_params)

        num=0
        for i,process in processList:
            for y in range(i):
                self.nodeOS.addParams( addParamsPrefix( "process" + str(num), process ) )
                num+=1

        self.mmu = self.nodeOS.setSubComponent( "mmu", "mmu.simpleMMU" )

        self.mmu.addParams(mmuParams)

        mem_if = self.nodeOS.setSubComponent( "mem_interface", "memHierarchy.standardInterface" )

        l1cache = sst.Component(self.prefix + ".node_os.l1cache", "memHierarchy.Cache")
        l1cache.addParams(osl1cacheParams)

        l1cache_2_cpu = l1cache.setSubComponent("cpulink", "memHierarchy.MemLink")

        # OS memory interface -> OS L1.
        link = sst.Link(self.prefix + ".link_os_l1cache")
        link.connect( (mem_if, "port", "25ps"), (l1cache_2_cpu, "port", "25ps") )

        return l1cache

    def connectCPU( self, core, cpu ):
        """Connect CPU `core`'s OS link to the node OS."""
        link = sst.Link(self.prefix + ".link_core" + str(core) + "_os")
        link.connect( (cpu, "os_link", "5ns"), (self.nodeOS, "core" + str(core), "5ns") )

    def connectTlb( self, core, name, tlblink ):
        """Connect the MMU port for core `core`'s TLB `name` ("dtlb"/"itlb")."""
        linkName = self.prefix + ".link_mmu_core" + str(core) + "_" + name
        link = sst.Link( linkName )
        link.connect( (self.mmu, "core"+str(core)+ "." +name, "25ps"), (tlblink, "mmu", "25ps") )
427 |
class memory_Builder:
    """Builds the per-node memory subsystem: an on-chip merlin router with a
    directory controller and a DRAM controller (simpleMem backend) behind it.
    Additional clients (OS L1, CPU L2s) attach to the router via connect()."""
    def __init__(self):
        pass

    def build( self, nodeId, numPorts, group ):
        """Instantiate router, directory and memory controller for node
        `nodeId`. One port beyond `numPorts` is reserved for the directory,
        which is connected in coherence group `group`."""
        self.prefix = 'node' + str(nodeId)
        self.numPorts = numPorts + 1

        self.chiprtr = sst.Component(self.prefix + ".chiprtr", "merlin.hr_router")
        self.chiprtr.addParam("num_ports", self.numPorts)
        self.chiprtr.addParams(nodeRtrParams)
        self.chiprtr.setSubComponent("topology","merlin.singlerouter")

        if enableStats:
            self.chiprtr.enableAllStatistics()

        # Directory controller occupies the last (extra) router port.
        dirctrl = sst.Component(self.prefix + ".dirctrl", "memHierarchy.DirectoryController")
        dirctrl.addParams(dirCtrlParams)
        dirtoMemLink = dirctrl.setSubComponent("memlink", "memHierarchy.MemLink")
        self.connect( "Dirctrl", self.numPorts -1 , dirctrl, group, linkType="cpulink" )
        if enableStats:
            dirctrl.enableAllStatistics()

        memctrl = sst.Component(self.prefix + ".memory", "memHierarchy.MemController")
        memctrl.addParams(memCtrlParams)
        if enableStats:
            memctrl.enableAllStatistics()

        memToDir = memctrl.setSubComponent("cpulink", "memHierarchy.MemLink")

        memory = memctrl.setSubComponent("backend", "memHierarchy.simpleMem")
        memory.addParams(memBackendParams)

        # Directory -> memory controller link.
        link = sst.Link(self.prefix + ".link_dir_mem")
        link.connect( (dirtoMemLink, "port", "25ps"), (memToDir, "port", "25ps") )

    def connect( self, name, port, comp, group=None, linkType="memlink" ):
        """Attach `comp` to router port `port` through a MemNIC placed in
        coherence group `group`. `name` is informational only."""
        # Note: `assert group` also rejects group=0, not just the None default.
        assert group
        assert port < self.numPorts

        memNIC = comp.setSubComponent(linkType, "memHierarchy.MemNIC")
        memNIC.addParam("group", group)
        memNIC.addParams(memNICParams)

        link = sst.Link(self.prefix + ".link_rtr" + str(port) )
        link.connect( (self.chiprtr, "port" + str(port), "25ps"), (memNIC, "port", "25ps") )
476 |
477 |
class node_Builder():
    """Assembles one complete simulated node: memory network, node OS and
    the requested number of Vanadis CPUs."""
    def __init__(self):
        pass

    def prepParams(self):
        # Kept for interface parity with merlin-style builders; no-op.
        pass

    def build(self, nodeId, extraKeys ):
        """Build node `nodeId`. `extraKeys` and the local `prefix` are
        currently unused."""
        prefix = 'node' + str(nodeId);

        cpuBuilder = CPU_Builder()
        memBuilder = memory_Builder()
        osBuilder = OS_Builder()

        # Router ports: one for the OS L1 plus one per CPU L2.
        numPorts = 1 + num_cpu_per_node
        port = 0
        memBuilder.build(nodeId, numPorts, group=2 )

        # build the Vanadis OS, it returns the OS L1 cache
        osCache = osBuilder.build( 1, nodeId)

        # connect OS L1 to Memory
        #memBuilder.connect( "OS_L1", port, osCache, 1, dest="2" )
        memBuilder.connect( "OS_L1", port, osCache, group=1 )
        port += 1;

        # build the Vanadis CPU block, this returns
        # cpu, L2 cache, DTLB ITLB
        for i in range(num_cpu_per_node):
            cpu, L2, dtlb, itlb = cpuBuilder.build(nodeId, i)

            osBuilder.connectCPU( i, cpu )
            osBuilder.connectTlb( i, "dtlb", dtlb )
            osBuilder.connectTlb( i, "itlb", itlb )

            # connect CPU L2 to Memory
            #memBuilder.connect( "CPU_L2", port, L2, 1, dest="2,3" )
            memBuilder.connect( "CPU_L2", port, L2, group=1 )
            port += 1;
518 |
nodeBuilder = node_Builder()

# Scale-up simulates a single node (node 0).
nodeBuilder.build(0,{})
522 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": ".",
3 | "architectures": [
4 | "LlamaForCausalLM"
5 | ],
6 | "attention_bias": false,
7 | "attention_dropout": 0.0,
8 | "bos_token_id": 128000,
9 | "eos_token_id": [
10 | 128001,
11 | 128008,
12 | 128009
13 | ],
14 | "hidden_act": "silu",
15 | "hidden_size": 16384,
16 | "initializer_range": 0.02,
17 | "intermediate_size": 53248,
18 | "max_position_embeddings": 131072,
19 | "mlp_bias": false,
20 | "model_type": "llama",
21 | "num_attention_heads": 128,
22 | "num_hidden_layers": 126,
23 | "num_key_value_heads": 8,
24 | "pretraining_tp": 1,
25 | "rms_norm_eps": 1e-05,
26 | "rope_scaling": {
27 | "factor": 8.0,
28 | "high_freq_factor": 4.0,
29 | "low_freq_factor": 1.0,
30 | "original_max_position_embeddings": 8192,
31 | "rope_type": "llama3"
32 | },
33 | "rope_theta": 500000.0,
34 | "tie_word_embeddings": false,
35 | "torch_dtype": "bfloat16",
36 | "transformers_version": "4.43.3",
37 | "use_cache": true,
38 | "vocab_size": 128256
39 | }
40 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/small_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "LlamaForCausalLM"
4 | ],
5 | "attention_bias": false,
6 | "attention_dropout": 0.0,
7 | "bos_token_id": 128000,
8 | "eos_token_id": 128001,
9 | "head_dim": 64,
10 | "hidden_act": "silu",
11 | "hidden_size": 2048,
12 | "initializer_range": 0.02,
13 | "intermediate_size": 8192,
14 | "max_position_embeddings": 131072,
15 | "mlp_bias": false,
16 | "model_type": "llama",
17 | "num_attention_heads": 32,
18 | "num_hidden_layers": 16,
19 | "num_key_value_heads": 8,
20 | "pretraining_tp": 1,
21 | "rms_norm_eps": 1e-05,
22 | "rope_scaling": {
23 | "factor": 32.0,
24 | "high_freq_factor": 4.0,
25 | "low_freq_factor": 1.0,
26 | "original_max_position_embeddings": 8192,
27 | "rope_type": "llama3"
28 | },
29 | "rope_theta": 500000.0,
30 | "tie_word_embeddings": true,
31 | "torch_dtype": "bfloat16",
32 | "transformers_version": "4.45.0.dev0",
33 | "use_cache": true,
34 | "vocab_size": 128256
35 | }
36 |
--------------------------------------------------------------------------------
/demo/sst/packet-level-simulation/training_llm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import sst
4 | import os
5 | import math
6 |
7 | from sst.merlin.base import *
8 | from sst.merlin.endpoint import *
9 | from sst.merlin.interface import *
10 | from sst.merlin.topology import *
11 | from sst.ember import *
12 |
13 |
# Link/router parameters shared by the NICs and the network fabric.
params = {
    # in GB/s
    "link_bw": 128,

    "input_buf_size": "4MB",
    "output_buf_size": "4MB",

    "flit_size": "256B",

    "link_lat": "10ns",
    "input_latency": "10ns",
    "output_latency": "10ns",
    "host_link_latency": "100ns",

    "num_vns": 2,
    "width": 2,

    "xbar_arb": "merlin.xbar_arb_lru",
}

# Default LLM configuration: the small Llama config shipped next to this script.
llm_config_default = os.path.join(os.path.dirname(os.path.realpath(__file__)), "small_config.json")
35 |
# Command-line options; when run under sst, arguments after "--" reach this script.
parser = argparse.ArgumentParser(
    prog=f'sst {__file__} --',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--tp", type=int, help="Tensor Parallelism level", default=8)
# Fix: the help strings of --pp and --dp were swapped. --pp is the pipeline
# degree (maps to dragonfly num_groups below) and --dp the data-parallel
# degree (maps to routers_per_group); the 3D motif also receives pp=args.pp
# and dp=args.dp directly.
parser.add_argument("--pp", type=int, help="Pipeline Parallelism level", default=1)
parser.add_argument("--dp", type=int, help="Data Parallelism level", default=1)
parser.add_argument("--batch_size", type=int, help="Number of sequence processed in parallel", default=32)
parser.add_argument("--sequence_len", type=int, help="Number of token per sequence", default=8192)
parser.add_argument("--n_batch", type=int, help="Number of batches", default=128)
parser.add_argument("--llm_config", type=str, help="Configuration file of the Large Language Model", default=llm_config_default)
# NOTE(review): type=int with a float default (78e12) keeps the default as a
# float while command-line values must be plain integer literals ("78e12"
# would be rejected by int()). Left unchanged for interface compatibility.
parser.add_argument("--peak_flop", type=int, help="Peak flop throughput per end point for the targeted data type", default=78e12)
# NOTE(review): "draw_bw" presumably means DRAM bandwidth; the flag name is
# kept as-is so existing invocations keep working.
parser.add_argument("--draw_bw", type=int, help="DRAM bandwidth per end point", default=1555e9)
parser.add_argument("--verbose", type=int, help="Set verbosity", default=0)
parser.add_argument("--log", type=str, help="Enable motif logger", action='store', nargs='?', const="logger")
parser.add_argument("--stats", type=str, help="write statistics, argument changes the filename", nargs="?", const="-")
parser.add_argument("--topology", type=str, help="Network topology", default="single",
                    choices=["single", "dragonfly", "fattree"] )
args = parser.parse_args()
54 |
assert os.path.exists(args.llm_config), "LLM config file does not exist!"

# Total MPI ranks is the product of the three parallelism degrees.
num_ranks = args.tp * args.pp * args.dp
topology = args.topology.lower()

# --stats enables statistic collection; an argument ending in ".csv" sends
# the statistics to that file, anything else goes to the console.
enableStats = bool(args.stats)
if enableStats:
    sst.setStatisticLoadLevel(10)

    stat_params = {"type":"sst.AccumulatorStatistic"}

    fname = args.stats
    if fname.endswith(".csv"):
        sst.setStatisticOutput("sst.statOutputCSV", {"filepath": fname, "separator": ";"})
    else:
        sst.setStatisticOutput("sst.statOutputConsole")
76 |
77 |
# Network topology definition start
PlatformDefinition.setCurrentPlatform("firefly-defaults")

### set up the endpoint
networkif = ReorderLinkControl()
networkif.link_bw = str(params["link_bw"]) + "GB/s"
networkif.input_buf_size = params["input_buf_size"]
networkif.output_buf_size = params["output_buf_size"]

# Either exactly one parallelism dimension is active (1D) or all three (3D).
assert (num_ranks == args.tp or num_ranks == args.pp or num_ranks == args.dp or
        (args.dp > 1 and args.pp > 1 and args.tp > 1)), "Only 1D and 3D parallelism are supported"

ep = EmberMPIJob(0,num_ranks)

ep.ember.verbose = 0

ep.network_interface = networkif
ep.addMotif("Init")

# Select the Ember motif matching the active parallelism scheme; each motif
# models one training strategy of the LLM described by --llm_config.
if args.tp > 1 and args.pp == 1 and args.dp == 1:
    ep.addMotif(f"LLMTensorParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.pp > 1 and args.tp == 1 and args.dp == 1:
    ep.addMotif(f"LLMPipelineParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.dp > 1 and args.tp == 1 and args.pp == 1:
    ep.addMotif(f"LLMDataParallelism batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

if args.dp > 1 and args.tp > 1 and args.pp > 1:
    ep.addMotif(f"LLM3DParallelism tp={args.tp} pp={args.pp} dp={args.dp} batch_size={args.batch_size} sequence_len={args.sequence_len} n_batch={args.n_batch} llm_config={args.llm_config} verbose={args.verbose} draw_bw={args.draw_bw} peak_flop={args.peak_flop}")

ep.addMotif("Fini")
ep.nic.nic2host_lat= params["host_link_latency"]

if args.log:
    ep.enableMotifLog(args.log)

# Topology selection; for dragonfly/fattree the ranks of one tensor-parallel
# group share a router (rank_per_router = args.tp).
if topology == "single":
    topo = topoSingle()
    topo.num_ports = num_ranks
    rank_per_router = num_ranks

elif topology == "dragonfly":
    rank_per_router = args.tp
    topo = topoDragonFly()
    topo.hosts_per_router = rank_per_router

    topo.routers_per_group = args.dp
    topo.num_groups = args.pp

    topo.intergroup_links = params["width"]
    topo.algorithm = "minimal"

elif topology == "fattree":
    rank_per_router = args.tp
    topo = topoFatTree()
    fattree_shape = f"1,1:{args.pp},{args.pp}:{args.dp},{args.dp}:{rank_per_router}"
    topo.host_link_latency = params["host_link_latency"]
    topo.shape = fattree_shape
else:
    print(topology, " unknown!")
    sys.exit()
140 |
# Set up the routers
router = hr_router()
router.link_bw = str(params["link_bw"]) + "GB/s"
router.flit_size = params["flit_size"]
# Crossbar sized so every local rank can drive its full link bandwidth at once.
router.xbar_bw = str(params["link_bw"]*rank_per_router) + "GB/s"
router.input_latency = params["input_latency"]
router.output_latency = params["output_latency"]
router.input_buf_size = params["input_buf_size"]
router.output_buf_size = params["output_buf_size"]
router.num_vns = params["num_vns"]
router.xbar_arb = params["xbar_arb"]


### Setup the topology
topo.link_latency = params["link_lat"]
topo.router = router

system = System()
system.setTopology(topo)
# Place ranks onto nodes in consecutive ("linear") order.
system.allocateNodes(ep,"linear")

if enableStats:
    networkif.enableAllStatistics(stat_params)
    router.enableAllStatistics(stat_params)

system.build()
167 |
--------------------------------------------------------------------------------
/demo/sst/software/.gitignore:
--------------------------------------------------------------------------------
1 | x86
2 | test_gemm.c
3 | test_gemm_t.c
4 | test_softmax.c
5 |
--------------------------------------------------------------------------------
/demo/sst/software/Makefile:
--------------------------------------------------------------------------------
1 | ## This source code is licensed under the MIT license found in the
2 | ## LICENSE file in the root directory of this source tree.
3 | ##
4 | ## Copyright (c) 2025 IMEC. All rights reserved.
5 | ## ******************************************************************************
6 |
# Architectures to build; each has its own <arch>.make fragment.
ARCH_LIST= x86 riscv64

.PHONY: all clean
# Forward the requested goal (all/clean) to every per-arch makefile.
# Use $(MAKE) instead of bare "make" so flags such as -j/-n and the chosen
# make binary propagate to the sub-makes (GNU Make recursive convention).
all clean:
	@for arch in $(ARCH_LIST) ; \
	do \
		$(MAKE) -f $$arch.make $@ || exit 1; \
	done
15 |
--------------------------------------------------------------------------------
/demo/sst/software/check_mpi.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #define WORLD MPI_COMM_WORLD
15 |
16 | #define CHECK_RES
17 |
18 |
19 | //FP32
20 | //#define DATATYPE 0
21 | //FP64
22 | //#define DATATYPE 1
23 | //I8
24 | //#define DATATYPE 2
25 | //I16
26 | //#define DATATYPE 3
27 | //I32
28 | #define DATATYPE 4
29 |
30 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
31 |
32 |
33 | #if DATATYPE == 0
34 | typedef float data_t;
35 | MPI_Datatype mpi_data_type = MPI_FLOAT;
36 | static data_type_e data_type = FP32;
37 | #define TYPE_IS_FP
38 | #elif DATATYPE == 1
39 | typedef double data_t;
40 | MPI_Datatype mpi_data_type = MPI_DOUBLE;
41 | static data_type_e data_type = FP64;
42 | #define TYPE_IS_FP
43 | #elif DATATYPE == 2
44 | typedef int8_t data_t;
45 | MPI_Datatype mpi_data_type = MPI_INT8_T;
46 | static data_type_e data_type = I8;
47 | #define TYPE_IS_INT
48 | #elif DATATYPE == 3
49 | typedef int16_t data_t;
50 | MPI_Datatype mpi_data_type = MPI_INT16_T;
51 | static data_type_e data_type = I16;
52 | #define TYPE_IS_INT
53 | #elif DATATYPE == 4
54 | typedef int32_t data_t;
55 | MPI_Datatype mpi_data_type = MPI_INT32_T;
56 | static data_type_e data_type = I32;
57 | #define TYPE_IS_INT
58 | #else
59 | #error Unsupported choice setting
60 | #endif
61 |
62 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
63 |
64 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
65 | int stride_0, int stride_1, int stride_2);
66 |
67 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
68 | int stride_0, int stride_1, int stride_2);
69 |
70 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
71 |
72 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
73 |
74 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
75 |
76 | #ifdef CHECK_RES
77 |
78 | void gemm_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
79 | int stride_0, int stride_1, int stride_2);
80 |
81 | void gemm_t_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k,
82 | int stride_0, int stride_1, int stride_2);
83 |
84 | void scale_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
85 |
86 | void add_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
87 |
88 | void softmax_ref(void * dst, void * src, data_type_e data_type, int m, int n);
89 |
90 | #if defined(TYPE_IS_INT)
    | // Integer comparison used by the CHECK_RES pass: nonzero iff rhs != lhs
    | // (returns the signed difference; rhs/lhs are promoted to int first).
    | // NOTE(review): the subtraction could overflow for extreme int32 inputs;
    | // the tensors here are initialised from a small range, so this is benign -- confirm.
91 | static int cmp(data_t rhs, data_t lhs) {
92 | return (rhs) - (lhs);
93 | }
    | // Report one mismatch between the reference result and the MPI result.
94 | static void print_diff(int count, int pos, data_t ref, data_t res) {
95 | printf("Difference %d at %d ref = %d mpi = %d\n", count, pos, ref, res);
96 | }
97 | #elif defined(TYPE_IS_FP)
98 | static int cmp(data_t rhs, data_t lhs) {
99 | int ret = 0;
100 | data_t diff = roundf(rhs) - roundf(lhs);
101 | ret = diff > 1.0 ? 1 : 0;
102 | return ret;
103 | }
104 | static void print_diff(int count, int pos, data_t ref, data_t res) {
105 | printf("Difference %d at %d ref = %.4f mpi = %.4f\n", count, pos, ref, res);
106 | }
107 | #else
108 | #error Unsupported choice setting
109 | #endif
110 |
111 | #endif // CHECK_RES
112 |
113 | int main(int argc, char ** argv) {
114 | const int root = 0;
115 | int n_ranks, rank;
116 | MPI_Request emb_req, Qw_req, Kw_req, Vw_req, attn_w_req;
117 |
118 | MPI_Init(&argc, &argv);
119 | MPI_Comm_size(WORLD, &n_ranks);
120 | MPI_Comm_rank(WORLD, &rank);
121 |
122 | MPI_Datatype col, col_type;
123 |
124 | if(argc != 4) {
125 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
126 | exit(EXIT_FAILURE);
127 | }
128 |
129 | struct timespec start, end;
130 | double time_elapsed_s;
131 | int dmodel = atoi(argv[1]);
132 | int h = atoi(argv[3]);
133 | int S = atoi(argv[2]);
134 | int dk = dmodel/h;
135 | int dv = dmodel/h;
136 |
137 | if((dmodel%n_ranks) != 0) {
138 | fprintf(stderr, "Error: dmodel must be a multiple of the number of ranks (dmodel: %d, rank: %d)\n", dmodel, rank);
139 | exit(EXIT_FAILURE);
140 | }
141 |
142 | fprintf(stdout, "Model n_ranks: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
143 |
144 |
145 | MPI_Type_vector(dmodel, dmodel/n_ranks, dmodel, mpi_data_type, &col);
146 | MPI_Type_commit(&col);
147 | MPI_Type_create_resized(col, 0, dmodel/n_ranks*sizeof(data_t), &col_type);
148 | MPI_Type_commit(&col_type);
149 |
150 |
151 | assert(n_ranks <= h && (h % n_ranks) == 0);
152 |
153 | #ifdef CHECK_RES
154 | data_t * embeddings_ref = NULL;
155 | #endif
156 |
157 | data_t *embeddings = NULL;
158 | data_t * Qw = NULL;
159 | data_t * Kw = NULL;
160 | data_t * Vw = NULL;
161 |
162 | data_t * Qw_heads = NULL;
163 | data_t * Kw_heads = NULL;
164 | data_t * Vw_heads = NULL;
165 |
166 | data_t * Q = NULL;
167 | data_t * K = NULL;
168 | data_t * V = NULL;
169 |
170 | data_t * KQ = NULL;
171 | data_t * softmax_out = NULL;
172 |
173 | data_t * QKV = NULL;
174 |
175 | data_t * ATTNw = NULL;
176 | data_t * ATTNout = NULL;
177 |
178 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
179 |
180 | srand(time(NULL));
181 |
182 | clock_gettime(CLOCK_MONOTONIC, &start);
183 |
184 | embeddings = calloc(dmodel*S, sizeof(data_t));
185 |
186 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
187 |
188 | if(rank == root) {
189 | init_random_tensor(embeddings, data_type, dmodel*S);
190 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
191 | #ifdef CHECK_RES
192 | embeddings_ref = calloc(S*dmodel,sizeof(data_t));
193 | memcpy(embeddings_ref, embeddings, S*dmodel*sizeof(data_t));
194 | #endif
195 | }
196 |
197 | MPI_Ibcast(embeddings, dmodel*S, mpi_data_type, root, WORLD, &emb_req);
198 | MPI_Ibcast(ATTNw, dmodel*dmodel, mpi_data_type, root, WORLD, &attn_w_req);
199 |
200 | if(rank == root) {
201 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
202 | init_random_tensor(Qw, data_type, dmodel*dmodel);
203 |
204 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
205 | init_random_tensor(Kw, data_type, dmodel*dmodel);
206 |
207 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
208 | init_random_tensor(Vw, data_type, dmodel*dmodel);
209 | }
210 |
211 | Qw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
212 | Kw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
213 | Vw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
214 |
215 | MPI_Iscatter(Qw, 1, col_type, Qw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Qw_req);
216 | MPI_Iscatter(Kw, 1, col_type, Kw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Kw_req);
217 | MPI_Iscatter(Vw, 1, col_type, Vw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD, &Vw_req);
218 |
219 | Q = calloc(S*dmodel/n_ranks, sizeof(data_t));
220 | memset(Q, 0, S*dmodel*sizeof(data_t)/n_ranks);
221 |
222 | K = calloc(S*dmodel/n_ranks, sizeof(data_t));
223 | memset(K, 0, S*dmodel*sizeof(data_t)/n_ranks);
224 |
225 | V = calloc(S*dmodel/n_ranks, sizeof(data_t));
226 | memset(V, 0, S*dmodel*sizeof(data_t)/n_ranks);
227 |
228 | KQ = calloc(h/n_ranks*S*S, sizeof(data_t));
229 | memset(KQ, 0, h/n_ranks*S*S*sizeof(data_t));
230 |
231 | softmax_out = calloc(h/n_ranks*S*S, sizeof(data_t));
232 | memset(softmax_out, 0, h/n_ranks*S*S*sizeof(data_t));
233 |
234 | QKV = calloc(S/n_ranks*dmodel, sizeof(data_t));
235 | memset(QKV, 0, S/n_ranks*dmodel*sizeof(data_t));
236 |
237 | ATTNout = calloc(S*dmodel, sizeof(data_t));
238 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
239 |
240 | MPI_Wait(&emb_req, MPI_STATUS_IGNORE);
241 | MPI_Wait(&Qw_req, MPI_STATUS_IGNORE);
242 | MPI_Wait(&Kw_req, MPI_STATUS_IGNORE);
243 | MPI_Wait(&Vw_req, MPI_STATUS_IGNORE);
244 | MPI_Wait(&attn_w_req, MPI_STATUS_IGNORE);
245 |
246 | clock_gettime(CLOCK_MONOTONIC, &end);
247 |
248 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
249 | printf("[rank: %d] Init time: %.2f ms\n", rank, time_elapsed_s * 1000);
250 |
251 | clock_gettime(CLOCK_MONOTONIC, &start);
252 | /* MHA */
253 |
254 | gemm(Q, embeddings, Qw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
255 | gemm(K, embeddings, Kw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
256 | gemm(V, embeddings, Vw_heads, data_type, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
257 |
258 | for(int i = 0; i < h/n_ranks; i++) {
259 | gemm_t(&KQ[S*S*i], &Q[dmodel/h*i], K, data_type, S, S, dmodel/h, S, dmodel/n_ranks, dmodel/n_ranks);
260 | }
261 |
262 | scale(KQ, KQ, ((void*)&scale_f), data_type, h/n_ranks*S, S);
263 |
264 | softmax(softmax_out, KQ, data_type, h/n_ranks*S, S);
265 |
266 | for(int i = 0; i < h/n_ranks; i++) {
267 | gemm(&QKV[dmodel/h*i], &softmax_out[S*S*i], &V[dmodel/h*i], data_type, S, dmodel/h, S, dmodel/n_ranks, S, dmodel/n_ranks);
268 | }
269 |
270 | gemm(ATTNout, QKV, ATTNw, data_type, S, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel);
271 |
272 | add(&ATTNout[S/n_ranks*rank*dmodel], &ATTNout[S/n_ranks*rank*dmodel], &embeddings[S/n_ranks*rank*dmodel], data_type, S/n_ranks, dmodel);
273 |
274 |
275 | MPI_Allreduce(ATTNout, embeddings, S*dmodel, mpi_data_type, MPI_SUM, WORLD);
276 |
277 | clock_gettime(CLOCK_MONOTONIC, &end);
278 |
279 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
280 | printf("[rank: %d] MHA execution time: %.2f ms\n", rank, time_elapsed_s * 1000);
281 |
282 | #ifdef CHECK_RES
283 | if(rank == root) {
284 | data_t * Q_ref = calloc(S*dmodel, sizeof(data_t));
285 | assert(Q_ref);
286 | memset(Q_ref, 0, S*dmodel*sizeof(data_t));
287 |
288 | data_t * K_ref = calloc(S*dmodel, sizeof(data_t));
289 | assert(K_ref);
290 | memset(K_ref, 0, S*dmodel*sizeof(data_t));
291 |
292 | data_t * V_ref = calloc(S*dmodel, sizeof(data_t));
293 | assert(V_ref);
294 | memset(V_ref, 0, S*dmodel*sizeof(data_t));
295 |
296 | data_t * KQ_ref = calloc(S*S*h, sizeof(data_t));
297 | assert(KQ_ref);
298 | memset(KQ_ref, 0, h*S*S*sizeof(data_t));
299 |
300 | data_t * softmax_out_ref = calloc(h*S*S, sizeof(data_t));
301 | assert(softmax_out_ref);
302 | memset(softmax_out_ref, 0, h*S*S*sizeof(data_t));
303 |
304 | data_t * QKV_ref = calloc(S*dmodel, sizeof(data_t));
305 | assert(QKV_ref);
306 | memset(QKV_ref, 0, S*dmodel*sizeof(data_t));
307 |
308 | data_t * ATTNout_ref = calloc(S*dmodel, sizeof(data_t));
309 | assert(ATTNout_ref);
310 | memset(ATTNout_ref, 0, S*dmodel*sizeof(data_t));
311 |
312 | gemm_ref(Q_ref, embeddings_ref, Qw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
313 | gemm_ref(K_ref, embeddings_ref, Kw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
314 | gemm_ref(V_ref, embeddings_ref, Vw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
315 |
316 | for(int i = 0; i < h; i++) {
317 | gemm_t_ref(&KQ_ref[S*S*i], &Q_ref[dmodel/h*i], K_ref, data_type, S, S, dmodel/h, S, dmodel, dmodel);
318 | }
319 |
320 | scale_ref(KQ_ref, KQ_ref, ((void*)&scale_f), data_type, S*h, S);
321 |
322 | softmax_ref(softmax_out_ref, KQ_ref, data_type, S*h, S);
323 |
324 | for(int i = 0; i < h; i++) {
325 | gemm_ref(&QKV_ref[dmodel/h*i], &softmax_out_ref[S*S*i], &V_ref[dmodel/h*i], data_type, S, dmodel/h, S, dmodel, S, dmodel);
326 | }
327 |
328 | gemm_ref(ATTNout_ref, QKV_ref, ATTNw, data_type, S, dmodel, dmodel, dmodel, dmodel, dmodel);
329 |
330 | add_ref(embeddings_ref, embeddings_ref, ATTNout_ref, data_type, S, dmodel);
331 |
332 | int count = 0;
333 | for(int i = 0; i < S*dmodel; i++)
334 | if(cmp(embeddings_ref[i], embeddings[i]) != 0)
335 | print_diff(count++, i, embeddings_ref[i], embeddings[i]);
336 |
337 | free(Q_ref);
338 | free(K_ref);
339 | free(V_ref);
340 | free(KQ_ref);
341 | free(softmax_out_ref);
342 | free(QKV_ref);
343 | free(embeddings_ref);
344 | free(ATTNout_ref);
345 | }
346 | #endif
347 |
348 | if(rank == root) {
349 | free(Qw);
350 | free(Kw);
351 | free(Vw);
352 | }
353 |
354 | free(embeddings);
355 | free(Qw_heads);
356 | free(Kw_heads);
357 | free(Vw_heads);
358 | free(Q);
359 | free(K);
360 | free(V);
361 | free(KQ);
362 | free(softmax_out);
363 | free(QKV);
364 | free(ATTNw);
365 | free(ATTNout);
366 |
367 | MPI_Finalize();
368 | return 0;
369 | }
370 |
371 | static size_t get_element_size(data_type_e type) {
372 | size_t size;
373 | switch (type) {
374 | case I8:
375 | size = sizeof(uint8_t);
376 | break;
377 | case I16:
378 | size = sizeof(uint16_t);
379 | break;
380 | case I32:
381 | case FP32:
382 | size = sizeof(uint32_t);
383 | break;
384 | case FP64:
385 | size = sizeof(uint64_t);
386 | break;
387 | default:
388 | size = -1;
389 | break;
390 | }
391 |
392 | assert(size > 0 && "data type unknown");
393 |
394 | return size;
395 | }
396 |
397 |
398 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
399 | const data_t range = 10;
400 | #pragma omp parallel for shared (tensor)
401 | for(int i = 0; i < nmemb; i++)
402 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
403 | }
404 |
405 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
406 | assert(tensor);
407 | init_random_tensor_impl(((data_t*)tensor), nmemb);
408 | }
409 |
410 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
411 | const int bsize = 64;
412 | int ii0, ii1, ii2;
413 | int i0, i1, i2;
414 | data_t pp;
415 | #pragma omp parallel for shared (dst, src1, src2) private(i0,i1,i2,ii0,ii1,ii2,pp) collapse(3)
416 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
478 |
479 | sum = 0.0;
480 | for(j = 0; j < n; j++) {
481 | const data_t e = expf(src[i*n+j] - max);
482 | sum += e;
483 | dst[i*n+j] = e;
484 | }
485 |
486 | for(j = 0; j < n; j++) {
487 | dst[i*n+j] *= sum;
488 | }
489 | }
490 | }
491 |
492 |
    | // Type-erased entry points for the OpenMP kernels: validate pointers, then
    | // dispatch to the data_t implementations above. The data_type argument is
    | // unused -- the concrete element type is fixed at compile time via DATATYPE.
    | // Strides are row strides (in elements) for dst/src1/src2 respectively.
493 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
494 | assert(dst);
495 | assert(src1);
496 | assert(src2);
497 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
498 | }
499 |
    | // GEMM with src2 accessed transposed (dst += src1 * src2^T).
500 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
501 | assert(dst);
502 | assert(src1);
503 | assert(src2);
504 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
505 | }
506 |
    | // Elementwise scale; src2 points to a single scalar factor.
507 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
508 | assert(dst);
509 | assert(src1);
510 | assert(src2);
511 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
512 | }
513 |
    | // Elementwise addition of two m x n buffers.
514 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
515 | assert(dst);
516 | assert(src1);
517 | assert(src2);
518 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
519 | }
520 |
521 |
    | // Row-wise softmax over m rows of length n.
522 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
523 | assert(dst);
524 | assert(src);
525 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
526 | }
527 |
528 | #ifdef CHECK_RES
529 |
530 | static void gemm_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
531 | int i0 = 0, i1 = 0, i2 = 0;
532 | for (i0 = 0; i0 < m; ++i0) {
533 | for (i1 = 0; i1 < n; ++i1) {
534 | for (i2 = 0; i2 < k; ++i2) {
535 | dst[i0*(stride_0)+i1] += src1[i0*(stride_1)+i2] * src2[i2*stride_2+i1];
536 | }
537 | }
538 | }
539 | }
540 |
541 | static void gemm_t_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
542 | int i0 = 0, i1 = 0, i2 = 0;
543 | for (i0 = 0; i0 < m; ++i0) {
544 | for (i1 = 0; i1 < n; ++i1) {
545 | for (i2 = 0; i2 < k; ++i2) {
546 | dst[i0*stride_0+i1] += src1[i0*(stride_1)+i2] * src2[i1*stride_2+i2];
547 | }
548 | }
549 | }
550 | }
551 |
552 | static void scale_ref_impl(data_t * dst, const data_t * src1, const data_t src2, int m, int n) {
553 | int i, j;
554 | for(i = 0; i < m*n; i++)
555 | dst[i] = src1[i] * src2;
556 | }
557 |
558 | static void add_ref_impl(data_t * dst, const data_t * src1, const data_t * src2, int m, int n) {
559 | int i, j;
560 | for(i = 0; i < m*n; i++)
561 | dst[i] = src1[i] + src2[i];
562 | }
563 |
    | // Reference row-wise "softmax" over m rows of length n, mirroring the
    | // parallel softmax_impl so CHECK_RES compares like against like.
    | // NOTE(review): max is seeded with FLT_MIN (smallest positive float), not
    | // -FLT_MAX, so an all-negative row keeps max == FLT_MIN -- confirm intended.
    | // NOTE(review): the final loop MULTIPLIES by sum; textbook softmax divides
    | // (dst /= sum). The parallel implementation does the same, so the ref-vs-MPI
    | // comparison still matches, but the result is not a normalised softmax.
564 | static void softmax_ref_impl(data_t * dst, const data_t * src, int m, int n) {
565 | int i, j;
566 | data_t max, sum;
567 | for(i = 0; i < m; i++) {
568 | max = FLT_MIN;
569 | for(j = 0; j < n; j++)
570 | max = (max > src[i*n+j]) ? max : src[i*n+j];
571 |
572 | sum = 0.0;
573 | for(j = 0; j < n; j++) {
574 | const data_t e = expf(src[i*n+j] - max); // subtract row max for numerical stability
575 | sum += e;
576 | dst[i*n+j] = e;
577 | }
578 |
579 | for(j = 0; j < n; j++) {
580 | dst[i*n+j] *= sum;
581 | }
582 | }
583 | }
584 |
585 |
    | // Type-erased entry points for the serial reference kernels: validate
    | // pointers, then dispatch to the data_t implementations above. data_type is
    | // unused -- the concrete element type is fixed at compile time via DATATYPE.
586 | void gemm_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
587 | assert(dst);
588 | assert(src1);
589 | assert(src2);
590 | gemm_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
591 | }
592 |
    | // Reference GEMM with src2 accessed transposed.
593 | void gemm_t_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n, int k, int stride_0, int stride_1, int stride_2) {
594 | assert(dst);
595 | assert(src1);
596 | assert(src2);
597 | gemm_t_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n, k, stride_0, stride_1, stride_2);
598 | }
599 |
    | // Reference elementwise scale; src2 points to a single scalar factor.
600 | void scale_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
601 | assert(dst);
602 | assert(src1);
603 | assert(src2);
604 | scale_ref_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
605 | }
606 |
    | // Reference elementwise addition.
607 | void add_ref(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
608 | assert(dst);
609 | assert(src1);
610 | assert(src2);
611 | add_ref_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
612 | }
613 |
614 |
    | // Reference row-wise softmax.
615 | void softmax_ref(void * dst, void * src, data_type_e data_type, int m, int n) {
616 | assert(dst);
617 | assert(src);
618 | softmax_ref_impl(((data_t*)dst), ((data_t*)src), m, n);
619 | }
620 |
621 | #endif
622 |
--------------------------------------------------------------------------------
/demo/sst/software/gemm_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #ifndef TILE_SIZE
15 | #define TILE_SIZE 16
16 | #endif
17 |
18 | //FP32
19 | //#define DATATYPE 0
20 | //FP64
21 | //#define DATATYPE 1
22 | //I8
23 | //#define DATATYPE 2
24 | //I16
25 | //#define DATATYPE 3
26 | //I32
27 | #define DATATYPE 4
28 |
29 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | static data_type_e data_type = FP32;
34 | #define DATA_MIN FLT_MIN
35 | #define TYPE_IS_FP
36 | #elif DATATYPE == 1
37 | typedef double data_t;
38 | static data_type_e data_type = FP64;
39 | #define DATA_MIN DBL_MIN
40 | #define TYPE_IS_FP
41 | #elif DATATYPE == 2
42 | typedef int8_t data_t;
43 | static data_type_e data_type = I8;
44 | #define DATA_MIN CHAR_MIN
45 | #define TYPE_IS_INT
46 | #elif DATATYPE == 3
47 | typedef int16_t data_t;
48 | static data_type_e data_type = I16;
49 | #define DATA_MIN SHRT_MIN
50 | #define TYPE_IS_INT
51 | #elif DATATYPE == 4
52 | typedef int32_t data_t;
53 | static data_type_e data_type = I32;
54 | #define DATA_MIN INT_MIN
55 | #define TYPE_IS_INT
56 | #else
57 | #error Unsupported choice setting
58 | #endif
59 |
60 |
61 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
62 |
63 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
64 | int stride_0, int stride_1, int stride_2);
65 | int main(int argc, char ** argv) {
66 |
67 | if(argc != 4) {
68 | fprintf(stderr, "Usage: %s M N K\n", argv[0]);
69 | exit(EXIT_FAILURE);
70 | }
71 |
72 | struct timespec start, end;
73 | double time_elapsed_s;
74 | int m = atoi(argv[1]);
75 | int n = atoi(argv[3]);
76 | int k = atoi(argv[2]);
77 |
78 | fprintf(stdout, "M: %d, N: %d, K: %d\n", m, n, k);
79 |
80 | data_t * A = NULL;
81 | data_t * B = NULL;
82 | data_t * C = NULL;
83 |
84 | srand(time(NULL));
85 |
86 | clock_gettime(CLOCK_MONOTONIC, &start);
87 |
88 | A = calloc(m*k, sizeof(data_t));
89 | B = calloc(k*n, sizeof(data_t));
90 | C = calloc(m*n, sizeof(data_t));
91 |
92 |
93 | init_random_tensor(A, data_type, m*k);
94 | init_random_tensor(B, data_type, n*k);
95 |
96 | clock_gettime(CLOCK_MONOTONIC, &end);
97 |
98 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
99 | printf("Init time: %.2f ms\n", time_elapsed_s * 1000);
100 |
101 | clock_gettime(CLOCK_MONOTONIC, &start);
102 | /* MHA */
103 |
104 | gemm(C, A, B, data_type, 1, m, n, k, n, k, n);
105 |
106 | clock_gettime(CLOCK_MONOTONIC, &end);
107 |
108 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
109 | const uint64_t flop_count = m*n*k*2;
110 |
111 | printf("Execution time: %.2f ms flop count: %lu\n", time_elapsed_s * 1000, flop_count);
112 |
113 | free(A);
114 | free(B);
115 | free(C);
116 |
117 | return 0;
118 | }
119 |
120 | static size_t get_element_size(data_type_e type) {
121 | size_t size;
122 | switch (type) {
123 | case I8:
124 | size = sizeof(uint8_t);
125 | break;
126 | case I16:
127 | size = sizeof(uint16_t);
128 | break;
129 | case I32:
130 | case FP32:
131 | size = sizeof(uint32_t);
132 | break;
133 | case FP64:
134 | size = sizeof(uint64_t);
135 | break;
136 | default:
137 | size = -1;
138 | break;
139 | }
140 |
141 | assert(size > 0 && "data type unknown");
142 |
143 | return size;
144 | }
145 |
146 |
147 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
148 | const data_t range = 10;
149 | #pragma omp parallel for shared (tensor)
150 | for(int i = 0; i < nmemb; i++)
151 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
152 | }
153 |
154 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
155 | assert(tensor);
156 | init_random_tensor_impl(((data_t*)tensor), nmemb);
157 | }
158 |
159 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
160 | int stride_0, int stride_1, int stride_2) {
161 | const int bsize = TILE_SIZE;
162 | int ii0, ii1, ii2;
163 | int i0, i1, i2;
164 | int h;
165 | int start_head, stop_head;
166 | data_t pp;
167 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
168 | for (h=0; h < heads; h++) {
169 | for (ii0 = 0; ii0
2 | #include
3 |
4 | int main(int argc, char ** argv) {
5 |
6 | int n_ranks, rid;
7 |
8 | printf("Initializing MPI\n");
9 |
10 | MPI_Init(&argc, &argv);
11 | MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &rid);
13 |
14 | printf("Hello from process %d out of %d\n", rid, n_ranks);
15 |
16 | MPI_Finalize();
17 |
18 | return 0;
19 | }
20 |
--------------------------------------------------------------------------------
/demo/sst/software/hello_MPI_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 |
6 | int main(int argc, char ** argv) {
7 |
8 | int n_ranks, rid;
9 | int n_threads = 0, tid = 0;
10 |
11 |
12 | printf("Initializing MPI\n");
13 |
14 | MPI_Init(&argc, &argv);
15 | MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);
16 | MPI_Comm_rank(MPI_COMM_WORLD, &rid);
17 |
18 | printf("[rank %d] Entering OMP section\n", rid);
19 |
20 | #pragma omp parallel private(tid, n_threads)
21 | {
22 | n_threads = omp_get_num_threads();
23 | tid = omp_get_thread_num();
24 | printf("Hello from thread %d out of %d from process %d out of %d\n",
25 | tid, n_threads, rid, n_ranks);
26 | }
27 |
28 | MPI_Finalize();
29 |
30 | return 0;
31 | }
32 |
--------------------------------------------------------------------------------
/demo/sst/software/mha_MPI_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
14 |
15 | #define WORLD MPI_COMM_WORLD
16 |
17 | //FP32
18 | //#define DATATYPE 0
19 | //FP64
20 | //#define DATATYPE 1
21 | //I8
22 | //#define DATATYPE 2
23 | //I16
24 | //#define DATATYPE 3
25 | //I32
26 | #define DATATYPE 4
27 |
28 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
29 |
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | MPI_Datatype mpi_data_type = MPI_FLOAT;
34 | static data_type_e data_type = FP32;
35 | #define DATA_MIN FLT_MIN
36 | #define TYPE_IS_FP
37 | #elif DATATYPE == 1
38 | typedef double data_t;
39 | MPI_Datatype mpi_data_type = MPI_DOUBLE;
40 | static data_type_e data_type = FP64;
41 | #define DATA_MIN DBL_MIN
42 | #define TYPE_IS_FP
43 | #elif DATATYPE == 2
44 | typedef int8_t data_t;
45 | MPI_Datatype mpi_data_type = MPI_INT8_T;
46 | static data_type_e data_type = I8;
47 | #define DATA_MIN CHAR_MIN
48 | #define TYPE_IS_INT
49 | #elif DATATYPE == 3
50 | typedef int16_t data_t;
51 | MPI_Datatype mpi_data_type = MPI_INT16_T;
52 | static data_type_e data_type = I16;
53 | #define DATA_MIN SHRT_MIN
54 | #define TYPE_IS_INT
55 | #elif DATATYPE == 4
56 | typedef int32_t data_t;
57 | MPI_Datatype mpi_data_type = MPI_INT32_T;
58 | static data_type_e data_type = I32;
59 | #define DATA_MIN INT_MIN
60 | #define TYPE_IS_INT
61 | #else
62 | #error Unsupported choice setting
63 | #endif
64 |
65 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
66 |
67 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
68 | int stride_0, int stride_1, int stride_2);
69 |
70 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
71 | int stride_0, int stride_1, int stride_2);
72 |
73 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
74 |
75 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
76 |
77 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
78 |
79 | int main(int argc, char ** argv) {
80 | const int root = 0;
81 | int n_ranks, rank;
82 |
83 | MPI_Init(&argc, &argv);
84 | MPI_Comm_size(WORLD, &n_ranks);
85 | MPI_Comm_rank(WORLD, &rank);
86 |
87 | MPI_Datatype col, col_type;
88 |
89 | if(argc != 4) {
90 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
91 | exit(EXIT_FAILURE);
92 | }
93 |
94 | struct timespec start, end;
95 | double time_elapsed_s;
96 | int dmodel = atoi(argv[1]);
97 | int h = atoi(argv[3]);
98 | int S = atoi(argv[2]);
99 | int dk = dmodel/h;
100 | int dv = dmodel/h;
101 |
102 | if((dmodel%n_ranks) != 0) {
103 | fprintf(stderr, "Error: dmodel must be a multiple of the number of ranks (dmodel: %d, rank: %d)\n", dmodel, rank);
104 | exit(EXIT_FAILURE);
105 | }
106 |
107 | if(n_ranks > h || (h % n_ranks) != 0) {
108 | fprintf(stderr, "Error: the number of heads must be a multiple of the number of MPI \
109 | ranks and the number of heads must be equal or greater than the \
110 | number of ranks. (heads = %d, n_ranks=%s\n", h, n_ranks);
111 | exit(EXIT_FAILURE);
112 | }
113 |
114 | fprintf(stdout, "Model dim: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
115 |
116 | MPI_Type_vector(dmodel, dmodel/n_ranks, dmodel, mpi_data_type, &col);
117 | MPI_Type_commit(&col);
118 | MPI_Type_create_resized(col, 0, dmodel/n_ranks*sizeof(data_t), &col_type);
119 | MPI_Type_commit(&col_type);
120 |
121 | data_t *embeddings = NULL;
122 | data_t * Qw = NULL;
123 | data_t * Kw = NULL;
124 | data_t * Vw = NULL;
125 |
126 | data_t * Qw_heads = NULL;
127 | data_t * Kw_heads = NULL;
128 | data_t * Vw_heads = NULL;
129 |
130 | data_t * Q = NULL;
131 | data_t * K = NULL;
132 | data_t * V = NULL;
133 |
134 | data_t * KQ = NULL;
135 | data_t * softmax_out = NULL;
136 |
137 | data_t * QKV = NULL;
138 |
139 | data_t * ATTNw = NULL;
140 | data_t * ATTNout = NULL;
141 |
142 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
143 |
144 | srand(time(NULL));
145 |
146 | clock_gettime(CLOCK_MONOTONIC, &start);
147 |
148 | embeddings = calloc(dmodel*S, sizeof(data_t));
149 |
150 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
151 |
152 | if(rank == root) {
153 | init_random_tensor(embeddings, data_type, dmodel*S);
154 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
155 | }
156 |
157 | MPI_Bcast(embeddings, dmodel*S, mpi_data_type, root, WORLD);
158 | MPI_Bcast(ATTNw, dmodel*dmodel, mpi_data_type, root, WORLD);
159 |
160 | if(rank == root) {
161 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
162 | init_random_tensor(Qw, data_type, dmodel*dmodel);
163 |
164 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
165 | init_random_tensor(Kw, data_type, dmodel*dmodel);
166 |
167 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
168 | init_random_tensor(Vw, data_type, dmodel*dmodel);
169 | }
170 |
171 | Qw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
172 | Kw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
173 | Vw_heads = calloc(dmodel*dmodel/n_ranks, sizeof(data_t));
174 |
175 | MPI_Scatter(Qw, 1, col_type, Qw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
176 | MPI_Scatter(Kw, 1, col_type, Kw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
177 | MPI_Scatter(Vw, 1, col_type, Vw_heads, dmodel*dmodel/n_ranks, mpi_data_type, root, WORLD);
178 |
179 | Q = calloc(S*dmodel/n_ranks, sizeof(data_t));
180 | memset(Q, 0, S*dmodel*sizeof(data_t)/n_ranks);
181 |
182 | K = calloc(S*dmodel/n_ranks, sizeof(data_t));
183 | memset(K, 0, S*dmodel*sizeof(data_t)/n_ranks);
184 |
185 | V = calloc(S*dmodel/n_ranks, sizeof(data_t));
186 | memset(V, 0, S*dmodel*sizeof(data_t)/n_ranks);
187 |
188 | KQ = calloc(h/n_ranks*S*S, sizeof(data_t));
189 | memset(KQ, 0, h/n_ranks*S*S*sizeof(data_t));
190 |
191 | softmax_out = calloc(h/n_ranks*S*S, sizeof(data_t));
192 | memset(softmax_out, 0, h/n_ranks*S*S*sizeof(data_t));
193 |
194 | QKV = calloc(S/n_ranks*dmodel, sizeof(data_t));
195 | memset(QKV, 0, S/n_ranks*dmodel*sizeof(data_t));
196 |
197 | ATTNout = calloc(S*dmodel, sizeof(data_t));
198 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
199 |
200 | clock_gettime(CLOCK_MONOTONIC, &end);
201 |
202 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
203 | printf("[rank: %d] Init time: %.2f ms\n", rank, time_elapsed_s * 1000);
204 |
205 | clock_gettime(CLOCK_MONOTONIC, &start);
206 | /* MHA */
207 |
208 | gemm(Q, embeddings, Qw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
209 | gemm(K, embeddings, Kw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
210 | gemm(V, embeddings, Vw_heads, data_type, 1, S, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks);
211 |
212 | gemm_t(KQ, Q, K, data_type, h/n_ranks, S, S, dmodel/h, S, dmodel/n_ranks, dmodel/n_ranks);
213 |
214 | scale(KQ, KQ, ((void*)&scale_f), data_type, h/n_ranks*S, S);
215 |
216 | softmax(softmax_out, KQ, data_type, h/n_ranks*S, S);
217 |
218 | gemm(QKV, softmax_out, V, data_type, h/n_ranks, S, dmodel/h, S, dmodel/n_ranks, S, dmodel/n_ranks);
219 |
220 | gemm(ATTNout, QKV, ATTNw, data_type, 1, S, dmodel, dmodel/n_ranks, dmodel, dmodel/n_ranks, dmodel);
221 |
222 | add(&ATTNout[S/n_ranks*rank*dmodel], &ATTNout[S/n_ranks*rank*dmodel], &embeddings[S/n_ranks*rank*dmodel], data_type, S/n_ranks, dmodel);
223 |
224 | MPI_Allreduce(ATTNout, embeddings, S*dmodel, mpi_data_type, MPI_SUM, WORLD);
225 |
226 | clock_gettime(CLOCK_MONOTONIC, &end);
227 |
228 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
229 | const uint64_t flop_count = S/n_ranks * (dmodel * (8*dmodel + 4*S + 1) + 8*h*S) + S*dmodel;
230 |
231 | printf("[rank: %d] MHA execution time: %.2f ms flop count per rank: %lu\n", rank, time_elapsed_s * 1000, flop_count);
232 |
233 | if(rank == root) {
234 | free(Qw);
235 | free(Kw);
236 | free(Vw);
237 | }
238 |
239 | free(embeddings);
240 | free(Qw_heads);
241 | free(Kw_heads);
242 | free(Vw_heads);
243 | free(Q);
244 | free(K);
245 | free(V);
246 | free(KQ);
247 | free(softmax_out);
248 | free(QKV);
249 | free(ATTNw);
250 | free(ATTNout);
251 |
252 | MPI_Finalize();
253 |
254 | return 0;
255 | }
256 |
257 | static size_t get_element_size(data_type_e type) {
258 | size_t size;
259 | switch (type) {
260 | case I8:
261 | size = sizeof(uint8_t);
262 | break;
263 | case I16:
264 | size = sizeof(uint16_t);
265 | break;
266 | case I32:
267 | case FP32:
268 | size = sizeof(uint32_t);
269 | break;
270 | case FP64:
271 | size = sizeof(uint64_t);
272 | break;
273 | default:
274 | size = -1;
275 | break;
276 | }
277 |
278 | assert(size > 0 && "data type unknown");
279 |
280 | return size;
281 | }
282 |
283 |
284 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
285 | const data_t range = 10;
286 | #pragma omp parallel for shared (tensor)
287 | for(int i = 0; i < nmemb; i++)
288 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
289 | }
290 |
291 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
292 | assert(tensor);
293 | init_random_tensor_impl(((data_t*)tensor), nmemb);
294 | }
295 |
296 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
297 | int stride_0, int stride_1, int stride_2) {
298 | const int bsize = MIN(32,k);
299 | int ii0, ii1, ii2;
300 | int i0, i1, i2;
301 | int h;
302 | int start_head, stop_head;
303 | data_t pp;
304 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
305 | for (h=0; h < heads; h++) {
306 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
373 |
374 | sum = 0.0;
375 | for(j = 0; j < n; j++) {
376 | const data_t e = expf(src[i*n+j] - max);
377 | sum += e;
378 | dst[i*n+j] = e;
379 | }
380 |
381 | for(j = 0; j < n; j++) {
382 | dst[i*n+j] *= sum;
383 | }
384 | }
385 | }
386 |
387 |
388 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
389 | int stride_0, int stride_1, int stride_2) {
390 | assert(dst);
391 | assert(src1);
392 | assert(src2);
393 | assert(heads*n == stride_0);
394 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
395 | }
396 |
397 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
398 | int stride_0, int stride_1, int stride_2) {
399 | assert(dst);
400 | assert(src1);
401 | assert(src2);
402 | assert(heads*k == stride_1);
403 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
404 | }
405 |
406 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
407 | assert(dst);
408 | assert(src1);
409 | assert(src2);
410 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
411 | }
412 |
413 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
414 | assert(dst);
415 | assert(src1);
416 | assert(src2);
417 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
418 | }
419 |
420 |
421 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
422 | assert(dst);
423 | assert(src);
424 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
425 | }
426 |
--------------------------------------------------------------------------------
/demo/sst/software/mha_OMP.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
13 |
14 | #ifndef TILE_SIZE
15 | #define TILE_SIZE 16
16 | #endif
17 |
18 | //FP32
19 | //#define DATATYPE 0
20 | //FP64
21 | //#define DATATYPE 1
22 | //I8
23 | //#define DATATYPE 2
24 | //I16
25 | //#define DATATYPE 3
26 | //I32
27 | #define DATATYPE 4
28 |
29 | typedef enum {FP32, FP64, I8, I16, I32} data_type_e;
30 |
31 | #if DATATYPE == 0
32 | typedef float data_t;
33 | static data_type_e data_type = FP32;
34 | #define DATA_MIN FLT_MIN
35 | #define TYPE_IS_FP
36 | #elif DATATYPE == 1
37 | typedef double data_t;
38 | static data_type_e data_type = FP64;
39 | #define DATA_MIN DBL_MIN
40 | #define TYPE_IS_FP
41 | #elif DATATYPE == 2
42 | typedef int8_t data_t;
43 | static data_type_e data_type = I8;
44 | #define DATA_MIN CHAR_MIN
45 | #define TYPE_IS_INT
46 | #elif DATATYPE == 3
47 | typedef int16_t data_t;
48 | static data_type_e data_type = I16;
49 | #define DATA_MIN SHRT_MIN
50 | #define TYPE_IS_INT
51 | #elif DATATYPE == 4
52 | typedef int32_t data_t;
53 | static data_type_e data_type = I32;
54 | #define DATA_MIN INT_MIN
55 | #define TYPE_IS_INT
56 | #else
57 | #error Unsupported choice setting
58 | #endif
59 |
60 |
61 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb);
62 |
63 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
64 | int stride_0, int stride_1, int stride_2);
65 |
66 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
67 | int stride_0, int stride_1, int stride_2);
68 |
69 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
70 |
71 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n);
72 |
73 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n);
74 |
75 | int main(int argc, char ** argv) {
76 |
77 | if(argc != 4) {
78 | fprintf(stderr, "Usage: %s MODEL_SIZE SEQUENCE_LENGHT HEAD_COUNT\n", argv[0]);
79 | exit(EXIT_FAILURE);
80 | }
81 |
82 | struct timespec start, end;
83 | double time_elapsed_s;
84 | int dmodel = atoi(argv[1]);
85 | int h = atoi(argv[3]);
86 | int S = atoi(argv[2]);
87 | int dk = dmodel/h;
88 | int dv = dmodel/h;
89 |
90 | fprintf(stdout, "Model n_ranks: %d, Sequence lenght: %d, head count: %d\n", dmodel, S, h);
91 |
92 | data_t *embeddings = NULL;
93 | data_t * Qw = NULL;
94 | data_t * Kw = NULL;
95 | data_t * Vw = NULL;
96 |
97 | data_t * Qw_heads = NULL;
98 | data_t * Kw_heads = NULL;
99 | data_t * Vw_heads = NULL;
100 |
101 | data_t * Q = NULL;
102 | data_t * K = NULL;
103 | data_t * V = NULL;
104 |
105 | data_t * KQ = NULL;
106 | data_t * softmax_out = NULL;
107 |
108 | data_t * QKV = NULL;
109 |
110 | data_t * ATTNw = NULL;
111 | data_t * ATTNout = NULL;
112 |
113 | data_t scale_f = 1.0f/sqrtf(((data_t)dk));
114 |
115 | srand(time(NULL));
116 |
117 | clock_gettime(CLOCK_MONOTONIC, &start);
118 |
119 | embeddings = calloc(dmodel*S, sizeof(data_t));
120 |
121 | ATTNw = calloc(dmodel*dmodel, sizeof(data_t));
122 |
123 | init_random_tensor(embeddings, data_type, dmodel*S);
124 | init_random_tensor(ATTNw, data_type, dmodel*dmodel);
125 | Qw = calloc(dmodel*dmodel, sizeof(data_t));
126 | init_random_tensor(Qw, data_type, dmodel*dmodel);
127 |
128 | Kw = calloc(dmodel*dmodel, sizeof(data_t));
129 | init_random_tensor(Kw, data_type, dmodel*dmodel);
130 |
131 | Vw = calloc(dmodel*dmodel, sizeof(data_t));
132 | init_random_tensor(Vw, data_type, dmodel*dmodel);
133 |
134 | Q = calloc(S*dmodel, sizeof(data_t));
135 | memset(Q, 0, S*dmodel*sizeof(data_t));
136 |
137 | K = calloc(S*dmodel, sizeof(data_t));
138 | memset(K, 0, S*dmodel*sizeof(data_t));
139 |
140 | V = calloc(S*dmodel, sizeof(data_t));
141 | memset(V, 0, S*dmodel*sizeof(data_t));
142 |
143 | KQ = calloc(h*S*S, sizeof(data_t));
144 | memset(KQ, 0, h*S*S*sizeof(data_t));
145 |
146 | softmax_out = calloc(h*S*S, sizeof(data_t));
147 | memset(softmax_out, 0, h*S*S*sizeof(data_t));
148 |
149 | QKV = calloc(S*dmodel, sizeof(data_t));
150 | memset(QKV, 0, S*dmodel*sizeof(data_t));
151 |
152 | ATTNout = calloc(S*dmodel, sizeof(data_t));
153 | memset(ATTNout, 0, S*dmodel*sizeof(data_t));
154 |
155 | clock_gettime(CLOCK_MONOTONIC, &end);
156 |
157 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
158 | printf("Init time: %.2f ms\n", time_elapsed_s * 1000);
159 |
160 | clock_gettime(CLOCK_MONOTONIC, &start);
161 | /* MHA */
162 |
163 | gemm(Q, embeddings, Qw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
164 | gemm(K, embeddings, Kw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
165 | gemm(V, embeddings, Vw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
166 |
167 | gemm_t(KQ, Q, K, data_type, h, S, S, dmodel/h, S, dmodel, dmodel);
168 |
169 | scale(KQ, KQ, ((void*)&scale_f), data_type, h*S, S);
170 |
171 | softmax(softmax_out, KQ, data_type, h*S, S);
172 |
173 |
174 | gemm(QKV, softmax_out, V, data_type, h, S, dmodel/h, S, dmodel, S, dmodel);
175 |
176 | gemm(ATTNout, QKV, ATTNw, data_type, 1, S, dmodel, dmodel, dmodel, dmodel, dmodel);
177 |
178 | add(embeddings, ATTNout, embeddings, data_type, S, dmodel);
179 |
180 |
181 | clock_gettime(CLOCK_MONOTONIC, &end);
182 |
183 | time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
184 | const uint64_t flop_count = S * (dmodel * (8*dmodel + 4*S + 1) + 8*h*S);
185 |
186 | printf("MHA execution time: %.2f ms flop count: %lu\n", time_elapsed_s * 1000, flop_count);
187 |
188 | free(Qw);
189 | free(Kw);
190 | free(Vw);
191 |
192 | free(embeddings);
193 | free(Qw_heads);
194 | free(Kw_heads);
195 | free(Vw_heads);
196 | free(Q);
197 | free(K);
198 | free(V);
199 | free(KQ);
200 | free(softmax_out);
201 | free(QKV);
202 | free(ATTNw);
203 | free(ATTNout);
204 |
205 | return 0;
206 | }
207 |
208 | static size_t get_element_size(data_type_e type) {
209 | size_t size;
210 | switch (type) {
211 | case I8:
212 | size = sizeof(uint8_t);
213 | break;
214 | case I16:
215 | size = sizeof(uint16_t);
216 | break;
217 | case I32:
218 | case FP32:
219 | size = sizeof(uint32_t);
220 | break;
221 | case FP64:
222 | size = sizeof(uint64_t);
223 | break;
224 | default:
225 | size = -1;
226 | break;
227 | }
228 |
229 | assert(size > 0 && "data type unknown");
230 |
231 | return size;
232 | }
233 |
234 |
235 | static void init_random_tensor_impl(data_t * tensor, size_t nmemb) {
236 | const data_t range = 10;
237 | #pragma omp parallel for shared (tensor)
238 | for(int i = 0; i < nmemb; i++)
239 | tensor[i] = ((data_t)rand()/(data_t)(RAND_MAX)) * range;
240 | }
241 |
242 | void init_random_tensor(void * tensor, data_type_e data_type, size_t nmemb) {
243 | assert(tensor);
244 | init_random_tensor_impl(((data_t*)tensor), nmemb);
245 | }
246 |
247 | static void gemm_impl(data_t * dst, const data_t * src1, const data_t * src2, int heads, int m, int n, int k,
248 | int stride_0, int stride_1, int stride_2) {
249 | const int bsize = TILE_SIZE;
250 | int ii0, ii1, ii2;
251 | int i0, i1, i2;
252 | int h;
253 | int start_head, stop_head;
254 | data_t pp;
255 | #pragma omp parallel for shared (dst, src1, src2) private(h,i0,i1,i2,ii0,ii1,ii2,pp) collapse(2)
256 | for (h=0; h < heads; h++) {
257 | for (ii0 = 0; ii0 src[i*n+j]) ? max : src[i*n+j];
324 |
325 | sum = 0.0;
326 | for(j = 0; j < n; j++) {
327 | const data_t e = expf(src[i*n+j] - max);
328 | sum += e;
329 | dst[i*n+j] = e;
330 | }
331 |
332 | for(j = 0; j < n; j++) {
333 | dst[i*n+j] *= sum;
334 | }
335 | }
336 | }
337 |
338 |
339 | void gemm(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
340 | int stride_0, int stride_1, int stride_2) {
341 | assert(dst);
342 | assert(src1);
343 | assert(src2);
344 | assert(heads*n == stride_0);
345 | gemm_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
346 | }
347 |
348 | void gemm_t(void * dst, void * src1, void * src2, data_type_e data_type, int heads, int m, int n, int k,
349 | int stride_0, int stride_1, int stride_2) {
350 | assert(dst);
351 | assert(src1);
352 | assert(src2);
353 | assert(heads*k == stride_1);
354 | gemm_t_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), heads, m, n, k, stride_0, stride_1, stride_2);
355 | }
356 |
357 | void scale(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
358 | assert(dst);
359 | assert(src1);
360 | assert(src2);
361 | scale_impl(((data_t*)dst), ((data_t*)src1), *((data_t*)src2), m, n);
362 | }
363 |
364 | void add(void * dst, void * src1, void * src2, data_type_e data_type, int m, int n) {
365 | assert(dst);
366 | assert(src1);
367 | assert(src2);
368 | add_impl(((data_t*)dst), ((data_t*)src1), ((data_t*)src2), m, n);
369 | }
370 |
371 |
372 | void softmax(void * dst, void * src, data_type_e data_type, int m, int n) {
373 | assert(dst);
374 | assert(src);
375 | softmax_impl(((data_t*)dst), ((data_t*)src), m, n);
376 | }
377 |
--------------------------------------------------------------------------------
/demo/sst/software/riscv64.make:
--------------------------------------------------------------------------------
# Cross-compilation makefile for the riscv64 (musl) SST/Vanadis targets.
# Requires RV64_GNU_INSTALL (gcc toolchain) and MVAPICH2_INSTALL_DIR (mpicc).
ARCH= riscv64
CC=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-gcc
MPICC=$(MVAPICH2_INSTALL_DIR)/bin/mpicc
CFLAGS=-O3 -fopenmp
LDFLAGS=-static -lm

# One mha_OMP binary is built per tile size.
TILE_SIZES=8 16 32 64
MHA_OMP_BINS=$(addprefix $(ARCH)/mha_OMP_,$(TILE_SIZES))

.PHONY: all clean
all : $(MHA_OMP_BINS) $(ARCH)/mha_MPI_OMP $(ARCH)/hello_MPI_OMP

# Pattern rule: mha_OMP_<N> is mha_OMP.c compiled with -DTILE_SIZE=<N>.
$(ARCH)/mha_OMP_% : mha_OMP.c
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -DTILE_SIZE=$* -o $@ $^ $(LDFLAGS)

$(ARCH)/mha_MPI_OMP : mha_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

$(ARCH)/hello_MPI_OMP : hello_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(ARCH)
38 |
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/gemm_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/gemm_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/hello_MPI:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/hello_MPI
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/hello_MPI_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/hello_MPI_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_MPI_OMP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_MPI_OMP
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_16:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_16
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_32:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_32
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_64
--------------------------------------------------------------------------------
/demo/sst/software/riscv64/mha_OMP_8:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/demo/sst/software/riscv64/mha_OMP_8
--------------------------------------------------------------------------------
/demo/sst/software/x86.make:
--------------------------------------------------------------------------------
# Native x86 build configuration for the tutorial software.
ARCH=x86
CFLAGS=-O3 -fopenmp
LDFLAGS=-lm

# The host compiler and the MPI compiler wrapper must be supplied by the
# caller, e.g.:  make -f x86.make CC=gcc MPICC=mpicc
# NOTE(review): make predefines CC (default "cc"), so this ifndef guard may
# never trigger in practice — confirm whether it should use `origin`.
ifndef CC
$(error CC is not set)
endif

ifndef MPICC
$(error MPICC is not set)
endif

.PHONY: all clean
all : $(ARCH)/mha_OMP $(ARCH)/mha_MPI_OMP $(ARCH)/check_mpi $(ARCH)/hello_MPI_OMP

# OpenMP-only multi-head-attention benchmark.
$(ARCH)/mha_OMP : mha_OMP.c
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# Hybrid MPI+OpenMP multi-head-attention benchmark.
$(ARCH)/mha_MPI_OMP : mha_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# Presumably a sanity check for the MPI installation — see check_mpi.c.
$(ARCH)/check_mpi : check_mpi.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

$(ARCH)/hello_MPI_OMP : hello_MPI_OMP.c
	@mkdir -p $(@D)
	$(MPICC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(ARCH)
34 |
--------------------------------------------------------------------------------
/docs/512px-LOGO-IMEC_black.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/512px-LOGO-IMEC_black.svg.png
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
## This source code is licensed under the MIT license found in the
## LICENSE file in the root directory of this source tree.
##
## Copyright (c) 2025 IMEC. All rights reserved.
## ******************************************************************************
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'Scalable System Simulations Tutorial'
copyright = '2024-2025, imec vzw'
author = 'imec vzw'
release = '0.1'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# sphinx_rtd_theme supplies the "Read the Docs" HTML theme;
# sphinxcontrib.bibtex enables BibTeX citations from references.bib.
extensions = ['sphinx_rtd_theme', 'sphinxcontrib.bibtex']

templates_path = ['_templates']
exclude_patterns = []
# Bibliography database consumed by sphinxcontrib-bibtex.
bibtex_bibfiles = ['references.bib']



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

# html_theme = 'alabaster'
html_theme = "sphinx_rtd_theme"
html_static_path = ['_static']
html_logo = "512px-LOGO-IMEC_black.svg.png"
html_theme_options = {
    'logo_only': False,
    # NOTE(review): 'display_version' was removed in sphinx_rtd_theme 3.x and
    # requirements.txt pins 3.0.2, so this option is likely ignored (and may
    # emit an "unsupported theme option" warning) — confirm against the
    # theme's changelog.
    'display_version': False,
}
# Extra stylesheet served from _static (e.g. sizing for inline SVG figures).
html_css_files = ['custom_svg.css']
# Number figures/tables/code blocks so :numref: references work.
numfig = True
43 |
--------------------------------------------------------------------------------
/docs/gem5.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../demo/gem5/README.rst
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. vlsid_2025 documentation master file, created by
2 | sphinx-quickstart on Fri Dec 6 11:47:00 2024.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | A tutorial on scalable system simulations for RISC-V architectures and performance analysis for machine learning workloads
7 | ##########################################################################################################################
8 | This tutorial aims to provide a comprehensive introduction to computer system simulations
9 | and performance analysis, focusing on the integration and application of consensual open-source frameworks
10 | like Structural Simulation Toolkit (SST), Gem5, and Multi-Level Intermediate Representation (MLIR).
11 | Participants will gain hands-on experience in conducting an architectural design exploration with a pragmatic approach
12 | where the simulation framework used is chosen based on a trade-off between fidelity and scalability requirements. By the end of the tutorial, participants will be equipped with the skills necessary to conduct in-depth performance analysis and optimize complex systems using state-of-the-art tools.
13 |
14 |
15 |
16 | .. toctree::
17 | :maxdepth: 3
18 | :caption: Contents:
19 |
20 |
21 | gem5
22 | sst
23 | About us - CSA, imec
24 |
25 |
--------------------------------------------------------------------------------
/docs/references.bib:
--------------------------------------------------------------------------------
1 | @article{DBLP:journals/corr/VaswaniSPUJGKP17,
2 | author = {Ashish Vaswani and
3 | Noam Shazeer and
4 | Niki Parmar and
5 | Jakob Uszkoreit and
6 | Llion Jones and
7 | Aidan N. Gomez and
8 | Lukasz Kaiser and
9 | Illia Polosukhin},
10 | title = {Attention Is All You Need},
11 | journal = {CoRR},
12 | volume = {abs/1706.03762},
13 | year = {2017},
14 | url = {http://arxiv.org/abs/1706.03762},
15 | eprinttype = {arXiv},
16 | eprint = {1706.03762},
17 | timestamp = {Sat, 23 Jan 2021 01:20:40 +0100},
18 | biburl = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
19 | bibsource = {dblp computer science bibliography, https://dblp.org}
20 | }
21 |
--------------------------------------------------------------------------------
/docs/slides/2025_05_ISPASS_Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/slides/2025_05_ISPASS_Presentation.pdf
--------------------------------------------------------------------------------
/docs/slides/VLSID25_Tutorial_Slides_imec_CSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/docs/slides/VLSID25_Tutorial_Slides_imec_CSA.pdf
--------------------------------------------------------------------------------
/docs/sst.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../external/INSTALL.rst
2 | .. include:: ../demo/sst/README.rst
3 |
--------------------------------------------------------------------------------
/external/.gitignore:
--------------------------------------------------------------------------------
1 | mvapich2-2.3.7-1
2 |
--------------------------------------------------------------------------------
/external/INSTALL.rst:
--------------------------------------------------------------------------------
1 | .. _Installation instructions:
2 |
3 | Installation instructions for scale-out system simulation
4 | ==========================================================
5 |
6 | To run the demo on your side you need to install at least SST.
7 | The rv64 binaries are already compiled. However, if you want to compile new applications you must install the mpi compiler as described below.
8 |
9 | Run the following command to download the required sub-modules:
10 |
11 | .. code:: bash
12 |
   git submodule update --init
14 |
15 | Install instructions for SST
16 | ----------------------------
17 |
You must first install **SST-core**. To do this, run the following commands in a terminal:
19 |
20 | .. code:: bash
21 |
22 | cd sst/sst-core
23 | export SST_CORE_HOME=$(pwd)/install
24 | ./autogen.sh
25 | mkdir build
26 | cd build
27 | ../configure --prefix=$SST_CORE_HOME
28 | make -j all
29 | make install
30 | export PATH=$SST_CORE_HOME/bin:$PATH
31 | cd ../../../
32 |
33 | Then, you can install **SST-elements** as follow:
34 |
35 | .. code:: bash
36 |
37 | cd sst/sst-elements
38 | git apply ../../demo/sst/docker/sst-elements.patch
39 | export SST_ELEMENTS_HOME=$(pwd)/install
40 | ./autogen.sh
41 | mkdir build
42 | cd build
43 | ../configure --prefix=$SST_ELEMENTS_HOME --with-sst-core=$SST_CORE_HOME
44 | make -j all
45 | make install
46 | cd ../../../
47 |
48 | Install instructions for rv64 mpi compiler
49 | ------------------------------------------
50 | The first step is to install **riscv64-unknown-linux-musl-gcc**. To do this, run the following commands in a terminal:
51 |
52 | .. code:: bash
53 |
54 | cd riscv-gnu-toolchain
55 | export RV64_GNU_INSTALL=$(pwd)/install
56 | CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC" ./configure --prefix=$RV64_GNU_INSTALL --disable-multilib --with-languages=c,c++
57 | make -j8 musl
58 |
59 | Then, you must build the RDMA library
60 |
61 | .. code:: bash
62 |
63 | cd sst/libRDMA
64 | make
65 |
66 | Finally, you can build and install **mpicc** as follow:
67 |
68 | .. code:: bash
69 |
70 | export RDMA_NIC_DIR=$(realpath ./sst/sst-elements/src/sst/elements/rdmaNic)
71 | export RDMA_LIB_DIR=$(realpath ./sst/libRDMA/riscv64/)
72 |
73 | tar xzvf mvapich2-2.3.7-1.tar.gz
74 | ulimit -n 4096
75 | patch --directory=mvapich2-2.3.7-1/ -p1 < mvapich2-2.3.7-1.patch
76 |
77 | cd mvapich2-2.3.7-1/
78 | ./autogen.sh
79 |
80 | mkdir install
81 | mkdir build
82 |
83 | export MVAPICH2_INSTALL_DIR=$(pwd)/install
84 |
85 | cd build
86 |
87 | ../configure \
88 | --prefix=${MVAPICH2_INSTALL_DIR} \
89 | --enable-fortran=no \
90 | --with-device=ch3:rdma \
91 | --enable-romio=no \
92 | --enable-hybrid=no \
93 | --enable-shared=no \
94 | --enable-static=yes \
95 | --with-pmi=vanadis \
96 | --with-pm=none \
97 | --enable-threads=single \
98 | --enable-rsh=yes \
99 | --host=riscv64-unknown-linux-musl \
100 | CC=${RV64_GNU_INSTALL}/bin/riscv64-unknown-linux-musl-gcc \
101 | CFLAGS="-I${RDMA_NIC_DIR}/tests/app/rdma/include -I${RDMA_NIC_DIR} -fPIC" \
102 | CXX=${RV64_GNU_INSTALL}/bin/riscv64-unknown-linux-musl-g++ \
103 | CXXFLAGS="-I${RDMA_NIC_DIR}/tests/app/rdma/include -I${RDMA_NIC_DIR} -fPIC" \
104 | LDFLAGS="-L${RDMA_LIB_DIR}" \
105 | LIBS=-lrdma
106 |
107 | make -j8 install
108 |
--------------------------------------------------------------------------------
/external/mvapich2-2.3.7-1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CSA-infra/RISCV-Scalable-Simulation-tutorial/3a502b242583c8c75ac1fa72ba79fae79a708d1f/external/mvapich2-2.3.7-1.tar.gz
--------------------------------------------------------------------------------
/external/sst/libRDMA/Makefile:
--------------------------------------------------------------------------------
# Build librdma.a (static RDMA helper library) for riscv64 against the
# rdmaNic headers shipped with sst-elements.
ARCH = riscv64
ADDR_TYPE ?= uint64_t
PRIxBITS ?= PRIx64
PRIuBITS ?= PRIu64

AR=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-ar
CC=$(RV64_GNU_INSTALL)/bin/riscv64-unknown-linux-musl-gcc

RDMADIR=$(abspath ../sst-elements/src/sst/elements/rdmaNic/)
APPDIR=$(RDMADIR)/tests/app/rdma/

CFLAGS=-I$(APPDIR)/include -I$(RDMADIR) -Wattributes -Wall -DADDR_TYPE=$(ADDR_TYPE) -DPRIxBITS=$(PRIxBITS) -DPRIuBITS=$(PRIuBITS)
# (The unused LIBS=-lrdma -Lriscv64 variable was removed: nothing in this
# makefile links an executable.)

OBJS=riscv64/base.o riscv64/rdma.o


.PHONY: all clean librdma.a

all: riscv64/librdma.a

# Backward-compatible alias so `make librdma.a` keeps working.
librdma.a: riscv64/librdma.a

# BUG FIX: the archive is written to riscv64/librdma.a, so the target must be
# that path. The old rule was named plain `librdma.a`, which never matched the
# file it produced and therefore re-ran the archiver on every invocation.
riscv64/librdma.a: ${OBJS}
	$(AR) rcs $@ $^

riscv64/rdma.o: $(APPDIR)/src/rdma.c $(APPDIR)/include/rdma.h $(RDMADIR)/rdmaNicHostInterface.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

riscv64/base.o: $(APPDIR)/src/base.c $(APPDIR)/include/base.h $(RDMADIR)/rdmaNicHostInterface.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -rf riscv64
34 |
35 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==8.1.3
2 | sphinx_rtd_theme==3.0.2
3 | sphinxcontrib.bibtex==2.6.3
4 |
--------------------------------------------------------------------------------