├── .gitignore ├── artifacts ├── tiny_gpt_speedups.png ├── attention_speedups.png ├── benchmark_speedups.png ├── attention_components.csv ├── tiny_gpt_components.csv └── benchmark_results.csv ├── assets └── intel_isa_families.jpeg ├── src ├── 03_Examples │ ├── 03_data_types │ │ ├── Makefile │ │ └── main.cpp │ ├── 01_conditional_code │ │ ├── Makefile │ │ └── main.cpp │ ├── 02_quadratic_equations │ │ ├── Makefile │ │ └── main.cpp │ ├── 04_image_processing │ │ ├── Makefile │ │ └── main.cpp │ ├── 05_mha_block │ │ ├── Makefile │ │ └── main.cpp │ └── 06_tiny_gpt │ │ ├── Makefile │ │ └── main.cpp ├── 01_Basics │ ├── 01_importing_simd │ │ ├── Makefile │ │ └── main.cpp │ ├── 04_loading_data │ │ ├── Makefile │ │ └── main.cpp │ ├── 02_initializing_data │ │ ├── Makefile │ │ └── main.cpp │ └── 03_binding_with_unions │ │ ├── Makefile │ │ └── main.cpp ├── 02_Computations │ ├── 01_simple_maths │ │ ├── Makefile │ │ └── main.cpp │ └── 02_dot_product │ │ ├── Makefile │ │ └── main.cpp └── include │ └── simd_utils.h ├── LICENSE ├── README.md ├── runme.sh └── scripts └── plot_results.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode -------------------------------------------------------------------------------- /artifacts/tiny_gpt_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/tiny_gpt_speedups.png -------------------------------------------------------------------------------- /assets/intel_isa_families.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/assets/intel_isa_families.jpeg -------------------------------------------------------------------------------- /artifacts/attention_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/attention_speedups.png -------------------------------------------------------------------------------- /artifacts/benchmark_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/benchmark_speedups.png -------------------------------------------------------------------------------- /src/03_Examples/03_data_types/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) -------------------------------------------------------------------------------- /src/01_Basics/01_importing_simd/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | 
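Note: every example Makefile (this one and the ones that follow) builds with -mavx2, so the resulting simd_program executes AVX2 instructions unconditionally and will crash with SIGILL on a CPU without AVX2. The repository does not ship such a guard; a minimal, hypothetical pre-flight check using GCC/Clang's __builtin_cpu_supports could look like the sketch below.

#include <cstdio>
#include <cstdlib>

int main() {
    // GCC/Clang builtin: non-zero when the running CPU reports AVX2 support.
    if (!__builtin_cpu_supports("avx2")) {
        std::fprintf(stderr, "Built with -mavx2, but this CPU lacks AVX2; aborting.\n");
        return EXIT_FAILURE;
    }
    std::puts("AVX2 available: safe to run the SIMD examples.");
    return EXIT_SUCCESS;
}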
-------------------------------------------------------------------------------- /src/01_Basics/04_loading_data/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/01_Basics/02_initializing_data/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/01_Basics/03_binding_with_unions/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/01_conditional_code/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/02_Computations/01_simple_maths/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/02_Computations/02_dot_product/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/02_quadratic_equations/Makefile: 
-------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/04_image_processing/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 $(EXTRA_FLAGS) 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/05_mha_block/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++17 3 | LDFLAGS=-lstdc++fs 4 | TARGET=simd_program 5 | ASMFILE=main.s 6 | SRCFILE=main.cpp 7 | 8 | all: $(TARGET) 9 | 10 | $(TARGET): $(SRCFILE) 11 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) $(LDFLAGS) 12 | 13 | asm: $(SRCFILE) 14 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 15 | 16 | clean: 17 | rm -f $(TARGET) $(ASMFILE) 18 | -------------------------------------------------------------------------------- /src/03_Examples/06_tiny_gpt/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++17 3 | LDFLAGS=-lstdc++fs 4 | TARGET=simd_program 5 | ASMFILE=main.s 6 | SRCFILE=main.cpp 7 | 8 | all: $(TARGET) 9 | 10 | $(TARGET): $(SRCFILE) 11 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) $(LDFLAGS) 12 | 13 | asm: $(SRCFILE) 14 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 15 | 16 | clean: 17 | rm -f $(TARGET) $(ASMFILE) 18 | -------------------------------------------------------------------------------- /artifacts/attention_components.csv: -------------------------------------------------------------------------------- 1 | component,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct 2 | rmsnorm,2,18.9,4,4.725,14.9,1.06634 3 | qkv_projections,1,733.2,233.7,3.13736,499.5,35.7475 4 | attention_scores,1,32.6,16.4,1.9878,16.2,1.15938 5 | context_projection,1,36.2,25.4,1.4252,10.8,0.772919 6 | output_projection,1,245.7,80.8,3.04084,164.9,11.8013 7 | ffn_expand,1,487.4,158.2,3.08091,329.2,23.5597 8 | activation,1,14.9,1.6,9.3125,13.3,0.951836 9 | ffn_contract,1,484.3,138.4,3.49928,345.9,24.7549 10 | others,1,111.8,109.2,1.02381,2.6,0.186073 11 | overall,1,2165,767.7,2.82011,1397.3,100 12 | -------------------------------------------------------------------------------- /artifacts/tiny_gpt_components.csv: -------------------------------------------------------------------------------- 1 | stage,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct 2 | embedding,1,0.1,0.1,1,0,0 3 | rmsnorm_1,61,557.25,122.3,4.55642,434.95,0.663332 4 | qkv_linear,61,44915.1,22530.9,1.99349,22384.2,34.1376 5 | attention_scores,244,1989.7,992.15,2.00544,997.55,1.52134 6 | attention_softmax,244,749.55,744.65,1.00658,4.9,0.00747288 7 | 
attention_context,244,2234.05,1495.8,1.49355,738.25,1.12589 8 | attention_projection,61,14978.7,7483.55,2.00154,7495.1,11.4306 9 | residual_1,61,249.3,61.3,4.06688,188,0.286715 10 | rmsnorm_2,61,557.45,122.7,4.54319,434.75,0.663027 11 | ffn_expand,61,29886.5,14959.3,1.99785,14927.1,22.7651 12 | activation,61,1128.6,63.95,17.6482,1064.65,1.62367 13 | ffn_contract,61,29661.8,12946.5,2.29111,16715.3,25.4921 14 | residual_2,61,247.8,61.6,4.02273,186.2,0.283969 15 | logits_projection,1,30,15.05,1.99336,14.95,0.0227999 16 | sampling,1,5,5,1,0,0 17 | overall,61,129142,63571.6,2.03144,65570.4,100 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Yuning Xia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /artifacts/benchmark_results.csv: -------------------------------------------------------------------------------- 1 | suite,label,iterations,scalar_us,simd_us,speedup 2 | "01_Basics/04_loading_data","Load Operations",10,4504,5414,0.831917 3 | "01_Basics/04_loading_data","Store Operations",10,6309,7212,0.874792 4 | "02_Computations/01_simple_maths","Addition",1000,64,18,3.55556 5 | "02_Computations/01_simple_maths","Subtraction",1000,64,10,6.4 6 | "02_Computations/01_simple_maths","Multiplication",1000,64,10,6.4 7 | "02_Computations/01_simple_maths","Division",1000,64,10,6.4 8 | "02_Computations/01_simple_maths","Fused Multiply-Add",1000,64,14,4.57143 9 | "02_Computations/01_simple_maths","Square Root",1000,127,9,14.1111 10 | "02_Computations/01_simple_maths","Minimum",1000,123,10,12.3 11 | "02_Computations/02_dot_product","Dot Product (1024 vectors)",50,1757,514,3.41829 12 | "02_Computations/02_dot_product","AoS vs SoA",50,12837,513,25.0234 13 | "02_Computations/02_dot_product","Single Dot Product (1000 iterations)",10,134,400,0.335 14 | "03_Examples/01_conditional_code","Clamping",200,39,4,9.75 15 | "03_Examples/01_conditional_code","Filtering",200,18,3,6 16 | "03_Examples/01_conditional_code","Complex Filtering",200,22,5,4.4 17 | "03_Examples/02_quadratic_equations","Quadratic Equation Solver",50,12,3,4 18 | "03_Examples/04_image_processing","Brightness Adjustment",10,129963,3211,40.4743 19 | "03_Examples/04_image_processing","Contrast Enhancement",10,159323,41794,3.8121 20 | "03_Examples/04_image_processing","Grayscale Conversion",10,43116,31358,1.37496 21 | "03_Examples/05_mha_block","attention_block",50,107658,37903,2.84036 22 | "03_Examples/06_tiny_gpt","tiny_gpt_decode",50,6441320,3174565,2.02904 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Intel Logo](https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/Intel_logo_%282006-2020%29.svg/200px-Intel_logo_%282006-2020%29.svg.png) 2 | 3 | # Hands-on SIMD Programming with C++ 4 | 5 | From “what is SIMD?” to “how do I speed up transformer layers?”—this repository walks through reproducible AVX2 microbenchmarks, tuning tricks, and a quantised decoder block. 6 | 7 | ![Intel ISA Families and Features](./assets/intel_isa_families.jpeg) 8 | ![SIMD Speedups](artifacts/benchmark_speedups.png) 9 | ![Attention Breakdown](artifacts/attention_speedups.png) 10 | ![Tiny GPT Breakdown](artifacts/tiny_gpt_speedups.png) 11 | 12 | ## Quick Start 13 | 14 | ```bash 15 | ./runme.sh 16 | # optional: tweak CSVs then regenerate the figures 17 | python scripts/plot_results.py 18 | ``` 19 | 20 | `runme.sh` rebuilds every sample, refreshes `artifacts/*.csv`, and redraws all figures (kept under [`artifacts/`](artifacts) so the root stays clean). 
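Each sample follows the same pattern: a scalar kernel and an AVX2 kernel are handed to `benchmark_comparison()` from [`src/include/simd_utils.h`](src/include/simd_utils.h), which warms up, times both, prints the speedup, and appends a row to the benchmark CSV. A minimal sketch of that pattern (illustrative only — the suite name and values below are not one of the shipped examples):

```cpp
#include "../../include/simd_utils.h"  // float8 union, aligned_alloc, benchmark helpers

int main() {
    set_benchmark_suite("99_Sketches/readme_example");  // hypothetical suite label

    alignas(32) float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    alignas(32) float y[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    alignas(32) float out[8];

    auto scalar_add = [&]() {                       // one float per loop iteration
        for (int i = 0; i < 8; ++i) out[i] = x[i] + y[i];
    };
    auto simd_add = [&]() {                         // eight floats per instruction
        __m256 a = _mm256_load_ps(x);
        __m256 b = _mm256_load_ps(y);
        _mm256_store_ps(out, _mm256_add_ps(a, b));
    };

    // Times both kernels and appends a CSV row when SIMD_BENCHMARK_CSV is set
    // (runme.sh exports it automatically).
    benchmark_comparison("Vector Addition (sketch)", scalar_add, simd_add, 1000);
    return 0;
}
```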
21 | 22 | ## Highlights by Module 23 | 24 | | Module | Highlights | Use Cases / Benchmarks | 25 | | --- | --- | --- | 26 | | **01_Basics** | Loads, alignment, data initialisation, intrinsics setup | `01_importing_simd`, `04_loading_data` | 27 | | **02_Computations** | Vector arithmetic, FMA, AoS→SoA dot products | `01_simple_maths`, `02_dot_product` | 28 | | **03_Examples** | Conditional masks, quadratic solver, image ops, quantised attention, 61-block decoder | `01_conditional_code`, `04_image_processing`, `05_mha_block`, `06_tiny_gpt` | 29 | 30 | Every example ships with scalar **vs.** SIMD implementations and an embedded benchmark so you can quantify the payoff. 31 | 32 | ## Reading the Figures 33 | 34 | 1. **SIMD Speedups** – six canonical kernels showing alignment, arithmetic, SoA wins, mask-driven control flow, equation solving, and image transforms (speedups from 0.8× to 40×). 35 | 2. **Attention Breakdown** – RMSNorm + MHA + FFN block with component speedups, end-to-end latency, and contribution share (≈2.8× faster overall). 36 | 3. **Tiny GPT Breakdown** – 61-block decoder with int8 weight stores and SIMD dequantisation; the 2× end-to-end gain is unpacked by stage, absolute savings, and contribution percentages. 37 | 38 | ## Key Takeaways 39 | 40 | - Memory layout matters: we transpose matrices and lean on SoA buffers so AVX2 loads stay contiguous. 41 | - Quantised linear layers use per-channel scales plus `_mm256_cvtepi16_epi32` / `_mm256_fmadd_ps` to recover float outputs without leaving vector code. 42 | - Accuracy is always checked—SIMD activations are compared against scalar references, and quantised logits agree on the predicted token. 43 | - Automation keeps results fresh: rerunning `runme.sh` recompiles, re-benchmarks, and redraws the conference-style plots. 44 | - The tiny GPT demo stacks 61 decoder blocks, so the CSV/plot counts capture how repeated kernels dominate end-to-end latency. 45 | 46 | ## License 47 | 48 | MIT 49 | -------------------------------------------------------------------------------- /runme.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -uo pipefail 3 | 4 | ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | ARTIFACT_DIR="${ROOT_DIR}/artifacts" 6 | mkdir -p "${ARTIFACT_DIR}" 7 | 8 | if [[ -z "${SIMD_BENCHMARK_CSV:-}" ]]; then 9 | SIMD_BENCHMARK_CSV="${ARTIFACT_DIR}/benchmark_results.csv" 10 | else 11 | csv_path="${SIMD_BENCHMARK_CSV}" 12 | if [[ "${csv_path}" != /* ]]; then 13 | csv_path="${ROOT_DIR}/${csv_path}" 14 | fi 15 | SIMD_BENCHMARK_CSV="${csv_path}" 16 | fi 17 | rm -f "${SIMD_BENCHMARK_CSV}" 18 | export SIMD_BENCHMARK_CSV 19 | 20 | mapfile -t examples < <(cd "$ROOT_DIR" && find src -type f -name 'main.cpp' -printf '%h\n' | sort) 21 | 22 | if (( ${#examples[@]} == 0 )); then 23 | echo "No example directories with main.cpp found." >&2 24 | exit 1 25 | fi 26 | 27 | failures=() 28 | 29 | for example in "${examples[@]}"; do 30 | echo 31 | echo "=== Building and running ${example} ===" 32 | pushd "$ROOT_DIR/$example" > /dev/null 33 | 34 | if [ ! -f Makefile ]; then 35 | echo "Skipping ${example}: Makefile not found." >&2 36 | failures+=("${example}: missing Makefile") 37 | popd > /dev/null 38 | continue 39 | fi 40 | 41 | if ! make clean >/dev/null 2>&1; then 42 | echo "make clean failed for ${example}" >&2 43 | failures+=("${example}: make clean failed") 44 | popd > /dev/null 45 | continue 46 | fi 47 | 48 | if ! 
make; then 49 | echo "make failed for ${example}" >&2 50 | failures+=("${example}: make failed") 51 | popd > /dev/null 52 | continue 53 | fi 54 | 55 | if [ ! -x ./simd_program ]; then 56 | echo "Executable simd_program not produced in ${example}" >&2 57 | failures+=("${example}: missing simd_program") 58 | popd > /dev/null 59 | continue 60 | fi 61 | 62 | if ! ./simd_program; then 63 | echo "Execution failed for ${example}" >&2 64 | failures+=("${example}: execution failed") 65 | popd > /dev/null 66 | continue 67 | fi 68 | 69 | popd > /dev/null 70 | echo "--- Completed ${example} ---" 71 | 72 | if [[ -n "${KEEP_BUILD_ARTIFACTS:-}" ]]; then 73 | continue 74 | fi 75 | 76 | pushd "$ROOT_DIR/$example" > /dev/null 77 | make clean >/dev/null 2>&1 || true 78 | popd > /dev/null 79 | echo "Cleaned ${example} artifacts." 80 | 81 | done 82 | 83 | if (( ${#failures[@]} )); then 84 | echo 85 | echo "Failures detected:" >&2 86 | for entry in "${failures[@]}"; do 87 | echo " - ${entry}" >&2 88 | done 89 | exit 1 90 | fi 91 | 92 | echo 93 | echo "All SIMD examples built and ran successfully." 94 | if [[ -n "${SIMD_BENCHMARK_CSV:-}" && -f "${SIMD_BENCHMARK_CSV}" ]]; then 95 | echo "Benchmark CSV saved to ${SIMD_BENCHMARK_CSV}" 96 | fi 97 | 98 | echo 99 | echo "Generating plots via Python scripts..." 100 | python3 "${ROOT_DIR}/scripts/plot_results.py" \ 101 | --benchmarks-csv "${SIMD_BENCHMARK_CSV}" \ 102 | --benchmarks-output "${ARTIFACT_DIR}/benchmark_speedups.png" \ 103 | --attention-csv "${ARTIFACT_DIR}/attention_components.csv" \ 104 | --attention-output "${ARTIFACT_DIR}/attention_speedups.png" \ 105 | --tiny-gpt-csv "${ARTIFACT_DIR}/tiny_gpt_components.csv" \ 106 | --tiny-gpt-output "${ARTIFACT_DIR}/tiny_gpt_speedups.png" 107 | echo "Plots saved to ${ARTIFACT_DIR}" 108 | -------------------------------------------------------------------------------- /src/01_Basics/01_importing_simd/main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * 01_Basics/01_importing_simd - Introduction to SIMD headers and basic operations 3 | * 4 | * This example demonstrates: 5 | * 1. How to include SIMD headers in your C/C++ programs 6 | * 2. The hierarchy of SIMD instruction sets 7 | * 3. Basic SIMD vector operations 8 | */ 9 | 10 | // SIMD operations can be included in C/C++ programs via specific header files. 11 | // Below is a hierarchy of headers provided by Intel, grouped by the instruction sets they implement. 12 | 13 | #include "../../include/simd_utils.h" // Our utility header that includes 14 | 15 | // If you're not using our utility header, you would typically include: 16 | // #include // The all-encompassing header for Intel SIMD: AVX, AVX2, FMA, AVX-512, etc. 17 | 18 | /** 19 | * SIMD Instruction Set Hierarchy: 20 | * 21 | * 1. MMX (MultiMedia eXtensions) - 64-bit operations on integers 22 | * - Header: 23 | * - Introduced in 1997 with Intel Pentium MMX 24 | * 25 | * 2. SSE (Streaming SIMD Extensions) - 128-bit operations on 4 floats 26 | * - Header: 27 | * - Introduced in 1999 with Intel Pentium III 28 | * 29 | * 3. SSE2 - Added support for integers and doubles in 128-bit registers 30 | * - Header: 31 | * - Introduced in 2001 with Intel Pentium 4 32 | * 33 | * 4. SSE3 - Added horizontal operations and better handling of unaligned data 34 | * - Header: 35 | * - Introduced in 2004 with Intel Pentium 4 (Prescott) 36 | * 37 | * 5. 
SSSE3 (Supplemental SSE3) - Added more integer instructions 38 | * - Header: 39 | * - Introduced in 2006 with Intel Core 2 40 | * 41 | * 6. SSE4.1 and SSE4.2 - Added dot product, string processing, etc. 42 | * - Headers: and 43 | * - Introduced in 2007-2008 with Intel Core i7 44 | * 45 | * 7. AVX (Advanced Vector Extensions) - 256-bit operations (8 floats) 46 | * - Header: 47 | * - Introduced in 2011 with Intel Sandy Bridge 48 | * 49 | * 8. AVX2 - Added 256-bit integer operations and more instructions 50 | * - Header: 51 | * - Introduced in 2013 with Intel Haswell 52 | * 53 | * 9. AVX-512 - 512-bit operations (16 floats) 54 | * - Header: 55 | * - Introduced in 2016 with Intel Xeon Phi 56 | */ 57 | 58 | // Generally, "immintrin.h" is sufficient for most modern SIMD operations as it includes all the above. 59 | 60 | #include 61 | 62 | int main() { 63 | set_benchmark_suite("01_Basics/01_importing_simd"); 64 | 65 | std::cout << "=== SIMD Header Introduction ===" << std::endl; 66 | std::cout << "This example demonstrates basic SIMD vector operations." << std::endl; 67 | std::cout << std::endl; 68 | 69 | // Example 1: Basic vector addition with AVX2 70 | std::cout << "Example 1: Vector Addition" << std::endl; 71 | 72 | // Initialize two SIMD vectors with 8 float values each 73 | __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); 74 | __m256 b = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 75 | 76 | // Add the vectors element-wise 77 | __m256 c = _mm256_add_ps(a, b); 78 | 79 | // Print the vectors using our utility function 80 | print_m256(a, "Vector A"); 81 | print_m256(b, "Vector B"); 82 | print_m256(c, "A + B"); 83 | 84 | // Example 2: Storing SIMD results back to memory 85 | std::cout << std::endl; 86 | std::cout << "Example 2: Storing SIMD Results" << std::endl; 87 | 88 | // Allocate aligned memory for results 89 | float* result = aligned_alloc(8); 90 | 91 | // Store the SIMD vector to memory 92 | _mm256_store_ps(result, c); 93 | 94 | // Print the results from memory 95 | std::cout << "Result array: ["; 96 | for (int i = 0; i < 7; i++) { 97 | std::cout << result[i] << ", "; 98 | } 99 | std::cout << result[7] << "]" << std::endl; 100 | 101 | // Example 3: Different data types 102 | std::cout << std::endl; 103 | std::cout << "Example 3: Different Data Types" << std::endl; 104 | 105 | // Integer SIMD operations 106 | __m256i int_a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); 107 | __m256i int_b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 108 | __m256i int_sum = _mm256_add_epi32(int_a, int_b); 109 | 110 | print_m256i(int_a, "Integer Vector A"); 111 | print_m256i(int_b, "Integer Vector B"); 112 | print_m256i(int_sum, "A + B (Integer)"); 113 | 114 | // Double precision SIMD operations (4 doubles in a 256-bit register) 115 | __m256d double_a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); 116 | __m256d double_b = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 117 | __m256d double_sum = _mm256_add_pd(double_a, double_b); 118 | 119 | print_m256d(double_a, "Double Vector A"); 120 | print_m256d(double_b, "Double Vector B"); 121 | print_m256d(double_sum, "A + B (Double)"); 122 | 123 | // Clean up 124 | free(result); 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /src/03_Examples/02_quadratic_equations/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates solving multiple 
quadratic equations in parallel using SIMD. 7 | * 8 | * For each quadratic equation ax² + bx + c = 0, we compute the discriminant b² - 4ac 9 | * and then calculate the solution using the quadratic formula: x = (-b ± √(b² - 4ac)) / 2a 10 | * 11 | * We'll solve 8 different quadratic equations simultaneously using AVX2 instructions. 12 | */ 13 | 14 | int main() { 15 | set_benchmark_suite("03_Examples/02_quadratic_equations"); 16 | 17 | std::cout << "=== Solving Quadratic Equations with SIMD ===" << std::endl; 18 | std::cout << "This example solves 8 quadratic equations in parallel." << std::endl; 19 | std::cout << "For each equation ax² + bx + c = 0, we find the smaller root." << std::endl; 20 | std::cout << std::endl; 21 | 22 | // Allocate aligned memory for coefficients 23 | float* a = aligned_alloc(8); 24 | float* b = aligned_alloc(8); 25 | float* c = aligned_alloc(8); 26 | 27 | // Initialize coefficients for 8 different quadratic equations 28 | // Equation 1: 5x² + 3x - 1 = 0 29 | a[0] = 5.0f; b[0] = 3.0f; c[0] = -1.0f; 30 | 31 | // Equation 2: 12x² + 1x - 5 = 0 32 | a[1] = 12.0f; b[1] = 1.0f; c[1] = -5.0f; 33 | 34 | // Equation 3: 6x² + 4x - 6 = 0 35 | a[2] = 6.0f; b[2] = 4.0f; c[2] = -6.0f; 36 | 37 | // Equation 4: 7x² - 2x - 6 = 0 38 | a[3] = 7.0f; b[3] = -2.0f; c[3] = -6.0f; 39 | 40 | // Equation 5: 1x² + 2x + 5 = 0 (complex roots, will return NaN) 41 | a[4] = 1.0f; b[4] = 2.0f; c[4] = 5.0f; 42 | 43 | // Equation 6: 1x² + 1x + 30 = 0 (complex roots, will return NaN) 44 | a[5] = 1.0f; b[5] = 1.0f; c[5] = 30.0f; 45 | 46 | // Equation 7: 1x² + 1x + 35 = 0 (complex roots, will return NaN) 47 | a[6] = 1.0f; b[6] = 1.0f; c[6] = 35.0f; 48 | 49 | // Equation 8: 1x² + 1x - 40 = 0 50 | a[7] = 1.0f; b[7] = 1.0f; c[7] = -40.0f; 51 | 52 | // Print the equations we're solving 53 | std::cout << "Equations to solve:" << std::endl; 54 | for (int i = 0; i < 8; i++) { 55 | std::cout << "Equation " << (i+1) << ": " 56 | << a[i] << "x² + " << b[i] << "x + " << c[i] << " = 0" << std::endl; 57 | } 58 | std::cout << std::endl; 59 | 60 | // Union to access SIMD results 61 | float8 result; 62 | 63 | // -------- Standard scalar approach --------------- 64 | std::cout << "----------- Standard scalar approach -----------" << std::endl; 65 | 66 | // Initialize result array with placeholder values 67 | for (int lane = 0; lane < 8; ++lane) { 68 | result.a[lane] = std::numeric_limits::quiet_NaN(); 69 | } 70 | 71 | // Define the scalar implementation as a lambda for benchmarking 72 | auto scalar_func = [&]() { 73 | for (int lane = 0; lane < 8; ++lane) { 74 | float discriminant = b[lane] * b[lane] - 4.0f * a[lane] * c[lane]; 75 | if (discriminant >= 0) { 76 | // Calculate the smaller root: (-b - sqrt(discriminant)) / (2*a) 77 | result.a[lane] = (-b[lane] - sqrtf(discriminant)) / (2.0f * a[lane]); 78 | } else { 79 | // Complex roots, set to NaN 80 | result.a[lane] = std::numeric_limits::quiet_NaN(); 81 | } 82 | } 83 | }; 84 | 85 | // Run the scalar implementation once to get the results 86 | scalar_func(); 87 | 88 | // Print scalar results 89 | std::cout << "Scalar solutions (smaller root):" << std::endl; 90 | for (int lane = 0; lane < 8; ++lane) { 91 | std::cout << "Equation " << (lane+1) << ": "; 92 | if (std::isnan(result.a[lane])) { 93 | std::cout << "Complex roots" << std::endl; 94 | } else { 95 | std::cout << result.a[lane] << std::endl; 96 | } 97 | } 98 | std::cout << std::endl; 99 | 100 | // -------- SIMD approach --------------- 101 | std::cout << "----------- SIMD approach -----------" << std::endl; 102 | 103 
| // Define the SIMD implementation as a lambda for benchmarking 104 | auto simd_func = [&]() { 105 | // Load coefficients into SIMD registers 106 | __m256 aCoeffs = _mm256_loadu_ps(a); 107 | __m256 bCoeffs = _mm256_loadu_ps(b); 108 | __m256 cCoeffs = _mm256_loadu_ps(c); 109 | 110 | // Calculate discriminant: b² - 4ac 111 | // Using fused multiply-add for better precision: b*b - 4*a*c 112 | __m256 four = _mm256_set1_ps(4.0f); 113 | __m256 ac = _mm256_mul_ps(aCoeffs, cCoeffs); 114 | __m256 four_ac = _mm256_mul_ps(four, ac); 115 | __m256 b_squared = _mm256_mul_ps(bCoeffs, bCoeffs); 116 | __m256 discriminant = _mm256_sub_ps(b_squared, four_ac); 117 | 118 | // Create mask for discriminant >= 0 (real roots) 119 | __m256 zero = _mm256_setzero_ps(); 120 | __m256 mask = _mm256_cmp_ps(discriminant, zero, _CMP_GE_OQ); 121 | 122 | // Calculate sqrt(discriminant) where discriminant >= 0 123 | __m256 sqrt_discriminant = _mm256_sqrt_ps(discriminant); 124 | 125 | // Calculate -b 126 | __m256 neg_b = _mm256_sub_ps(zero, bCoeffs); 127 | 128 | // Calculate numerator: -b - sqrt(discriminant) 129 | __m256 numerator = _mm256_sub_ps(neg_b, sqrt_discriminant); 130 | 131 | // Calculate denominator: 2*a 132 | __m256 two = _mm256_set1_ps(2.0f); 133 | __m256 denominator = _mm256_mul_ps(two, aCoeffs); 134 | 135 | // Calculate result: (-b - sqrt(discriminant)) / (2*a) 136 | __m256 solution = _mm256_div_ps(numerator, denominator); 137 | 138 | // Set NaN for complex roots (discriminant < 0) 139 | __m256 nan = _mm256_set1_ps(std::numeric_limits::quiet_NaN()); 140 | __m256 final_result = _mm256_blendv_ps(nan, solution, mask); 141 | 142 | // Store result 143 | result.v = final_result; 144 | }; 145 | 146 | // Run the SIMD implementation once to get the results 147 | simd_func(); 148 | 149 | // Print SIMD results 150 | std::cout << "SIMD solutions (smaller root):" << std::endl; 151 | for (int lane = 0; lane < 8; ++lane) { 152 | std::cout << "Equation " << (lane+1) << ": "; 153 | if (std::isnan(result.a[lane])) { 154 | std::cout << "Complex roots" << std::endl; 155 | } else { 156 | std::cout << result.a[lane] << std::endl; 157 | } 158 | } 159 | std::cout << std::endl; 160 | 161 | // Benchmark comparison 162 | benchmark_comparison( 163 | "Quadratic Equation Solver", 164 | scalar_func, 165 | simd_func, 166 | 50 167 | ); 168 | 169 | // Free allocated memory 170 | free(a); 171 | free(b); 172 | free(c); 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /src/03_Examples/03_data_types/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates SIMD operations with different data types. 7 | * 8 | * We'll explore: 9 | * 1. Working with different numeric types (float, double, int, short) 10 | * 2. Converting between different SIMD types 11 | * 3. Handling different vector widths 12 | * 4. Performing operations specific to certain data types 13 | */ 14 | 15 | int main() { 16 | set_benchmark_suite("03_Examples/03_data_types"); 17 | 18 | std::cout << "=== SIMD Operations with Different Data Types ===" << std::endl; 19 | std::cout << std::endl; 20 | 21 | // -------- 1. Float operations (32-bit) -------- 22 | std::cout << "1. 
Float Operations (32-bit, 8 elements per vector)" << std::endl; 23 | 24 | // Initialize float vector 25 | __m256 float_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 26 | __m256 float_vec2 = _mm256_set1_ps(2.0f); // Set all elements to 2.0 27 | 28 | // Perform operations 29 | __m256 float_sum = _mm256_add_ps(float_vec1, float_vec2); 30 | __m256 float_product = _mm256_mul_ps(float_vec1, float_vec2); 31 | 32 | // Print results 33 | print_m256(float_vec1, "Float Vector 1"); 34 | print_m256(float_vec2, "Float Vector 2"); 35 | print_m256(float_sum, "Sum (float_vec1 + float_vec2)"); 36 | print_m256(float_product, "Product (float_vec1 * float_vec2)"); 37 | std::cout << std::endl; 38 | 39 | // -------- 2. Double operations (64-bit) -------- 40 | std::cout << "2. Double Operations (64-bit, 4 elements per vector)" << std::endl; 41 | 42 | // Initialize double vector 43 | __m256d double_vec1 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 44 | __m256d double_vec2 = _mm256_set1_pd(3.0); // Set all elements to 3.0 45 | 46 | // Perform operations 47 | __m256d double_sum = _mm256_add_pd(double_vec1, double_vec2); 48 | __m256d double_product = _mm256_mul_pd(double_vec1, double_vec2); 49 | 50 | // Print results 51 | print_m256d(double_vec1, "Double Vector 1"); 52 | print_m256d(double_vec2, "Double Vector 2"); 53 | print_m256d(double_sum, "Sum (double_vec1 + double_vec2)"); 54 | print_m256d(double_product, "Product (double_vec1 * double_vec2)"); 55 | std::cout << std::endl; 56 | 57 | // -------- 3. Integer operations (32-bit) -------- 58 | std::cout << "3. Integer Operations (32-bit, 8 elements per vector)" << std::endl; 59 | 60 | // Initialize integer vector 61 | __m256i int_vec1 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 62 | __m256i int_vec2 = _mm256_set1_epi32(10); // Set all elements to 10 63 | 64 | // Perform operations 65 | __m256i int_sum = _mm256_add_epi32(int_vec1, int_vec2); 66 | __m256i int_sub = _mm256_sub_epi32(int_vec1, int_vec2); 67 | 68 | // Print results 69 | print_m256i(int_vec1, "Int Vector 1"); 70 | print_m256i(int_vec2, "Int Vector 2"); 71 | print_m256i(int_sum, "Sum (int_vec1 + int_vec2)"); 72 | print_m256i(int_sub, "Difference (int_vec1 - int_vec2)"); 73 | std::cout << std::endl; 74 | 75 | // -------- 4. Type Conversions -------- 76 | std::cout << "4. Type Conversions" << std::endl; 77 | 78 | // Convert float to integer (truncation) 79 | __m256i float_to_int = _mm256_cvttps_epi32(float_vec1); 80 | print_m256i(float_to_int, "Float to Int (truncated)"); 81 | 82 | // Convert integer to float 83 | __m256 int_to_float = _mm256_cvtepi32_ps(int_vec1); 84 | print_m256(int_to_float, "Int to Float"); 85 | 86 | // Convert between float and double (need to split/combine) 87 | // Extract lower 4 floats and convert to double 88 | __m128 float_low = _mm256_extractf128_ps(float_vec1, 0); 89 | __m256d float_to_double_low = _mm256_cvtps_pd(float_low); 90 | print_m256d(float_to_double_low, "Lower 4 Floats to Double"); 91 | 92 | // Extract upper 4 floats and convert to double 93 | __m128 float_high = _mm256_extractf128_ps(float_vec1, 1); 94 | __m256d float_to_double_high = _mm256_cvtps_pd(float_high); 95 | print_m256d(float_to_double_high, "Upper 4 Floats to Double"); 96 | std::cout << std::endl; 97 | 98 | // -------- 5. Bitwise Operations -------- 99 | std::cout << "5. 
Bitwise Operations" << std::endl; 100 | 101 | // Create test vectors 102 | __m256i bits1 = _mm256_set1_epi32(0x0F0F0F0F); // 00001111 00001111 00001111 00001111 103 | __m256i bits2 = _mm256_set1_epi32(0x33333333); // 00110011 00110011 00110011 00110011 104 | 105 | // Perform bitwise operations 106 | __m256i bit_and = _mm256_and_si256(bits1, bits2); 107 | __m256i bit_or = _mm256_or_si256(bits1, bits2); 108 | __m256i bit_xor = _mm256_xor_si256(bits1, bits2); 109 | 110 | // Print results in hex format 111 | std::cout << "Bits1 (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 112 | << reinterpret_cast(&bits1)[0] << std::endl; 113 | std::cout << "Bits2 (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 114 | << reinterpret_cast(&bits2)[0] << std::endl; 115 | std::cout << "AND (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 116 | << reinterpret_cast(&bit_and)[0] << std::endl; 117 | std::cout << "OR (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 118 | << reinterpret_cast(&bit_or)[0] << std::endl; 119 | std::cout << "XOR (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 120 | << reinterpret_cast(&bit_xor)[0] << std::endl; 121 | std::cout << std::dec << std::endl; // Reset to decimal 122 | 123 | // -------- 6. Specialized Operations -------- 124 | std::cout << "6. Specialized Operations" << std::endl; 125 | 126 | // Horizontal addition (add adjacent pairs) 127 | __m256 hadd_result = _mm256_hadd_ps(float_vec1, float_vec2); 128 | print_m256(hadd_result, "Horizontal Add (pairs from float_vec1, float_vec2)"); 129 | 130 | // Permute (rearrange elements) 131 | __m256 permute_result = _mm256_permute_ps(float_vec1, 0b10010011); 132 | print_m256(permute_result, "Permuted float_vec1"); 133 | 134 | // Blend (select elements from two vectors based on mask) 135 | __m256 blend_result = _mm256_blend_ps(float_vec1, float_vec2, 0b10101010); 136 | print_m256(blend_result, "Blend of float_vec1 and float_vec2"); 137 | 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /src/01_Basics/03_binding_with_unions/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * 01_Basics/03_binding_with_unions - Techniques for accessing SIMD data 7 | * 8 | * This example demonstrates different ways to access and manipulate data in SIMD vectors: 9 | * 1. Using pointer conversion (reinterpret_cast) 10 | * 2. Using unions to create an alias between SIMD types and arrays 11 | * 3. Using the _mm256_store_* and _mm256_load_* functions 12 | * 4. Using the extract and insert element functions 13 | * 14 | * Each method has its advantages and use cases. 15 | */ 16 | 17 | int main() { 18 | set_benchmark_suite("01_Basics/03_binding_with_unions"); 19 | 20 | std::cout << "=== Accessing SIMD Data ===" << std::endl; 21 | std::cout << std::endl; 22 | 23 | // --------- 1. Pointer Conversion ------------- 24 | std::cout << "1. Pointer Conversion" << std::endl; 25 | std::cout << "---------------------------------------------------" << std::endl; 26 | std::cout << "Using reinterpret_cast to convert between SIMD types and arrays." << std::endl; 27 | std::cout << "This is a simple but potentially unsafe method." 
<< std::endl; 28 | std::cout << std::endl; 29 | 30 | // Initialize a SIMD vector with ascending values 31 | __m256 simd_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 32 | 33 | // Access the data using pointer conversion 34 | float* float_ptr = reinterpret_cast(&simd_vec1); 35 | 36 | // Print the data 37 | std::cout << "SIMD vector values via pointer: ["; 38 | for (int i = 0; i < 7; i++) { 39 | std::cout << float_ptr[i] << ", "; 40 | } 41 | std::cout << float_ptr[7] << "]" << std::endl; 42 | 43 | // Modify the data through the pointer 44 | std::cout << "Modifying values via pointer..." << std::endl; 45 | float_ptr[0] = 100.0f; 46 | float_ptr[4] = 200.0f; 47 | 48 | // Print the modified SIMD vector 49 | print_m256(simd_vec1, "Modified SIMD vector"); 50 | std::cout << std::endl; 51 | 52 | // --------- 2. Using Unions ------------- 53 | std::cout << "2. Using Unions" << std::endl; 54 | std::cout << "---------------------------------------------------" << std::endl; 55 | std::cout << "Using unions to create an alias between SIMD types and arrays." << std::endl; 56 | std::cout << "This is a cleaner and safer approach than pointer conversion." << std::endl; 57 | std::cout << std::endl; 58 | 59 | // Define a union for float SIMD vector 60 | union FloatSIMD { 61 | __m256 v; 62 | float a[8]; 63 | }; 64 | 65 | // Initialize the union with a SIMD vector 66 | FloatSIMD float_union; 67 | float_union.v = _mm256_set_ps(16.0f, 14.0f, 12.0f, 10.0f, 8.0f, 6.0f, 4.0f, 2.0f); 68 | 69 | // Access the data through the array 70 | std::cout << "SIMD vector values via union: ["; 71 | for (int i = 0; i < 7; i++) { 72 | std::cout << float_union.a[i] << ", "; 73 | } 74 | std::cout << float_union.a[7] << "]" << std::endl; 75 | 76 | // Modify the data through the array 77 | std::cout << "Modifying values via union..." << std::endl; 78 | float_union.a[1] = 42.0f; 79 | float_union.a[6] = 99.0f; 80 | 81 | // Print the modified SIMD vector 82 | print_m256(float_union.v, "Modified SIMD vector (union)"); 83 | 84 | // Using our utility union from simd_utils.h 85 | float8 float8_union; 86 | float8_union.v = _mm256_set1_ps(5.0f); 87 | float8_union.a[2] = 10.0f; 88 | float8_union.a[5] = 20.0f; 89 | 90 | print_m256(float8_union.v, "Using float8 union from simd_utils.h"); 91 | std::cout << std::endl; 92 | 93 | // --------- 3. Store and Load Functions ------------- 94 | std::cout << "3. Store and Load Functions" << std::endl; 95 | std::cout << "---------------------------------------------------" << std::endl; 96 | std::cout << "Using _mm256_store_* and _mm256_load_* functions to transfer data." << std::endl; 97 | std::cout << "This is the recommended approach for most situations." << std::endl; 98 | std::cout << std::endl; 99 | 100 | // Initialize a SIMD vector 101 | __m256 simd_vec3 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 102 | 103 | // Allocate aligned memory for the array 104 | float* aligned_array = aligned_alloc(8); 105 | 106 | // Store the SIMD vector to the array 107 | _mm256_store_ps(aligned_array, simd_vec3); 108 | 109 | // Print the array 110 | std::cout << "SIMD vector values via store: ["; 111 | for (int i = 0; i < 7; i++) { 112 | std::cout << aligned_array[i] << ", "; 113 | } 114 | std::cout << aligned_array[7] << "]" << std::endl; 115 | 116 | // Modify the array 117 | std::cout << "Modifying values in the array..." 
<< std::endl; 118 | aligned_array[3] = 30.0f; 119 | aligned_array[7] = 80.0f; 120 | 121 | // Load the modified array back to a SIMD vector 122 | __m256 modified_vec = _mm256_load_ps(aligned_array); 123 | 124 | // Print the modified SIMD vector 125 | print_m256(modified_vec, "Modified SIMD vector (store/load)"); 126 | std::cout << std::endl; 127 | 128 | // --------- 4. Extract and Insert Elements ------------- 129 | std::cout << "4. Extract and Insert Elements" << std::endl; 130 | std::cout << "---------------------------------------------------" << std::endl; 131 | std::cout << "Using _mm256_extract_* and _mm256_insert_* functions to access individual elements." << std::endl; 132 | std::cout << "This is useful when you only need to access a few elements." << std::endl; 133 | std::cout << std::endl; 134 | 135 | // Initialize a SIMD vector with integers 136 | __m256i simd_int_vec = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 137 | 138 | // Extract individual elements 139 | // Note: For AVX2, we need to extract 128-bit lanes first, then extract from those 140 | __m128i low_lane = _mm256_extracti128_si256(simd_int_vec, 0); // Extract lower 128 bits 141 | __m128i high_lane = _mm256_extracti128_si256(simd_int_vec, 1); // Extract upper 128 bits 142 | 143 | int element0 = _mm_extract_epi32(low_lane, 0); // Extract element 0 144 | int element3 = _mm_extract_epi32(low_lane, 3); // Extract element 3 145 | int element4 = _mm_extract_epi32(high_lane, 0); // Extract element 4 146 | int element7 = _mm_extract_epi32(high_lane, 3); // Extract element 7 147 | 148 | std::cout << "Extracted elements: " << element0 << ", " << element3 << ", " 149 | << element4 << ", " << element7 << std::endl; 150 | 151 | // Insert elements 152 | // For inserting, we need to create new 128-bit vectors and then combine them 153 | __m128i new_low = _mm_insert_epi32(low_lane, 100, 1); // Replace element 1 154 | __m128i new_high = _mm_insert_epi32(high_lane, 200, 2); // Replace element 6 155 | 156 | // Combine the lanes back into a 256-bit vector 157 | __m256i modified_int_vec = _mm256_setr_m128i(new_low, new_high); 158 | 159 | // Print the modified vector 160 | print_m256i(modified_int_vec, "Modified integer vector (extract/insert)"); 161 | 162 | // Clean up 163 | free(aligned_array); 164 | 165 | return 0; 166 | } 167 | -------------------------------------------------------------------------------- /src/include/simd_utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * simd_utils.h - Utility functions and macros for SIMD programming 3 | * 4 | * This header provides common utilities for SIMD programming, including: 5 | * - Type definitions for SIMD vectors 6 | * - Helper macros for alignment 7 | * - Utility functions for printing SIMD vectors 8 | * - Performance measurement utilities 9 | */ 10 | 11 | #ifndef SIMD_UTILS_H 12 | #define SIMD_UTILS_H 13 | 14 | #include // AVX2, 256-bit operations 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | // Alignment macros 26 | #define SIMD_ALIGN_32 alignas(32) 27 | #define SIMD_ALIGN_64 alignas(64) 28 | 29 | // Helper union for accessing SIMD vector elements 30 | union float8 { 31 | __m256 v; 32 | float a[8]; 33 | 34 | float8(__m256 _v) : v(_v) {} 35 | float8() : v(_mm256_setzero_ps()) {} 36 | }; 37 | 38 | union double4 { 39 | __m256d v; 40 | double a[4]; 41 | 42 | double4(__m256d _v) : v(_v) {} 43 | double4() : v(_mm256_setzero_pd()) {} 44 | }; 45 | 46 | 
union int8 { 47 | __m256i v; 48 | int a[8]; 49 | 50 | int8(__m256i _v) : v(_v) {} 51 | int8() : v(_mm256_setzero_si256()) {} 52 | }; 53 | 54 | // Print utilities 55 | inline void print_m256(const __m256& v, const std::string& label = "") { 56 | float8 tmp(v); 57 | if (!label.empty()) { 58 | std::cout << label << ": "; 59 | } 60 | std::cout << "["; 61 | for (int i = 0; i < 7; i++) { 62 | std::cout << tmp.a[i] << ", "; 63 | } 64 | std::cout << tmp.a[7] << "]" << std::endl; 65 | } 66 | 67 | inline void print_m256d(const __m256d& v, const std::string& label = "") { 68 | double4 tmp(v); 69 | if (!label.empty()) { 70 | std::cout << label << ": "; 71 | } 72 | std::cout << "["; 73 | for (int i = 0; i < 3; i++) { 74 | std::cout << tmp.a[i] << ", "; 75 | } 76 | std::cout << tmp.a[3] << "]" << std::endl; 77 | } 78 | 79 | inline void print_m256i(const __m256i& v, const std::string& label = "") { 80 | int8 tmp(v); 81 | if (!label.empty()) { 82 | std::cout << label << ": "; 83 | } 84 | std::cout << "["; 85 | for (int i = 0; i < 7; i++) { 86 | std::cout << tmp.a[i] << ", "; 87 | } 88 | std::cout << tmp.a[7] << "]" << std::endl; 89 | } 90 | 91 | // Performance measurement utilities 92 | class Timer { 93 | private: 94 | std::chrono::high_resolution_clock::time_point start_time; 95 | std::string label; 96 | 97 | public: 98 | Timer(const std::string& _label = "Operation") : label(_label) { 99 | start_time = std::chrono::high_resolution_clock::now(); 100 | } 101 | 102 | ~Timer() { 103 | auto end_time = std::chrono::high_resolution_clock::now(); 104 | auto duration = std::chrono::duration_cast(end_time - start_time); 105 | std::cout << label << " took " << duration.count() << " microseconds" << std::endl; 106 | } 107 | }; 108 | 109 | namespace simd_bench_detail { 110 | 111 | inline std::string& suite_label() { 112 | static std::string label = "unspecified_suite"; 113 | return label; 114 | } 115 | 116 | inline std::string& csv_path_store() { 117 | static std::string path; 118 | return path; 119 | } 120 | 121 | inline std::mutex& csv_mutex() { 122 | static std::mutex m; 123 | return m; 124 | } 125 | 126 | inline std::string csv_escape(const std::string& input) { 127 | std::string out; 128 | out.reserve(input.size() + 2); 129 | out.push_back('"'); 130 | for (char ch : input) { 131 | if (ch == '"') { 132 | out.push_back('"'); 133 | out.push_back('"'); 134 | } else { 135 | out.push_back(ch); 136 | } 137 | } 138 | out.push_back('"'); 139 | return out; 140 | } 141 | 142 | inline std::string effective_csv_path() { 143 | const char* env_path = std::getenv("SIMD_BENCHMARK_CSV"); 144 | if (env_path && *env_path) { 145 | return std::string(env_path); 146 | } 147 | return csv_path_store(); 148 | } 149 | 150 | } // namespace simd_bench_detail 151 | 152 | inline void set_benchmark_suite(const std::string& suite) { 153 | simd_bench_detail::suite_label() = suite; 154 | } 155 | 156 | inline void set_benchmark_csv_path(const std::string& path) { 157 | simd_bench_detail::csv_path_store() = path; 158 | } 159 | 160 | // Benchmark function to compare scalar vs SIMD implementations 161 | template 162 | void benchmark_comparison( 163 | const std::string& label, 164 | ScalarFunc scalar_func, 165 | SimdFunc simd_func, 166 | int iterations = 1000 167 | ) { 168 | // Warm-up 169 | scalar_func(); 170 | simd_func(); 171 | 172 | // Benchmark scalar implementation 173 | auto scalar_start = std::chrono::high_resolution_clock::now(); 174 | for (int i = 0; i < iterations; i++) { 175 | scalar_func(); 176 | } 177 | auto scalar_end = 
std::chrono::high_resolution_clock::now(); 178 | auto scalar_duration = std::chrono::duration_cast(scalar_end - scalar_start); 179 | 180 | // Benchmark SIMD implementation 181 | auto simd_start = std::chrono::high_resolution_clock::now(); 182 | for (int i = 0; i < iterations; i++) { 183 | simd_func(); 184 | } 185 | auto simd_end = std::chrono::high_resolution_clock::now(); 186 | auto simd_duration = std::chrono::duration_cast(simd_end - simd_start); 187 | 188 | // Print results 189 | std::cout << "===== " << label << " Benchmark =====" << std::endl; 190 | std::cout << "Scalar implementation: " << scalar_duration.count() << " microseconds" << std::endl; 191 | std::cout << "SIMD implementation: " << simd_duration.count() << " microseconds" << std::endl; 192 | 193 | double speedup = static_cast(scalar_duration.count()) / simd_duration.count(); 194 | std::cout << "Speedup: " << std::fixed << std::setprecision(2) << speedup << "x" << std::endl; 195 | std::cout << "===============================" << std::endl; 196 | 197 | const std::string csv_path = simd_bench_detail::effective_csv_path(); 198 | if (!csv_path.empty()) { 199 | std::lock_guard lock(simd_bench_detail::csv_mutex()); 200 | bool need_header = false; 201 | { 202 | std::ifstream existing(csv_path); 203 | if (!existing.good() || existing.peek() == std::ifstream::traits_type::eof()) { 204 | need_header = true; 205 | } 206 | } 207 | 208 | std::ofstream csv(csv_path, std::ios::app); 209 | if (csv) { 210 | if (need_header) { 211 | csv << "suite,label,iterations,scalar_us,simd_us,speedup" << '\n'; 212 | } 213 | csv << simd_bench_detail::csv_escape(simd_bench_detail::suite_label()) << ',' 214 | << simd_bench_detail::csv_escape(label) << ',' 215 | << iterations << ',' 216 | << scalar_duration.count() << ',' 217 | << simd_duration.count() << ',' 218 | << std::setprecision(6) << speedup << '\n'; 219 | } else { 220 | std::cerr << "Failed to write benchmark CSV at " << csv_path << std::endl; 221 | } 222 | } 223 | } 224 | 225 | // Allocate aligned memory 226 | template 227 | T* aligned_alloc(size_t size, size_t alignment = 32) { 228 | void* ptr = nullptr; 229 | if (posix_memalign(&ptr, alignment, size * sizeof(T)) != 0) { 230 | throw std::bad_alloc(); 231 | } 232 | return static_cast(ptr); 233 | } 234 | 235 | #endif // SIMD_UTILS_H 236 | -------------------------------------------------------------------------------- /src/01_Basics/04_loading_data/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 01_Basics/04_loading_data - Loading and storing SIMD data 8 | * 9 | * This example demonstrates different ways to load data into SIMD vectors: 10 | * 1. Aligned load (_mm256_load_ps) - Requires 32-byte aligned memory 11 | * 2. Unaligned load (_mm256_loadu_ps) - Works with any memory address 12 | * 3. Masked load (_mm256_maskload_ps) - Selectively loads elements based on a mask 13 | * 4. Stream load (_mm256_stream_load_si256) - Non-temporal load that bypasses cache 14 | * 15 | * And different ways to store SIMD data: 16 | * 1. Aligned store (_mm256_store_ps) - Requires 32-byte aligned memory 17 | * 2. Unaligned store (_mm256_storeu_ps) - Works with any memory address 18 | * 3. Masked store (_mm256_maskstore_ps) - Selectively stores elements based on a mask 19 | * 4. Stream store (_mm256_stream_ps) - Non-temporal store that bypasses cache 20 | * 21 | * We'll also compare the performance of these methods. 
22 | */ 23 | 24 | const int ARRAY_SIZE = 8; 25 | // Fewer iterations keep the demo responsive while still highlighting relative costs. 26 | const int TEST_ITERATIONS = 100000; 27 | 28 | int main() { 29 | set_benchmark_suite("01_Basics/04_loading_data"); 30 | 31 | std::cout << "=== SIMD Data Loading and Storing ===" << std::endl; 32 | std::cout << std::endl; 33 | 34 | // --------- 1. Aligned vs. Unaligned Load ------------- 35 | std::cout << "1. Aligned vs. Unaligned Load" << std::endl; 36 | std::cout << "---------------------------------------------------" << std::endl; 37 | std::cout << "Comparing aligned and unaligned memory access." << std::endl; 38 | std::cout << std::endl; 39 | 40 | // Allocate aligned and unaligned memory 41 | float* aligned_data = aligned_alloc(ARRAY_SIZE, 32); // 32-byte alignment for AVX 42 | float* unaligned_data = new float[ARRAY_SIZE + 1]; // +1 to ensure we can create unaligned pointer 43 | float* unaligned_ptr = unaligned_data + 1; // Offset by 1 to ensure unalignment 44 | 45 | // Initialize data 46 | for (int i = 0; i < ARRAY_SIZE; i++) { 47 | aligned_data[i] = static_cast(i + 1); 48 | unaligned_ptr[i] = static_cast(i + 1); 49 | } 50 | 51 | // Demonstrate aligned load 52 | __m256 aligned_vec = _mm256_load_ps(aligned_data); 53 | print_m256(aligned_vec, "Aligned load result"); 54 | 55 | // Demonstrate unaligned load 56 | __m256 unaligned_vec = _mm256_loadu_ps(unaligned_ptr); 57 | print_m256(unaligned_vec, "Unaligned load result"); 58 | 59 | // Performance comparison 60 | Timer timer("Aligned vs. Unaligned Load Performance"); 61 | 62 | // Benchmark aligned load 63 | auto aligned_load = [&]() { 64 | __m256 result; 65 | for (int i = 0; i < TEST_ITERATIONS; i++) { 66 | result = _mm256_load_ps(aligned_data); 67 | } 68 | return result; 69 | }; 70 | 71 | // Benchmark unaligned load 72 | auto unaligned_load = [&]() { 73 | __m256 result; 74 | for (int i = 0; i < TEST_ITERATIONS; i++) { 75 | result = _mm256_loadu_ps(unaligned_ptr); 76 | } 77 | return result; 78 | }; 79 | 80 | benchmark_comparison("Load Operations", aligned_load, unaligned_load, 10); 81 | std::cout << std::endl; 82 | 83 | // --------- 2. Masked Load ------------- 84 | std::cout << "2. Masked Load" << std::endl; 85 | std::cout << "---------------------------------------------------" << std::endl; 86 | std::cout << "Selectively loading elements based on a mask." << std::endl; 87 | std::cout << std::endl; 88 | 89 | // Create a mask to load only elements 0, 2, 4, and 6 90 | __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); 91 | 92 | // Perform masked load (elements not selected by mask will be zero) 93 | __m256 masked_vec = _mm256_maskload_ps(aligned_data, mask); 94 | print_m256(masked_vec, "Masked load result (even indices only)"); 95 | std::cout << std::endl; 96 | 97 | // --------- 3. Aligned vs. Unaligned Store ------------- 98 | std::cout << "3. Aligned vs. Unaligned Store" << std::endl; 99 | std::cout << "---------------------------------------------------" << std::endl; 100 | std::cout << "Comparing aligned and unaligned store operations." 
<< std::endl; 101 | std::cout << std::endl; 102 | 103 | // Create a test vector 104 | __m256 test_vec = _mm256_set_ps(16.0f, 14.0f, 12.0f, 10.0f, 8.0f, 6.0f, 4.0f, 2.0f); 105 | 106 | // Perform aligned store 107 | _mm256_store_ps(aligned_data, test_vec); 108 | 109 | std::cout << "Aligned store result: ["; 110 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 111 | std::cout << aligned_data[i] << ", "; 112 | } 113 | std::cout << aligned_data[ARRAY_SIZE - 1] << "]" << std::endl; 114 | 115 | // Perform unaligned store 116 | _mm256_storeu_ps(unaligned_ptr, test_vec); 117 | 118 | std::cout << "Unaligned store result: ["; 119 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 120 | std::cout << unaligned_ptr[i] << ", "; 121 | } 122 | std::cout << unaligned_ptr[ARRAY_SIZE - 1] << "]" << std::endl; 123 | 124 | // Performance comparison 125 | Timer timer2("Aligned vs. Unaligned Store Performance"); 126 | 127 | // Benchmark aligned store 128 | auto aligned_store = [&]() { 129 | for (int i = 0; i < TEST_ITERATIONS; i++) { 130 | _mm256_store_ps(aligned_data, test_vec); 131 | } 132 | }; 133 | 134 | // Benchmark unaligned store 135 | auto unaligned_store = [&]() { 136 | for (int i = 0; i < TEST_ITERATIONS; i++) { 137 | _mm256_storeu_ps(unaligned_ptr, test_vec); 138 | } 139 | }; 140 | 141 | benchmark_comparison("Store Operations", aligned_store, unaligned_store, 10); 142 | std::cout << std::endl; 143 | 144 | // --------- 4. Masked Store ------------- 145 | std::cout << "4. Masked Store" << std::endl; 146 | std::cout << "---------------------------------------------------" << std::endl; 147 | std::cout << "Selectively storing elements based on a mask." << std::endl; 148 | std::cout << std::endl; 149 | 150 | // Reset aligned data 151 | for (int i = 0; i < ARRAY_SIZE; i++) { 152 | aligned_data[i] = 0.0f; 153 | } 154 | 155 | // Create a mask to store only elements 1, 3, 5, and 7 156 | __m256i mask2 = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); 157 | 158 | // Perform masked store 159 | _mm256_maskstore_ps(aligned_data, mask2, test_vec); 160 | 161 | std::cout << "Masked store result (odd indices only): ["; 162 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 163 | std::cout << aligned_data[i] << ", "; 164 | } 165 | std::cout << aligned_data[ARRAY_SIZE - 1] << "]" << std::endl; 166 | std::cout << std::endl; 167 | 168 | // --------- 5. Stream Load/Store (Non-temporal) ------------- 169 | std::cout << "5. Stream Load/Store (Non-temporal)" << std::endl; 170 | std::cout << "---------------------------------------------------" << std::endl; 171 | std::cout << "Using non-temporal loads and stores that bypass the cache." << std::endl; 172 | std::cout << "Useful for large data sets that won't be reused soon." 
<< std::endl; 173 | std::cout << std::endl; 174 | 175 | // Allocate a large array to demonstrate streaming operations 176 | const int LARGE_SIZE = 1024; 177 | float* large_array = aligned_alloc(LARGE_SIZE, 32); 178 | 179 | // Initialize the array 180 | for (int i = 0; i < LARGE_SIZE; i++) { 181 | large_array[i] = static_cast(i); 182 | } 183 | 184 | // Perform stream load and store 185 | for (int i = 0; i < LARGE_SIZE; i += 8) { 186 | // Stream load (using _mm256_stream_load_si256 which requires casting) 187 | __m256 loaded = _mm256_loadu_ps(&large_array[i]); 188 | 189 | // Process the data (simple multiplication by 2) 190 | __m256 processed = _mm256_mul_ps(loaded, _mm256_set1_ps(2.0f)); 191 | 192 | // Stream store (non-temporal store that bypasses cache) 193 | _mm256_stream_ps(&large_array[i], processed); 194 | } 195 | 196 | // Ensure all streaming stores are visible 197 | _mm_sfence(); 198 | 199 | // Print a small section of the result 200 | std::cout << "Stream store result (first 16 elements): ["; 201 | for (int i = 0; i < 15; i++) { 202 | std::cout << large_array[i] << ", "; 203 | } 204 | std::cout << large_array[15] << "]" << std::endl; 205 | 206 | // Clean up 207 | free(aligned_data); 208 | delete[] unaligned_data; 209 | free(large_array); 210 | 211 | return 0; 212 | } 213 | -------------------------------------------------------------------------------- /src/03_Examples/01_conditional_code/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | /** 12 | * 03_Examples/01_conditional_code - Implementing conditional operations with SIMD 13 | * 14 | * This example demonstrates how to implement conditional logic using SIMD: 15 | * 1. Clamping values to a range 16 | * 2. Filtering positive values 17 | * 3. Complex conditional operations (multiple conditions) 18 | * 4. Using masks and blending for conditional selection 19 | * 20 | * Conditional operations are challenging in SIMD because traditional branching 21 | * (if/else statements) doesn't work well with vector operations. Instead, we use 22 | * comparison operations to create masks, and then use those masks to select values. 23 | */ 24 | 25 | int main() { 26 | set_benchmark_suite("03_Examples/01_conditional_code"); 27 | 28 | std::cout << "=== SIMD Conditional Operations ===" << std::endl; 29 | std::cout << std::endl; 30 | 31 | // Initialize test data 32 | // Allocate aligned memory for better performance 33 | float* data1 = aligned_alloc(8); 34 | float* data2 = aligned_alloc(8); 35 | float* result_scalar = aligned_alloc(8); 36 | float* result_simd = aligned_alloc(8); 37 | 38 | // Initialize data1 with ascending values 39 | data1[0] = 5.0f; data1[1] = 10.0f; data1[2] = 15.0f; data1[3] = 20.0f; 40 | data1[4] = 25.0f; data1[5] = 30.0f; data1[6] = 35.0f; data1[7] = 40.0f; 41 | 42 | // Initialize data2 with mixed positive and negative values 43 | data2[0] = -1.0f; data2[1] = 4.0f; data2[2] = 9.0f; data2[3] = -16.0f; 44 | data2[4] = 25.0f; data2[5] = -36.0f; data2[6] = 49.0f; data2[7] = -64.0f; 45 | 46 | // Load data into SIMD registers 47 | __m256 vector1 = _mm256_load_ps(data1); 48 | __m256 vector2 = _mm256_load_ps(data2); 49 | 50 | // Print input data 51 | print_m256(vector1, "Vector 1"); 52 | print_m256(vector2, "Vector 2"); 53 | std::cout << std::endl; 54 | 55 | // --------- 1. Clamping Values ------------- 56 | std::cout << "1. 
Clamping Values" << std::endl; 57 | std::cout << "---------------------------------------------------" << std::endl; 58 | std::cout << "Clamping values in Vector 2 to the range [5, 30]" << std::endl; 59 | std::cout << std::endl; 60 | 61 | // Scalar implementation of clamping 62 | auto scalar_clamp = [&]() { 63 | for (int i = 0; i < 8; i++) { 64 | result_scalar[i] = std::max(5.0f, std::min(30.0f, data2[i])); 65 | } 66 | }; 67 | 68 | // SIMD implementation of clamping 69 | auto simd_clamp = [&]() { 70 | __m256 min_val = _mm256_set1_ps(5.0f); 71 | __m256 max_val = _mm256_set1_ps(30.0f); 72 | 73 | // First, clamp to upper bound (min operation) 74 | __m256 upper_clamped = _mm256_min_ps(vector2, max_val); 75 | 76 | // Then, clamp to lower bound (max operation) 77 | __m256 result = _mm256_max_ps(upper_clamped, min_val); 78 | 79 | _mm256_store_ps(result_simd, result); 80 | }; 81 | 82 | // Execute both implementations 83 | scalar_clamp(); 84 | simd_clamp(); 85 | 86 | // Print results 87 | std::cout << "Scalar clamping result: ["; 88 | for (int i = 0; i < 7; i++) { 89 | std::cout << result_scalar[i] << ", "; 90 | } 91 | std::cout << result_scalar[7] << "]" << std::endl; 92 | 93 | std::cout << "SIMD clamping result: ["; 94 | for (int i = 0; i < 7; i++) { 95 | std::cout << result_simd[i] << ", "; 96 | } 97 | std::cout << result_simd[7] << "]" << std::endl; 98 | 99 | // Benchmark comparison 100 | benchmark_comparison("Clamping", scalar_clamp, simd_clamp, 200); 101 | std::cout << std::endl; 102 | 103 | // --------- 2. Filtering Positive Values ------------- 104 | std::cout << "2. Filtering Positive Values" << std::endl; 105 | std::cout << "---------------------------------------------------" << std::endl; 106 | std::cout << "Creating a mask for positive values in Vector 2" << std::endl; 107 | std::cout << std::endl; 108 | 109 | // Create a mask for positive values 110 | __m256 zero = _mm256_setzero_ps(); 111 | __m256 positive_mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 112 | 113 | // Print the mask (all bits set for true, all bits clear for false) 114 | float8 mask_values(positive_mask); 115 | std::cout << "Positive mask (as floats): ["; 116 | for (int i = 0; i < 7; i++) { 117 | std::cout << mask_values.a[i] << ", "; 118 | } 119 | std::cout << mask_values.a[7] << "]" << std::endl; 120 | 121 | // Convert the mask to a bitmask (one bit per element) 122 | int bitmask = _mm256_movemask_ps(positive_mask); 123 | std::cout << "Positive mask (as bitmask): " << std::bitset<8>(bitmask) << " (decimal: " << bitmask << ")" << std::endl; 124 | 125 | // Explain the bitmask 126 | std::cout << "Explanation: Positions 1, 2, 4, and 6 have positive values," << std::endl; 127 | std::cout << "corresponding to bits 1, 2, 4, and 6 in the bitmask." 
<< std::endl; 128 | std::cout << "As a decimal: 2^1 + 2^2 + 2^4 + 2^6 = 2 + 4 + 16 + 64 = 86" << std::endl; 129 | std::cout << std::endl; 130 | 131 | // Scalar implementation of filtering 132 | auto scalar_filter = [&]() { 133 | for (int i = 0; i < 8; i++) { 134 | if (data2[i] > 0) { 135 | result_scalar[i] = data2[i]; 136 | } else { 137 | result_scalar[i] = 0.0f; 138 | } 139 | } 140 | }; 141 | 142 | // SIMD implementation of filtering 143 | auto simd_filter = [&]() { 144 | __m256 mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 145 | __m256 result = _mm256_and_ps(vector2, mask); // Keep only positive values 146 | _mm256_store_ps(result_simd, result); 147 | }; 148 | 149 | // Execute both implementations 150 | scalar_filter(); 151 | simd_filter(); 152 | 153 | // Print results 154 | std::cout << "Scalar filtering result: ["; 155 | for (int i = 0; i < 7; i++) { 156 | std::cout << result_scalar[i] << ", "; 157 | } 158 | std::cout << result_scalar[7] << "]" << std::endl; 159 | 160 | std::cout << "SIMD filtering result: ["; 161 | for (int i = 0; i < 7; i++) { 162 | std::cout << result_simd[i] << ", "; 163 | } 164 | std::cout << result_simd[7] << "]" << std::endl; 165 | 166 | // Benchmark comparison 167 | benchmark_comparison("Filtering", scalar_filter, simd_filter, 200); 168 | std::cout << std::endl; 169 | 170 | // --------- 3. Complex Conditional Operations ------------- 171 | std::cout << "3. Complex Conditional Operations" << std::endl; 172 | std::cout << "---------------------------------------------------" << std::endl; 173 | std::cout << "Finding values in Vector 2 that are both positive and greater than Vector 1" << std::endl; 174 | std::cout << std::endl; 175 | 176 | // Create masks for both conditions 177 | __m256 positive_mask2 = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 178 | __m256 greater_mask = _mm256_cmp_ps(vector2, vector1, _CMP_GT_OQ); 179 | 180 | // Combine masks with logical AND 181 | __m256 combined_mask = _mm256_and_ps(positive_mask2, greater_mask); 182 | 183 | // Print the combined mask 184 | float8 combined_mask_values(combined_mask); 185 | std::cout << "Combined mask (as floats): ["; 186 | for (int i = 0; i < 7; i++) { 187 | std::cout << combined_mask_values.a[i] << ", "; 188 | } 189 | std::cout << combined_mask_values.a[7] << "]" << std::endl; 190 | 191 | // Convert the combined mask to a bitmask 192 | int combined_bitmask = _mm256_movemask_ps(combined_mask); 193 | std::cout << "Combined mask (as bitmask): " << std::bitset<8>(combined_bitmask) << " (decimal: " << combined_bitmask << ")" << std::endl; 194 | std::cout << std::endl; 195 | 196 | // Scalar implementation of complex filtering 197 | auto scalar_complex = [&]() { 198 | for (int i = 0; i < 8; i++) { 199 | if (data2[i] > 0 && data2[i] > data1[i]) { 200 | result_scalar[i] = data2[i]; 201 | } else { 202 | result_scalar[i] = 0.0f; 203 | } 204 | } 205 | }; 206 | 207 | // SIMD implementation of complex filtering using blendv 208 | auto simd_complex = [&]() { 209 | __m256 pos_mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 210 | __m256 gt_mask = _mm256_cmp_ps(vector2, vector1, _CMP_GT_OQ); 211 | __m256 combined = _mm256_and_ps(pos_mask, gt_mask); 212 | 213 | // Use blendv to select values: if mask is true, take from vector2, else take 0 214 | __m256 result = _mm256_blendv_ps(zero, vector2, combined); 215 | _mm256_store_ps(result_simd, result); 216 | }; 217 | 218 | // Execute both implementations 219 | scalar_complex(); 220 | simd_complex(); 221 | 222 | // Print results 223 | std::cout << "Scalar complex filtering 
result: ["; 224 | for (int i = 0; i < 7; i++) { 225 | std::cout << result_scalar[i] << ", "; 226 | } 227 | std::cout << result_scalar[7] << "]" << std::endl; 228 | 229 | std::cout << "SIMD complex filtering result: ["; 230 | for (int i = 0; i < 7; i++) { 231 | std::cout << result_simd[i] << ", "; 232 | } 233 | std::cout << result_simd[7] << "]" << std::endl; 234 | 235 | // Benchmark comparison 236 | benchmark_comparison("Complex Filtering", scalar_complex, simd_complex, 200); 237 | std::cout << std::endl; 238 | 239 | // --------- 4. Conditional Selection with Blending ------------- 240 | std::cout << "4. Conditional Selection with Blending" << std::endl; 241 | std::cout << "---------------------------------------------------" << std::endl; 242 | std::cout << "Using _mm256_blendv_ps for conditional selection" << std::endl; 243 | std::cout << std::endl; 244 | 245 | // Create a new vector with different values 246 | __m256 vector3 = _mm256_set_ps(80.0f, 70.0f, 60.0f, 50.0f, 40.0f, 30.0f, 20.0f, 10.0f); 247 | print_m256(vector3, "Vector 3"); 248 | 249 | // Create a mask based on a condition (e.g., values > 50) 250 | __m256 threshold = _mm256_set1_ps(50.0f); 251 | __m256 blend_mask = _mm256_cmp_ps(vector3, threshold, _CMP_GT_OQ); 252 | 253 | // Use blendv to select values from vector1 or vector2 based on the mask 254 | __m256 blended = _mm256_blendv_ps(vector1, vector2, blend_mask); 255 | print_m256(blended, "Blended Result (Vector 2 if > 50, else Vector 1)"); 256 | 257 | // Explain the blending operation 258 | std::cout << "Explanation: For each element, if Vector 3 > 50, we take the value from Vector 2," << std::endl; 259 | std::cout << "otherwise we take the value from Vector 1." << std::endl; 260 | std::cout << std::endl; 261 | 262 | // Clean up 263 | free(data1); 264 | free(data2); 265 | free(result_scalar); 266 | free(result_simd); 267 | 268 | return 0; 269 | } 270 | -------------------------------------------------------------------------------- /src/02_Computations/01_simple_maths/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 02_Computations/01_simple_maths - Basic SIMD mathematical operations 8 | * 9 | * This example demonstrates various mathematical operations using SIMD: 10 | * 1. Addition (_mm256_add_ps) 11 | * 2. Subtraction (_mm256_sub_ps) 12 | * 3. Multiplication (_mm256_mul_ps) 13 | * 4. Division (_mm256_div_ps) 14 | * 5. Fused Multiply-Add (_mm256_fmadd_ps) 15 | * 6. Square Root (_mm256_sqrt_ps) 16 | * 7. Minimum/Maximum (_mm256_min_ps, _mm256_max_ps) 17 | * 8. Horizontal operations (_mm256_hadd_ps, _mm256_hsub_ps) 18 | * 19 | * For each operation, we compare the performance of SIMD vs. scalar implementation. 20 | */ 21 | 22 | int main() { 23 | set_benchmark_suite("02_Computations/01_simple_maths"); 24 | 25 | std::cout << "=== SIMD Mathematical Operations ===" << std::endl; 26 | std::cout << std::endl; 27 | 28 | // Initialize test data 29 | float data1[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; 30 | float data2[8] = {8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}; 31 | 32 | // Load data into SIMD vectors 33 | __m256 vector1 = _mm256_loadu_ps(data1); 34 | __m256 vector2 = _mm256_loadu_ps(data2); 35 | 36 | // --------- 1. Addition ------------- 37 | std::cout << "1. 
Addition (_mm256_add_ps)" << std::endl; 38 | std::cout << "---------------------------------------------------" << std::endl; 39 | std::cout << "Adds corresponding elements of two vectors." << std::endl; 40 | std::cout << std::endl; 41 | 42 | // Print input vectors 43 | print_m256(vector1, "Vector 1"); 44 | print_m256(vector2, "Vector 2"); 45 | 46 | // Perform addition 47 | __m256 add_result = _mm256_add_ps(vector1, vector2); 48 | print_m256(add_result, "Addition Result (Vector 1 + Vector 2)"); 49 | 50 | // Compare performance: scalar vs. SIMD 51 | auto scalar_add = [&]() { 52 | float result[8]; 53 | for (int i = 0; i < 8; i++) { 54 | result[i] = data1[i] + data2[i]; 55 | } 56 | }; 57 | 58 | auto simd_add = [&]() { 59 | __m256 result = _mm256_add_ps(vector1, vector2); 60 | }; 61 | 62 | benchmark_comparison("Addition", scalar_add, simd_add); 63 | std::cout << std::endl; 64 | 65 | // --------- 2. Subtraction ------------- 66 | std::cout << "2. Subtraction (_mm256_sub_ps)" << std::endl; 67 | std::cout << "---------------------------------------------------" << std::endl; 68 | std::cout << "Subtracts corresponding elements of two vectors." << std::endl; 69 | std::cout << std::endl; 70 | 71 | // Perform subtraction 72 | __m256 sub_result = _mm256_sub_ps(vector1, vector2); 73 | print_m256(sub_result, "Subtraction Result (Vector 1 - Vector 2)"); 74 | 75 | // Compare performance: scalar vs. SIMD 76 | auto scalar_sub = [&]() { 77 | float result[8]; 78 | for (int i = 0; i < 8; i++) { 79 | result[i] = data1[i] - data2[i]; 80 | } 81 | }; 82 | 83 | auto simd_sub = [&]() { 84 | __m256 result = _mm256_sub_ps(vector1, vector2); 85 | }; 86 | 87 | benchmark_comparison("Subtraction", scalar_sub, simd_sub); 88 | std::cout << std::endl; 89 | 90 | // --------- 3. Multiplication ------------- 91 | std::cout << "3. Multiplication (_mm256_mul_ps)" << std::endl; 92 | std::cout << "---------------------------------------------------" << std::endl; 93 | std::cout << "Multiplies corresponding elements of two vectors." << std::endl; 94 | std::cout << std::endl; 95 | 96 | // Perform multiplication 97 | __m256 mul_result = _mm256_mul_ps(vector1, vector2); 98 | print_m256(mul_result, "Multiplication Result (Vector 1 * Vector 2)"); 99 | 100 | // Compare performance: scalar vs. SIMD 101 | auto scalar_mul = [&]() { 102 | float result[8]; 103 | for (int i = 0; i < 8; i++) { 104 | result[i] = data1[i] * data2[i]; 105 | } 106 | }; 107 | 108 | auto simd_mul = [&]() { 109 | __m256 result = _mm256_mul_ps(vector1, vector2); 110 | }; 111 | 112 | benchmark_comparison("Multiplication", scalar_mul, simd_mul); 113 | std::cout << std::endl; 114 | 115 | // --------- 4. Division ------------- 116 | std::cout << "4. Division (_mm256_div_ps)" << std::endl; 117 | std::cout << "---------------------------------------------------" << std::endl; 118 | std::cout << "Divides corresponding elements of two vectors." << std::endl; 119 | std::cout << std::endl; 120 | 121 | // Perform division 122 | __m256 div_result = _mm256_div_ps(vector1, vector2); 123 | print_m256(div_result, "Division Result (Vector 1 / Vector 2)"); 124 | 125 | // Compare performance: scalar vs. SIMD 126 | auto scalar_div = [&]() { 127 | float result[8]; 128 | for (int i = 0; i < 8; i++) { 129 | result[i] = data1[i] / data2[i]; 130 | } 131 | }; 132 | 133 | auto simd_div = [&]() { 134 | __m256 result = _mm256_div_ps(vector1, vector2); 135 | }; 136 | 137 | benchmark_comparison("Division", scalar_div, simd_div); 138 | std::cout << std::endl; 139 | 140 | // --------- 5. 
Fused Multiply-Add ------------- 141 | std::cout << "5. Fused Multiply-Add (_mm256_fmadd_ps)" << std::endl; 142 | std::cout << "---------------------------------------------------" << std::endl; 143 | std::cout << "Performs a fused multiply-add operation: a*b + c" << std::endl; 144 | std::cout << "This is more accurate and faster than separate multiply and add." << std::endl; 145 | std::cout << std::endl; 146 | 147 | // Create a third vector for FMA 148 | __m256 vector3 = _mm256_set1_ps(2.0f); 149 | print_m256(vector3, "Vector 3"); 150 | 151 | // Perform FMA: vector1 * vector2 + vector3 152 | __m256 fma_result = _mm256_fmadd_ps(vector1, vector2, vector3); 153 | print_m256(fma_result, "FMA Result (Vector 1 * Vector 2 + Vector 3)"); 154 | 155 | // Compare performance: scalar vs. SIMD 156 | auto scalar_fma = [&]() { 157 | float result[8]; 158 | for (int i = 0; i < 8; i++) { 159 | result[i] = data1[i] * data2[i] + 2.0f; 160 | } 161 | }; 162 | 163 | auto simd_fma = [&]() { 164 | __m256 result = _mm256_fmadd_ps(vector1, vector2, vector3); 165 | }; 166 | 167 | benchmark_comparison("Fused Multiply-Add", scalar_fma, simd_fma); 168 | std::cout << std::endl; 169 | 170 | // --------- 6. Square Root ------------- 171 | std::cout << "6. Square Root (_mm256_sqrt_ps)" << std::endl; 172 | std::cout << "---------------------------------------------------" << std::endl; 173 | std::cout << "Computes the square root of each element in a vector." << std::endl; 174 | std::cout << std::endl; 175 | 176 | // Create a vector of positive values 177 | __m256 pos_vector = _mm256_set_ps(64.0f, 49.0f, 36.0f, 25.0f, 16.0f, 9.0f, 4.0f, 1.0f); 178 | print_m256(pos_vector, "Input Vector"); 179 | 180 | // Compute square root 181 | __m256 sqrt_result = _mm256_sqrt_ps(pos_vector); 182 | print_m256(sqrt_result, "Square Root Result"); 183 | 184 | // Compare performance: scalar vs. SIMD 185 | auto scalar_sqrt = [&]() { 186 | float result[8]; 187 | union { 188 | __m256 v; 189 | float a[8]; 190 | } u; 191 | u.v = pos_vector; 192 | for (int i = 0; i < 8; i++) { 193 | result[i] = std::sqrt(u.a[i]); 194 | } 195 | }; 196 | 197 | auto simd_sqrt = [&]() { 198 | __m256 result = _mm256_sqrt_ps(pos_vector); 199 | }; 200 | 201 | benchmark_comparison("Square Root", scalar_sqrt, simd_sqrt); 202 | std::cout << std::endl; 203 | 204 | // --------- 7. Min/Max Operations ------------- 205 | std::cout << "7. Min/Max Operations (_mm256_min_ps, _mm256_max_ps)" << std::endl; 206 | std::cout << "---------------------------------------------------" << std::endl; 207 | std::cout << "Computes the minimum or maximum of corresponding elements." << std::endl; 208 | std::cout << std::endl; 209 | 210 | // Print input vectors again 211 | print_m256(vector1, "Vector 1"); 212 | print_m256(vector2, "Vector 2"); 213 | 214 | // Compute min and max 215 | __m256 min_result = _mm256_min_ps(vector1, vector2); 216 | __m256 max_result = _mm256_max_ps(vector1, vector2); 217 | 218 | print_m256(min_result, "Minimum Result"); 219 | print_m256(max_result, "Maximum Result"); 220 | 221 | // Compare performance: scalar vs. SIMD for min 222 | auto scalar_min = [&]() { 223 | float result[8]; 224 | for (int i = 0; i < 8; i++) { 225 | result[i] = std::min(data1[i], data2[i]); 226 | } 227 | }; 228 | 229 | auto simd_min = [&]() { 230 | __m256 result = _mm256_min_ps(vector1, vector2); 231 | }; 232 | 233 | benchmark_comparison("Minimum", scalar_min, simd_min); 234 | std::cout << std::endl; 235 | 236 | // --------- 8. Horizontal Operations ------------- 237 | std::cout << "8. 
Horizontal Operations (_mm256_hadd_ps, _mm256_hsub_ps)" << std::endl; 238 | std::cout << "---------------------------------------------------" << std::endl; 239 | std::cout << "Performs horizontal addition or subtraction of adjacent elements." << std::endl; 240 | std::cout << std::endl; 241 | 242 | // Create test vectors 243 | __m256 hadd_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 244 | __m256 hadd_vec2 = _mm256_set_ps(16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f); 245 | 246 | print_m256(hadd_vec1, "Vector A"); 247 | print_m256(hadd_vec2, "Vector B"); 248 | 249 | // Perform horizontal addition 250 | // This adds adjacent pairs: (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) 251 | __m256 hadd_result = _mm256_hadd_ps(hadd_vec1, hadd_vec2); 252 | print_m256(hadd_result, "Horizontal Addition Result"); 253 | 254 | // Perform horizontal subtraction 255 | // This subtracts adjacent pairs: (a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7) 256 | __m256 hsub_result = _mm256_hsub_ps(hadd_vec1, hadd_vec2); 257 | print_m256(hsub_result, "Horizontal Subtraction Result"); 258 | 259 | // Note: Horizontal operations are typically slower than vertical operations 260 | // They are useful for specific algorithms like dot products and matrix operations 261 | 262 | return 0; 263 | } 264 | -------------------------------------------------------------------------------- /src/01_Basics/02_initializing_data/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 01_Basics/02_initializing_data - Different ways to initialize SIMD vectors 8 | * 9 | * This example demonstrates various methods to initialize SIMD vectors: 10 | * 1. _mm256_setzero_ps/pd/si256 - Initialize all elements to zero 11 | * 2. _mm256_set1_ps/pd/epi32/etc - Initialize all elements to the same value 12 | * 3. _mm256_set_ps/pd/epi32/etc - Initialize each element individually 13 | * 4. _mm256_setr_ps/pd/epi32/etc - Initialize each element in reverse order 14 | * 15 | * We'll also compare the performance of SIMD initialization vs. standard array initialization. 16 | */ 17 | 18 | // Constants 19 | // A lighter loop count keeps turnaround snappy while still magnifying the perf gap. 20 | constexpr int NUM_ITERATIONS = 100000; 21 | 22 | template 23 | void printArray(const T (&arr)[N], const std::string &description) { 24 | std::cout << description << ": "; 25 | for (size_t i = 0; i < N; ++i) { 26 | std::cout << arr[i] << ", "; 27 | } 28 | std::cout << std::endl; 29 | } 30 | 31 | void copyFromSIMD(float* dest, const __m256& src) { 32 | _mm256_storeu_ps(dest, src); 33 | } 34 | 35 | void copyFromSIMD(double* dest, const __m256d& src) { 36 | _mm256_storeu_pd(dest, src); 37 | } 38 | 39 | void copyFromSIMD(int* dest, const __m256i& src) { 40 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest), src); 41 | } 42 | 43 | void copyFromSIMD(short* dest, const __m256i& src) { 44 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest), src); 45 | } 46 | 47 | int main() { 48 | set_benchmark_suite("01_Basics/02_initializing_data"); 49 | 50 | std::cout << "=== SIMD Data Initialization Methods ===" << std::endl; 51 | std::cout << std::endl; 52 | 53 | // --------- 1. Zero Initialization (_mm256_setzero_*) ------------- 54 | std::cout << "1. 
Zero Initialization (_mm256_setzero_*)" << std::endl; 55 | std::cout << "---------------------------------------------------" << std::endl; 56 | std::cout << "Initializes all elements of a SIMD vector to zero." << std::endl; 57 | std::cout << std::endl; 58 | 59 | // Standard method for float array 60 | float std_float_array[8]; 61 | auto start = std::chrono::high_resolution_clock::now(); 62 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 63 | for (int lane = 0; lane < 8; ++lane) { 64 | std_float_array[lane] = 0.0f; 65 | } 66 | } 67 | auto stop = std::chrono::high_resolution_clock::now(); 68 | auto duration_std = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); 69 | 70 | // SIMD method for float vector 71 | __m256 simd_float_vec; 72 | start = std::chrono::high_resolution_clock::now(); 73 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 74 | simd_float_vec = _mm256_setzero_ps(); 75 | } 76 | stop = std::chrono::high_resolution_clock::now(); 77 | auto duration_simd = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); 78 | 79 | // Print results 80 | std::cout << "Float Zero Initialization:" << std::endl; 81 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 82 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 83 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 84 | << static_cast<double>(duration_std.count()) / duration_simd.count() << "x" << std::endl; 85 | 86 | // Print the SIMD vector 87 | print_m256(simd_float_vec, "Zero-initialized float vector"); 88 | 89 | // Also demonstrate zero initialization for integers and doubles 90 | __m256i simd_int_vec = _mm256_setzero_si256(); 91 | __m256d simd_double_vec = _mm256_setzero_pd(); 92 | 93 | print_m256i(simd_int_vec, "Zero-initialized integer vector"); 94 | print_m256d(simd_double_vec, "Zero-initialized double vector"); 95 | std::cout << std::endl; 96 | 97 | // --------- 2. Broadcast Initialization (_mm256_set1_*) ------------- 98 | std::cout << "2. Broadcast Initialization (_mm256_set1_*)" << std::endl; 99 | std::cout << "---------------------------------------------------" << std::endl; 100 | std::cout << "Initializes all elements of a SIMD vector to the same value."
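// In typical code the broadcast happens once, outside the hot loop, and the register is then
// reused for every block of data. A minimal sketch (hypothetical helper; assumes n is a
// multiple of 8):
//
//   void scale(float* data, int n, float factor) {
//       const __m256 f = _mm256_set1_ps(factor);   // broadcast the scalar once
//       for (int i = 0; i < n; i += 8) {
//           __m256 v = _mm256_loadu_ps(data + i);
//           _mm256_storeu_ps(data + i, _mm256_mul_ps(v, f));
//       }
//   }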
<< std::endl; 101 | std::cout << std::endl; 102 | 103 | // Standard method for double array 104 | double std_double_array[4]; 105 | start = std::chrono::high_resolution_clock::now(); 106 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 107 | for (int lane = 0; lane < 4; ++lane) { 108 | std_double_array[lane] = 10.0; 109 | } 110 | } 111 | stop = std::chrono::high_resolution_clock::now(); 112 | duration_std = std::chrono::duration_cast(stop - start); 113 | 114 | // SIMD method for double vector 115 | __m256d simd_double_vec2; 116 | start = std::chrono::high_resolution_clock::now(); 117 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 118 | simd_double_vec2 = _mm256_set1_pd(10.0); 119 | } 120 | stop = std::chrono::high_resolution_clock::now(); 121 | duration_simd = std::chrono::duration_cast(stop - start); 122 | 123 | // Print results 124 | std::cout << "Double Broadcast Initialization:" << std::endl; 125 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 126 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 127 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 128 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 129 | 130 | // Print the SIMD vector 131 | print_m256d(simd_double_vec2, "Broadcast-initialized double vector (10.0)"); 132 | 133 | // Also demonstrate broadcast initialization for floats and integers 134 | __m256 simd_float_vec2 = _mm256_set1_ps(42.0f); 135 | __m256i simd_int_vec2 = _mm256_set1_epi32(100); 136 | 137 | print_m256(simd_float_vec2, "Broadcast-initialized float vector (42.0)"); 138 | print_m256i(simd_int_vec2, "Broadcast-initialized integer vector (100)"); 139 | std::cout << std::endl; 140 | 141 | // --------- 3. Individual Element Initialization (_mm256_set_*) ------------- 142 | std::cout << "3. Individual Element Initialization (_mm256_set_*)" << std::endl; 143 | std::cout << "---------------------------------------------------" << std::endl; 144 | std::cout << "Initializes each element of a SIMD vector individually." << std::endl; 145 | std::cout << "Note: Elements are specified in reverse order (high to low)." 
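// The argument order only changes how the literal is written: the two calls below build the
// same register, and lane 0 always lands at the lowest address when the vector is stored.
// A small equivalence sketch (assuming a local float out[8]):
//
//   __m256 a = _mm256_set_ps (8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);  // high lane first
//   __m256 b = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);  // low lane first
//   float out[8];
//   _mm256_storeu_ps(out, a);   // out = {1, 2, 3, 4, 5, 6, 7, 8}; storing b gives the same bytes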
<< std::endl; 146 | std::cout << std::endl; 147 | 148 | // Standard method for int array 149 | int std_int_array[8]; 150 | start = std::chrono::high_resolution_clock::now(); 151 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 152 | for (int lane = 0; lane < 8; ++lane) { 153 | std_int_array[lane] = lane + 1; 154 | } 155 | } 156 | stop = std::chrono::high_resolution_clock::now(); 157 | duration_std = std::chrono::duration_cast(stop - start); 158 | 159 | // SIMD method for int vector 160 | __m256i simd_int_vec3; 161 | start = std::chrono::high_resolution_clock::now(); 162 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 163 | // Note: _mm256_set_epi32 takes arguments in reverse order (high to low) 164 | simd_int_vec3 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 165 | } 166 | stop = std::chrono::high_resolution_clock::now(); 167 | duration_simd = std::chrono::duration_cast(stop - start); 168 | 169 | // Print results 170 | std::cout << "Integer Individual Initialization:" << std::endl; 171 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 172 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 173 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 174 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 175 | 176 | // Print the SIMD vector 177 | print_m256i(simd_int_vec3, "Individually-initialized integer vector"); 178 | 179 | // Also demonstrate individual initialization for floats and doubles 180 | __m256 simd_float_vec3 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 181 | __m256d simd_double_vec3 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 182 | 183 | print_m256(simd_float_vec3, "Individually-initialized float vector"); 184 | print_m256d(simd_double_vec3, "Individually-initialized double vector"); 185 | std::cout << std::endl; 186 | 187 | // --------- 4. Reverse Order Initialization (_mm256_setr_*) ------------- 188 | std::cout << "4. Reverse Order Initialization (_mm256_setr_*)" << std::endl; 189 | std::cout << "---------------------------------------------------" << std::endl; 190 | std::cout << "Initializes each element of a SIMD vector individually in natural order." << std::endl; 191 | std::cout << "Note: Elements are specified in natural order (low to high)." 
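// When the sixteen values already sit in memory, spelling them out in _mm256_setr_epi16 is
// rarely necessary; a single load yields the same register. A brief sketch (assuming a
// populated short src[16]):
//
//   short src[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
//   __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
//
// The set/setr forms are most useful when the lanes are computed scalars or compile-time
// constants rather than data that is already contiguous.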
<< std::endl; 192 | std::cout << std::endl; 193 | 194 | // Standard method for short array 195 | short std_short_array[16]; 196 | start = std::chrono::high_resolution_clock::now(); 197 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 198 | for (int lane = 0; lane < 16; ++lane) { 199 | std_short_array[lane] = static_cast(lane + 1); 200 | } 201 | } 202 | stop = std::chrono::high_resolution_clock::now(); 203 | duration_std = std::chrono::duration_cast(stop - start); 204 | 205 | // SIMD method for short vector 206 | __m256i simd_short_vec; 207 | start = std::chrono::high_resolution_clock::now(); 208 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 209 | // Note: _mm256_setr_epi16 takes arguments in natural order (low to high) 210 | simd_short_vec = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 211 | } 212 | stop = std::chrono::high_resolution_clock::now(); 213 | duration_simd = std::chrono::duration_cast(stop - start); 214 | 215 | // Print results 216 | std::cout << "Short Reverse Order Initialization:" << std::endl; 217 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 218 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 219 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 220 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 221 | 222 | // Print the SIMD vector (first 8 elements) 223 | // Note: We need to extract the shorts from the __m256i 224 | short short_array[16]; 225 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(short_array), simd_short_vec); 226 | 227 | std::cout << "Reverse-initialized short vector: ["; 228 | for (int i = 0; i < 15; i++) { 229 | std::cout << short_array[i] << ", "; 230 | } 231 | std::cout << short_array[15] << "]" << std::endl; 232 | 233 | // Also demonstrate reverse initialization for floats 234 | __m256 simd_float_vec4 = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); 235 | print_m256(simd_float_vec4, "Reverse-initialized float vector"); 236 | 237 | return 0; 238 | } 239 | 240 | -------------------------------------------------------------------------------- /src/02_Computations/02_dot_product/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /** 8 | * 02_Computations/02_dot_product - Implementing vector dot products with SIMD 9 | * 10 | * This example demonstrates different ways to calculate dot products using SIMD: 11 | * 1. Scalar implementation (baseline) 12 | * 2. Basic SIMD implementation using separate vectors for x, y, z components 13 | * 3. SIMD implementation with Structure of Arrays (SoA) layout 14 | * 4. SIMD implementation with horizontal addition 15 | * 5. 
SIMD implementation for large arrays (batch processing) 16 | * 17 | * The dot product is a fundamental operation in many fields including: 18 | * - Computer graphics (lighting calculations, projections) 19 | * - Machine learning (neural networks, similarity measures) 20 | * - Physics simulations (force calculations) 21 | */ 22 | 23 | // 3D vector structure (Array of Structures layout) 24 | struct Vec3 { 25 | float x, y, z; 26 | 27 | Vec3(float x = 0.0f, float y = 0.0f, float z = 0.0f) : x(x), y(y), z(z) {} 28 | 29 | // Scalar dot product 30 | float dot(const Vec3& other) const { 31 | return x * other.x + y * other.y + z * other.z; 32 | } 33 | }; 34 | 35 | // Structure of Arrays layout for better SIMD performance 36 | struct Vec3Array { 37 | std::vector x; 38 | std::vector y; 39 | std::vector z; 40 | 41 | Vec3Array(size_t size) : x(size), y(size), z(size) {} 42 | 43 | void set(size_t index, float x_val, float y_val, float z_val) { 44 | x[index] = x_val; 45 | y[index] = y_val; 46 | z[index] = z_val; 47 | } 48 | 49 | void set(size_t index, const Vec3& vec) { 50 | x[index] = vec.x; 51 | y[index] = vec.y; 52 | z[index] = vec.z; 53 | } 54 | }; 55 | 56 | // Generate random 3D vectors 57 | std::vector generateRandomVectors(size_t count) { 58 | std::random_device rd; 59 | std::mt19937 gen(rd()); 60 | std::uniform_real_distribution dist(-1.0f, 1.0f); 61 | 62 | std::vector vectors; 63 | vectors.reserve(count); 64 | 65 | for (size_t i = 0; i < count; i++) { 66 | vectors.emplace_back(dist(gen), dist(gen), dist(gen)); 67 | } 68 | 69 | return vectors; 70 | } 71 | 72 | // Convert Array of Structures to Structure of Arrays 73 | Vec3Array convertToSoA(const std::vector& vectors) { 74 | Vec3Array result(vectors.size()); 75 | 76 | for (size_t i = 0; i < vectors.size(); i++) { 77 | result.set(i, vectors[i]); 78 | } 79 | 80 | return result; 81 | } 82 | 83 | // 1. Scalar dot product implementation 84 | float scalarDotProduct(const std::vector& vectors1, const std::vector& vectors2) { 85 | float sum = 0.0f; 86 | for (size_t i = 0; i < vectors1.size(); i++) { 87 | sum += vectors1[i].dot(vectors2[i]); 88 | } 89 | return sum; 90 | } 91 | 92 | // 2. Basic SIMD dot product implementation (for 8 vectors at a time) 93 | __m256 simdDotProduct8(const std::vector& vectors1, const std::vector& vectors2) { 94 | // Load x, y, z components into separate SIMD registers 95 | float x1[8], y1[8], z1[8]; 96 | float x2[8], y2[8], z2[8]; 97 | 98 | for (int i = 0; i < 8; i++) { 99 | x1[i] = vectors1[i].x; 100 | y1[i] = vectors1[i].y; 101 | z1[i] = vectors1[i].z; 102 | x2[i] = vectors2[i].x; 103 | y2[i] = vectors2[i].y; 104 | z2[i] = vectors2[i].z; 105 | } 106 | 107 | // Load data into SIMD registers 108 | __m256 vx1 = _mm256_loadu_ps(x1); 109 | __m256 vy1 = _mm256_loadu_ps(y1); 110 | __m256 vz1 = _mm256_loadu_ps(z1); 111 | __m256 vx2 = _mm256_loadu_ps(x2); 112 | __m256 vy2 = _mm256_loadu_ps(y2); 113 | __m256 vz2 = _mm256_loadu_ps(z2); 114 | 115 | // Compute dot products using FMA (Fused Multiply-Add) 116 | // (x1*x2 + y1*y2 + z1*z2) 117 | __m256 result = _mm256_mul_ps(vx1, vx2); // x1*x2 118 | result = _mm256_fmadd_ps(vy1, vy2, result); // x1*x2 + y1*y2 119 | result = _mm256_fmadd_ps(vz1, vz2, result); // x1*x2 + y1*y2 + z1*z2 120 | 121 | return result; 122 | } 123 | 124 | // 3. 
SIMD dot product with Structure of Arrays layout 125 | __m256 simdDotProductSoA8(const Vec3Array& vectors1, const Vec3Array& vectors2, size_t offset) { 126 | // Load data into SIMD registers directly from SoA structure 127 | __m256 vx1 = _mm256_loadu_ps(&vectors1.x[offset]); 128 | __m256 vy1 = _mm256_loadu_ps(&vectors1.y[offset]); 129 | __m256 vz1 = _mm256_loadu_ps(&vectors1.z[offset]); 130 | __m256 vx2 = _mm256_loadu_ps(&vectors2.x[offset]); 131 | __m256 vy2 = _mm256_loadu_ps(&vectors2.y[offset]); 132 | __m256 vz2 = _mm256_loadu_ps(&vectors2.z[offset]); 133 | 134 | // Compute dot products using FMA (Fused Multiply-Add) 135 | __m256 result = _mm256_mul_ps(vx1, vx2); 136 | result = _mm256_fmadd_ps(vy1, vy2, result); 137 | result = _mm256_fmadd_ps(vz1, vz2, result); 138 | 139 | return result; 140 | } 141 | 142 | // 4. SIMD dot product with horizontal addition (for a single dot product) 143 | float simdDotProductSingle(const Vec3& v1, const Vec3& v2) { 144 | // Load vector components into SIMD registers 145 | __m128 vec1 = _mm_setr_ps(v1.x, v1.y, v1.z, 0.0f); 146 | __m128 vec2 = _mm_setr_ps(v2.x, v2.y, v2.z, 0.0f); 147 | 148 | // Multiply components 149 | __m128 mul = _mm_mul_ps(vec1, vec2); 150 | 151 | // Horizontal addition to sum up components 152 | // First add pairs: (x+y, z+0, x+y, z+0) 153 | __m128 hadd1 = _mm_hadd_ps(mul, mul); 154 | // Then add pairs again: (x+y+z+0, x+y+z+0, x+y+z+0, x+y+z+0) 155 | __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); 156 | 157 | // Extract the result (first element) 158 | return _mm_cvtss_f32(hadd2); 159 | } 160 | 161 | // 5. SIMD dot product for large arrays 162 | float simdDotProductLarge(const Vec3Array& vectors1, const Vec3Array& vectors2) { 163 | size_t size = vectors1.x.size(); 164 | size_t blocks = size / 8; 165 | size_t remainder = size % 8; 166 | 167 | // Process 8 vectors at a time 168 | __m256 sum = _mm256_setzero_ps(); 169 | for (size_t i = 0; i < blocks; i++) { 170 | __m256 dot8 = simdDotProductSoA8(vectors1, vectors2, i * 8); 171 | sum = _mm256_add_ps(sum, dot8); 172 | } 173 | 174 | // Horizontal sum of the 8 dot products 175 | float result_array[8]; 176 | _mm256_storeu_ps(result_array, sum); 177 | float total = 0.0f; 178 | for (int i = 0; i < 8; i++) { 179 | total += result_array[i]; 180 | } 181 | 182 | // Process remaining vectors 183 | for (size_t i = blocks * 8; i < size; i++) { 184 | Vec3 v1(vectors1.x[i], vectors1.y[i], vectors1.z[i]); 185 | Vec3 v2(vectors2.x[i], vectors2.y[i], vectors2.z[i]); 186 | total += v1.dot(v2); 187 | } 188 | 189 | return total; 190 | } 191 | 192 | int main() { 193 | set_benchmark_suite("02_Computations/02_dot_product"); 194 | 195 | std::cout << "=== SIMD Dot Product Implementations ===" << std::endl; 196 | std::cout << std::endl; 197 | 198 | // Generate random test vectors 199 | const size_t NUM_VECTORS = 1024; 200 | std::vector vectors1 = generateRandomVectors(NUM_VECTORS); 201 | std::vector vectors2 = generateRandomVectors(NUM_VECTORS); 202 | 203 | // Convert to Structure of Arrays for more efficient SIMD processing 204 | Vec3Array soa_vectors1 = convertToSoA(vectors1); 205 | Vec3Array soa_vectors2 = convertToSoA(vectors2); 206 | 207 | // --------- 1. Basic Dot Product Comparison ------------- 208 | std::cout << "1. Basic Dot Product (8 vectors)" << std::endl; 209 | std::cout << "---------------------------------------------------" << std::endl; 210 | std::cout << "Comparing scalar vs. SIMD implementation for 8 vectors." 
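// The reduction at the end of simdDotProductLarge stores the partial sums and adds them with
// a scalar loop; the same fold can also be done entirely in registers. A hedged sketch of one
// common pattern (several equivalent shuffle sequences exist):
//
//   inline float hsum256(__m256 v) {
//       __m128 lo = _mm256_castps256_ps128(v);           // lanes 0..3
//       __m128 hi = _mm256_extractf128_ps(v, 1);         // lanes 4..7
//       __m128 s  = _mm_add_ps(lo, hi);                  // 4 partial sums
//       s = _mm_add_ps(s, _mm_movehl_ps(s, s));          // 2 partial sums
//       s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));   // total in lane 0
//       return _mm_cvtss_f32(s);
//   }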
<< std::endl; 211 | std::cout << std::endl; 212 | 213 | // Calculate dot products using scalar method 214 | float scalar_results[8]; 215 | for (int i = 0; i < 8; i++) { 216 | scalar_results[i] = vectors1[i].dot(vectors2[i]); 217 | } 218 | 219 | // Calculate dot products using SIMD 220 | __m256 simd_result = simdDotProduct8(vectors1, vectors2); 221 | float simd_results[8]; 222 | _mm256_storeu_ps(simd_results, simd_result); 223 | 224 | // Print and compare results 225 | std::cout << "Scalar results: ["; 226 | for (int i = 0; i < 7; i++) { 227 | std::cout << scalar_results[i] << ", "; 228 | } 229 | std::cout << scalar_results[7] << "]" << std::endl; 230 | 231 | std::cout << "SIMD results: ["; 232 | for (int i = 0; i < 7; i++) { 233 | std::cout << simd_results[i] << ", "; 234 | } 235 | std::cout << simd_results[7] << "]" << std::endl; 236 | std::cout << std::endl; 237 | 238 | // --------- 2. Performance Comparison ------------- 239 | std::cout << "2. Performance Comparison" << std::endl; 240 | std::cout << "---------------------------------------------------" << std::endl; 241 | std::cout << "Comparing performance of different dot product implementations." << std::endl; 242 | std::cout << std::endl; 243 | 244 | // Benchmark scalar implementation 245 | auto scalar_benchmark = [&]() { 246 | volatile float result = scalarDotProduct(vectors1, vectors2); 247 | }; 248 | 249 | // Benchmark SIMD implementation with AoS layout 250 | auto simd_aos_benchmark = [&]() { 251 | float total = 0.0f; 252 | for (size_t i = 0; i < NUM_VECTORS; i += 8) { 253 | size_t remaining = std::min(size_t(8), NUM_VECTORS - i); 254 | if (remaining < 8) break; // Skip incomplete blocks for simplicity 255 | 256 | std::vector block1(vectors1.begin() + i, vectors1.begin() + i + 8); 257 | std::vector block2(vectors2.begin() + i, vectors2.begin() + i + 8); 258 | 259 | __m256 result = simdDotProduct8(block1, block2); 260 | float results[8]; 261 | _mm256_storeu_ps(results, result); 262 | 263 | for (int j = 0; j < 8; j++) { 264 | total += results[j]; 265 | } 266 | } 267 | }; 268 | 269 | // Benchmark SIMD implementation with SoA layout 270 | auto simd_soa_benchmark = [&]() { 271 | volatile float result = simdDotProductLarge(soa_vectors1, soa_vectors2); 272 | }; 273 | 274 | // Run benchmarks 275 | benchmark_comparison("Dot Product (1024 vectors)", scalar_benchmark, simd_soa_benchmark, 50); 276 | std::cout << std::endl; 277 | 278 | // --------- 3. Structure of Arrays vs Array of Structures ------------- 279 | std::cout << "3. Structure of Arrays vs Array of Structures" << std::endl; 280 | std::cout << "---------------------------------------------------" << std::endl; 281 | std::cout << "Comparing AoS vs SoA memory layouts for SIMD processing." << std::endl; 282 | std::cout << std::endl; 283 | 284 | benchmark_comparison("AoS vs SoA", simd_aos_benchmark, simd_soa_benchmark, 50); 285 | std::cout << std::endl; 286 | 287 | // --------- 4. Single Vector Dot Product ------------- 288 | std::cout << "4. Single Vector Dot Product" << std::endl; 289 | std::cout << "---------------------------------------------------" << std::endl; 290 | std::cout << "Using SIMD for a single dot product with horizontal addition." 
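// SSE4.1 (already implied by this project's -mavx2 build flags) also provides a dedicated
// dot-product instruction. A hedged alternative sketch for the same 3-component case; the
// 0x71 immediate means "multiply lanes 0-2, write the sum to lane 0":
//
//   float dp3(const Vec3& a, const Vec3& b) {
//       __m128 va = _mm_setr_ps(a.x, a.y, a.z, 0.0f);
//       __m128 vb = _mm_setr_ps(b.x, b.y, b.z, 0.0f);
//       return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0x71));
//   }
//
// Like the hadd version, this is mainly a convenience: dpps is not particularly fast, and for
// batches of vectors the SoA layout used above remains the better-performing approach.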
<< std::endl; 291 | std::cout << std::endl; 292 | 293 | Vec3 v1(0.5f, -0.3f, 0.8f); 294 | Vec3 v2(0.2f, 0.7f, -0.4f); 295 | 296 | float scalar_dot = v1.dot(v2); 297 | float simd_dot = simdDotProductSingle(v1, v2); 298 | 299 | std::cout << "Vector 1: (" << v1.x << ", " << v1.y << ", " << v1.z << ")" << std::endl; 300 | std::cout << "Vector 2: (" << v2.x << ", " << v2.y << ", " << v2.z << ")" << std::endl; 301 | std::cout << "Scalar dot product: " << scalar_dot << std::endl; 302 | std::cout << "SIMD dot product: " << simd_dot << std::endl; 303 | std::cout << std::endl; 304 | 305 | // Benchmark single vector dot product 306 | auto scalar_single_benchmark = [&]() { 307 | for (int i = 0; i < 1000; i++) { 308 | volatile float result = v1.dot(v2); 309 | } 310 | }; 311 | 312 | auto simd_single_benchmark = [&]() { 313 | for (int i = 0; i < 1000; i++) { 314 | volatile float result = simdDotProductSingle(v1, v2); 315 | } 316 | }; 317 | 318 | // For tiny workloads the SIMD setup costs outweigh the computation, so a lower 319 | // speedup (or even a slowdown) is expected and useful to point out to learners. 320 | benchmark_comparison("Single Dot Product (1000 iterations)", scalar_single_benchmark, simd_single_benchmark, 10); 321 | 322 | return 0; 323 | } 324 | -------------------------------------------------------------------------------- /src/03_Examples/04_image_processing/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | /** 9 | * This example demonstrates using SIMD for basic image processing operations. 10 | * 11 | * We'll implement: 12 | * 1. Brightness adjustment 13 | * 2. Contrast enhancement 14 | * 3. Image blurring (simple box filter) 15 | * 4. Grayscale conversion 16 | * 17 | * For simplicity, we'll use a simulated image represented as a 1D array of pixels, 18 | * where each pixel has R, G, B components (3 bytes per pixel). 19 | */ 20 | 21 | // Simulated image dimensions (kept modest so benchmarks finish quickly) 22 | const int WIDTH = 512; 23 | const int HEIGHT = 384; 24 | const int CHANNELS = 3; // RGB 25 | const int IMAGE_SIZE = WIDTH * HEIGHT * CHANNELS; 26 | 27 | // Utility function to initialize a test image 28 | void initialize_test_image(uint8_t* image, int width, int height, int channels) { 29 | for (int y = 0; y < height; y++) { 30 | for (int x = 0; x < width; x++) { 31 | int idx = (y * width + x) * channels; 32 | 33 | // Create a gradient pattern 34 | image[idx + 0] = static_cast(x * 255 / width); // R 35 | image[idx + 1] = static_cast(y * 255 / height); // G 36 | image[idx + 2] = static_cast(128); // B 37 | } 38 | } 39 | } 40 | 41 | // Print a small section of the image for verification 42 | void print_image_section(const uint8_t* image, int width, int channels, 43 | int start_x, int start_y, int section_width, int section_height) { 44 | std::cout << "Image section (" << start_x << "," << start_y << ") to (" 45 | << start_x + section_width - 1 << "," << start_y + section_height - 1 << "):" << std::endl; 46 | 47 | for (int y = start_y; y < start_y + section_height; y++) { 48 | for (int x = start_x; x < start_x + section_width; x++) { 49 | int idx = (y * width + x) * channels; 50 | std::cout << "(" << static_cast(image[idx + 0]) << "," 51 | << static_cast(image[idx + 1]) << "," 52 | << static_cast(image[idx + 2]) << ") "; 53 | } 54 | std::cout << std::endl; 55 | } 56 | std::cout << std::endl; 57 | } 58 | 59 | // 1. 
Brightness adjustment - Scalar implementation 60 | void adjust_brightness_scalar(uint8_t* image, int size, int brightness) { 61 | for (int i = 0; i < size; i++) { 62 | int value = static_cast(image[i]) + brightness; 63 | image[i] = static_cast(std::min(255, std::max(0, value))); 64 | } 65 | } 66 | 67 | // 1. Brightness adjustment - SIMD implementation 68 | void adjust_brightness_simd(uint8_t* image, int size, int brightness) { 69 | // Create a vector with the brightness value 70 | __m256i brightness_vec = _mm256_set1_epi8(static_cast(brightness)); 71 | __m256i zero_vec = _mm256_setzero_si256(); 72 | __m256i max_vec = _mm256_set1_epi8(static_cast(255)); 73 | 74 | // Process 32 bytes at a time (32 pixels) 75 | int i = 0; 76 | for (; i <= size - 32; i += 32) { 77 | // Load 32 bytes 78 | __m256i pixels = _mm256_loadu_si256(reinterpret_cast(&image[i])); 79 | 80 | // Add brightness 81 | __m256i result = _mm256_adds_epu8(pixels, brightness_vec); 82 | 83 | // Store result 84 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(&image[i]), result); 85 | } 86 | 87 | // Handle remaining pixels 88 | for (; i < size; i++) { 89 | int value = static_cast(image[i]) + brightness; 90 | image[i] = static_cast(std::min(255, std::max(0, value))); 91 | } 92 | } 93 | 94 | // 2. Contrast enhancement - Scalar implementation 95 | void enhance_contrast_scalar(uint8_t* image, int size, float contrast) { 96 | // Apply contrast formula: (pixel - 128) * contrast + 128 97 | for (int i = 0; i < size; i++) { 98 | float value = (static_cast(image[i]) - 128.0f) * contrast + 128.0f; 99 | image[i] = static_cast(std::min(255.0f, std::max(0.0f, value))); 100 | } 101 | } 102 | 103 | // 2. Contrast enhancement - SIMD implementation 104 | void enhance_contrast_simd(uint8_t* image, int size, float contrast) { 105 | // We'll process 8 pixels at a time (converting to float for the calculation) 106 | __m256 contrast_vec = _mm256_set1_ps(contrast); 107 | __m256 offset_vec = _mm256_set1_ps(128.0f); 108 | __m256 min_vec = _mm256_setzero_ps(); 109 | __m256 max_vec = _mm256_set1_ps(255.0f); 110 | 111 | // Process 8 pixels at a time 112 | int i = 0; 113 | for (; i <= size - 8; i += 8) { 114 | // Load 8 bytes and convert to float 115 | __m128i pixels_epi8 = _mm_loadl_epi64(reinterpret_cast(&image[i])); 116 | __m256i pixels_epi32 = _mm256_cvtepu8_epi32(pixels_epi8); 117 | __m256 pixels_ps = _mm256_cvtepi32_ps(pixels_epi32); 118 | 119 | // Apply contrast formula: (pixel - 128) * contrast + 128 120 | __m256 centered = _mm256_sub_ps(pixels_ps, offset_vec); 121 | __m256 scaled = _mm256_mul_ps(centered, contrast_vec); 122 | __m256 result_ps = _mm256_add_ps(scaled, offset_vec); 123 | 124 | // Clamp to [0, 255] 125 | result_ps = _mm256_min_ps(_mm256_max_ps(result_ps, min_vec), max_vec); 126 | 127 | // Convert back to integers and store without requiring AVX-512 128 | __m256i result_epi32 = _mm256_cvtps_epi32(result_ps); 129 | __m128i result_low = _mm256_castsi256_si128(result_epi32); 130 | __m128i result_high = _mm256_extracti128_si256(result_epi32, 1); 131 | __m128i packed16 = _mm_packus_epi32(result_low, result_high); 132 | __m128i packed8 = _mm_packus_epi16(packed16, _mm_setzero_si128()); 133 | _mm_storel_epi64(reinterpret_cast<__m128i*>(&image[i]), packed8); 134 | } 135 | 136 | // Handle remaining pixels 137 | for (; i < size; i++) { 138 | float value = (static_cast(image[i]) - 128.0f) * contrast + 128.0f; 139 | image[i] = static_cast(std::min(255.0f, std::max(0.0f, value))); 140 | } 141 | } 142 | 143 | // 3. 
Grayscale conversion - Scalar implementation 144 | void convert_to_grayscale_scalar(const uint8_t* src, uint8_t* dst, int width, int height) { 145 | for (int y = 0; y < height; y++) { 146 | for (int x = 0; x < width; x++) { 147 | int src_idx = (y * width + x) * CHANNELS; 148 | int dst_idx = y * width + x; 149 | 150 | // Standard grayscale conversion weights 151 | uint8_t gray = static_cast( 152 | 0.299f * src[src_idx + 0] + // R 153 | 0.587f * src[src_idx + 1] + // G 154 | 0.114f * src[src_idx + 2] // B 155 | ); 156 | 157 | dst[dst_idx] = gray; 158 | } 159 | } 160 | } 161 | 162 | // 3. Grayscale conversion - SIMD implementation 163 | void convert_to_grayscale_simd(const uint8_t* src, uint8_t* dst, int width, int height) { 164 | // RGB to Grayscale conversion weights 165 | const float weight_r = 0.299f; 166 | const float weight_g = 0.587f; 167 | const float weight_b = 0.114f; 168 | 169 | const __m128 weight_r_vec = _mm_set1_ps(weight_r); 170 | const __m128 weight_g_vec = _mm_set1_ps(weight_g); 171 | const __m128 weight_b_vec = _mm_set1_ps(weight_b); 172 | const __m128i r_shuffle = _mm_setr_epi8(0, 3, 6, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 173 | const __m128i g_shuffle = _mm_setr_epi8(1, 4, 7, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 174 | const __m128i b_shuffle = _mm_setr_epi8(2, 5, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 175 | const __m128i zero_128 = _mm_setzero_si128(); 176 | alignas(16) uint8_t chunk[16]; 177 | 178 | const int row_stride = width * CHANNELS; 179 | for (int y = 0; y < height; y++) { 180 | const uint8_t* row_ptr = src + y * row_stride; 181 | uint8_t* dst_row = dst + y * width; 182 | int x = 0; 183 | for (; x <= width - 4; x += 4) { 184 | const uint8_t* pixel_ptr = row_ptr + x * CHANNELS; 185 | std::memcpy(chunk, pixel_ptr, 12); 186 | __m128i block = _mm_load_si128(reinterpret_cast(chunk)); 187 | 188 | __m128i r_bytes = _mm_shuffle_epi8(block, r_shuffle); 189 | __m128i g_bytes = _mm_shuffle_epi8(block, g_shuffle); 190 | __m128i b_bytes = _mm_shuffle_epi8(block, b_shuffle); 191 | 192 | __m128 r_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(r_bytes)); 193 | __m128 g_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(g_bytes)); 194 | __m128 b_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(b_bytes)); 195 | 196 | __m128 gray_ps = _mm_mul_ps(r_ps, weight_r_vec); 197 | gray_ps = _mm_add_ps(gray_ps, _mm_mul_ps(g_ps, weight_g_vec)); 198 | gray_ps = _mm_add_ps(gray_ps, _mm_mul_ps(b_ps, weight_b_vec)); 199 | 200 | __m128i gray_epi32 = _mm_cvtps_epi32(gray_ps); 201 | __m128i gray_epi16 = _mm_packus_epi32(gray_epi32, zero_128); 202 | __m128i gray_epi8 = _mm_packus_epi16(gray_epi16, zero_128); 203 | 204 | int packed = _mm_cvtsi128_si32(gray_epi8); 205 | std::memcpy(dst_row + x, &packed, sizeof(packed)); 206 | } 207 | for (; x < width; x++) { 208 | int src_idx = (y * width + x) * CHANNELS; 209 | float r = static_cast(src[src_idx + 0]); 210 | float g = static_cast(src[src_idx + 1]); 211 | float b = static_cast(src[src_idx + 2]); 212 | float gray = r * weight_r + g * weight_g + b * weight_b; 213 | dst_row[x] = static_cast(gray); 214 | } 215 | } 216 | } 217 | 218 | int main() { 219 | set_benchmark_suite("03_Examples/04_image_processing"); 220 | 221 | std::cout << "=== SIMD Image Processing Example ===" << std::endl; 222 | 223 | // Allocate memory for the test image 224 | uint8_t* original_image = new uint8_t[IMAGE_SIZE + 32]; 225 | uint8_t* processed_image = new uint8_t[IMAGE_SIZE + 32]; 226 | uint8_t* grayscale_image = new uint8_t[WIDTH * HEIGHT + 32]; 227 | 228 | // 
Initialize the test image 229 | initialize_test_image(original_image, WIDTH, HEIGHT, CHANNELS); 230 | 231 | // Print a small section of the original image 232 | std::cout << "Original Image:" << std::endl; 233 | print_image_section(original_image, WIDTH, CHANNELS, 0, 0, 3, 3); 234 | 235 | // 1. Brightness Adjustment 236 | std::cout << "1. Brightness Adjustment" << std::endl; 237 | 238 | // Copy original image to processed image 239 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 240 | 241 | // Benchmark brightness adjustment 242 | auto brightness_scalar = [&]() { 243 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 244 | adjust_brightness_scalar(processed_image, IMAGE_SIZE, 50); 245 | }; 246 | 247 | auto brightness_simd = [&]() { 248 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 249 | adjust_brightness_simd(processed_image, IMAGE_SIZE, 50); 250 | }; 251 | 252 | benchmark_comparison("Brightness Adjustment", brightness_scalar, brightness_simd, 10); 253 | 254 | // Print a small section of the brightness-adjusted image 255 | std::cout << "Brightness-adjusted Image:" << std::endl; 256 | print_image_section(processed_image, WIDTH, CHANNELS, 0, 0, 3, 3); 257 | 258 | // 2. Contrast Enhancement 259 | std::cout << "2. Contrast Enhancement" << std::endl; 260 | 261 | // Reset the processed image 262 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 263 | 264 | // Benchmark contrast enhancement 265 | auto contrast_scalar = [&]() { 266 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 267 | enhance_contrast_scalar(processed_image, IMAGE_SIZE, 1.5f); 268 | }; 269 | 270 | auto contrast_simd = [&]() { 271 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 272 | enhance_contrast_simd(processed_image, IMAGE_SIZE, 1.5f); 273 | }; 274 | 275 | benchmark_comparison("Contrast Enhancement", contrast_scalar, contrast_simd, 10); 276 | 277 | // Print a small section of the contrast-enhanced image 278 | std::cout << "Contrast-enhanced Image:" << std::endl; 279 | print_image_section(processed_image, WIDTH, CHANNELS, 0, 0, 3, 3); 280 | 281 | // 3. Grayscale Conversion 282 | std::cout << "3. 
Grayscale Conversion" << std::endl; 283 | 284 | // Benchmark grayscale conversion 285 | auto grayscale_scalar = [&]() { 286 | convert_to_grayscale_scalar(original_image, grayscale_image, WIDTH, HEIGHT); 287 | }; 288 | 289 | auto grayscale_simd = [&]() { 290 | convert_to_grayscale_simd(original_image, grayscale_image, WIDTH, HEIGHT); 291 | }; 292 | 293 | benchmark_comparison("Grayscale Conversion", grayscale_scalar, grayscale_simd, 10); 294 | 295 | // Print a small section of the grayscale image 296 | std::cout << "Grayscale Image (showing first few pixels):" << std::endl; 297 | for (int y = 0; y < 3; y++) { 298 | for (int x = 0; x < 3; x++) { 299 | std::cout << static_cast(grayscale_image[y * WIDTH + x]) << " "; 300 | } 301 | std::cout << std::endl; 302 | } 303 | std::cout << std::endl; 304 | 305 | // Clean up 306 | delete[] original_image; 307 | delete[] processed_image; 308 | delete[] grayscale_image; 309 | 310 | return 0; 311 | } 312 | -------------------------------------------------------------------------------- /scripts/plot_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Plot SIMD benchmark summaries, attention breakdown, and tiny GPT breakdown figures.""" 3 | 4 | import argparse 5 | import csv 6 | from pathlib import Path 7 | from typing import Dict, Iterable, List, Tuple 8 | import math 9 | 10 | import matplotlib 11 | 12 | matplotlib.use("Agg") 13 | import matplotlib.pyplot as plt 14 | 15 | Row = Dict[str, str] 16 | DEFAULT_ARTIFACT_DIR = Path(__file__).resolve().parent.parent / "artifacts" 17 | 18 | 19 | def _read_rows(csv_path: Path, required: Iterable[str]) -> List[Row]: 20 | if not csv_path.exists(): 21 | raise FileNotFoundError(f"CSV file not found: {csv_path}") 22 | with csv_path.open(newline="", encoding="utf-8") as fh: 23 | reader = csv.DictReader(fh) 24 | fieldnames = reader.fieldnames or [] 25 | missing = [col for col in required if col not in fieldnames] 26 | if missing: 27 | raise ValueError(f"CSV missing expected columns: {missing}") 28 | rows = list(reader) 29 | if not rows: 30 | raise ValueError(f"CSV appears empty: {csv_path}") 31 | return rows 32 | 33 | 34 | def _parse_float(value: str, *, context: str) -> float: 35 | try: 36 | return float(value) 37 | except ValueError as exc: 38 | raise ValueError(f"Unable to parse '{value}' as float ({context})") from exc 39 | 40 | 41 | # --- Benchmark overview ---------------------------------------------------- 42 | 43 | def plot_benchmarks(csv_path: Path, output_path: Path, dpi: int) -> None: 44 | rows = _read_rows(csv_path, required=("suite", "label", "speedup")) 45 | 46 | skip_suites = { 47 | "src/03_Examples/05_attention_block", 48 | "03_Examples/05_attention_block", 49 | "src/03_Examples/05_mha_block", 50 | "03_Examples/05_mha_block", 51 | "src/03_Examples/06_tiny_gpt", 52 | "03_Examples/06_tiny_gpt", 53 | } 54 | 55 | grouped: Dict[str, List[Tuple[str, float]]] = {} 56 | for row in rows: 57 | suite = row["suite"] 58 | if suite in skip_suites: 59 | continue 60 | grouped.setdefault(suite, []).append( 61 | ( 62 | row["label"], 63 | _parse_float(row["speedup"], context=f"suite={suite}, label={row['label']}") 64 | ) 65 | ) 66 | 67 | if not grouped: 68 | raise ValueError("No benchmark data to plot (all rows filtered?)") 69 | 70 | suites = sorted(grouped.keys()) 71 | values = [sp for suite in suites for _, sp in grouped[suite]] 72 | median = sorted(values)[len(values) // 2] 73 | mean = sum(values) / len(values) 74 | 75 | count = len(suites) 76 | 
cols = math.ceil(math.sqrt(count)) 77 | rows_count = math.ceil(count / cols) 78 | fig, axes = plt.subplots(rows_count, cols, figsize=(4 * cols, 3 * rows_count), squeeze=False) 79 | fig.suptitle(f"SIMD Microbenchmark Speedups (median {median:.2f}×, mean {mean:.2f}×)", fontsize=14) 80 | 81 | for ax in axes.flat[count:]: 82 | ax.axis("off") 83 | 84 | for idx, suite in enumerate(suites): 85 | ax = axes.flat[idx] 86 | labels = [label for label, _ in grouped[suite]] 87 | speedups = [value for _, value in grouped[suite]] 88 | colors = ["#d62728" if sp < 1.0 else "#1f77b4" for sp in speedups] 89 | ypos = list(range(len(labels))) 90 | ax.barh(ypos, speedups, color=colors) 91 | ax.axvline(1.0, color="#555555", linestyle="--", linewidth=1) 92 | ax.set_yticks(ypos) 93 | ax.set_yticklabels(labels, fontsize=8) 94 | ax.set_xlabel("Speedup (scalar / SIMD)") 95 | ax.set_title(suite, fontsize=10) 96 | ax.set_xlim(left=0) 97 | for y, sp in zip(ypos, speedups): 98 | ax.text(sp + 0.05, y, f"{sp:.2f}×", va="center", ha="left", fontsize=8) 99 | 100 | output_path.parent.mkdir(parents=True, exist_ok=True) 101 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 102 | fig.savefig(output_path, dpi=dpi) 103 | plt.close(fig) 104 | 105 | 106 | # --- Attention breakdown --------------------------------------------------- 107 | 108 | def plot_attention(csv_path: Path, output_path: Path, dpi: int) -> None: 109 | rows = _read_rows( 110 | csv_path, 111 | required=( 112 | "component", 113 | "scalar_total_us", 114 | "simd_total_us", 115 | "speedup", 116 | "time_saved_us", 117 | "contribution_pct", 118 | ), 119 | ) 120 | 121 | overall = next((row for row in rows if row["component"] == "overall"), None) 122 | if not overall: 123 | raise ValueError("attention_components.csv must contain an 'overall' row") 124 | 125 | components = [row for row in rows if row["component"] != "overall"] 126 | if not components: 127 | raise ValueError("attention_components.csv has no component rows") 128 | 129 | names = [row["component"] for row in components] 130 | speedups = [_parse_float(row["speedup"], context=row["component"]) for row in components] 131 | contributions = [ 132 | _parse_float(row["contribution_pct"], context=row["component"]) 133 | for row in components 134 | ] 135 | total_scalar = _parse_float(overall["scalar_total_us"], context="overall scalar_total_us") 136 | total_simd = _parse_float(overall["simd_total_us"], context="overall simd_total_us") 137 | 138 | fig, axes = plt.subplots(1, 3, figsize=(15, 4.5)) 139 | fig.suptitle("SIMD Attention Block Breakdown", fontsize=14) 140 | 141 | ax_speed, ax_total, ax_contrib = axes 142 | 143 | ax_speed.bar(names, speedups, color="#1f77b4") 144 | ax_speed.set_ylabel("Speedup (scalar / SIMD)") 145 | ax_speed.set_title("Component Speedups") 146 | ax_speed.tick_params(axis="x", rotation=45) 147 | for label in ax_speed.get_xticklabels(): 148 | label.set_horizontalalignment("right") 149 | for idx, val in enumerate(speedups): 150 | ax_speed.text(idx, val + 0.05, f"{val:.2f}×", ha="center", va="bottom", fontsize=8) 151 | 152 | ax_total.bar(["scalar", "simd"], [total_scalar, total_simd], color=["#d62728", "#2ca02c"]) 153 | ax_total.set_ylabel("Microseconds") 154 | ax_total.set_title("End-to-End Latency") 155 | ax_total.set_ylim(0, max(total_scalar, total_simd) * 1.15) 156 | ax_total.text(0, total_scalar + 10, f"{total_scalar:.0f} μs", ha="center", va="bottom", fontsize=9) 157 | ax_total.text(1, total_simd + 10, f"{total_simd:.0f} μs", ha="center", va="bottom", fontsize=9) 158 | 159 | ax_contrib.barh(names, 
contributions, color="#9467bd") 160 | ax_contrib.set_xlabel("% of Total Speedup") 161 | ax_contrib.set_title("Contribution Share") 162 | for y, val in enumerate(contributions): 163 | ax_contrib.text(val + 0.5, y, f"{val:.1f}%", va="center", fontsize=8) 164 | ax_contrib.set_xlim(0, max(contributions + [10]) * 1.2) 165 | 166 | output_path.parent.mkdir(parents=True, exist_ok=True) 167 | fig.tight_layout(rect=[0, 0, 1, 0.95]) 168 | fig.savefig(output_path, dpi=dpi) 169 | plt.close(fig) 170 | 171 | 172 | # --- Tiny GPT breakdown ---------------------------------------------------- 173 | 174 | def plot_tiny_gpt(csv_path: Path, output_path: Path, dpi: int) -> None: 175 | rows = _read_rows( 176 | csv_path, 177 | required=( 178 | "stage", 179 | "count", 180 | "scalar_total_us", 181 | "simd_total_us", 182 | "speedup", 183 | "time_saved_us", 184 | "contribution_pct", 185 | ), 186 | ) 187 | 188 | overall = next((row for row in rows if row["stage"] == "overall"), None) 189 | if not overall: 190 | raise ValueError("tiny_gpt_components.csv must contain an 'overall' row") 191 | 192 | components = [row for row in rows if row["stage"] != "overall"] 193 | if not components: 194 | raise ValueError("tiny_gpt_components.csv has no component rows") 195 | 196 | names = [row["stage"] for row in components] 197 | counts = [int(_parse_float(row["count"], context=row["stage"])) for row in components] 198 | display_names = [f"{name} (×{count})" if count > 1 else name for name, count in zip(names, counts)] 199 | scalar_vals = [_parse_float(row["scalar_total_us"], context=row["stage"]) for row in components] 200 | simd_vals = [_parse_float(row["simd_total_us"], context=row["stage"]) for row in components] 201 | speedups = [_parse_float(row["speedup"], context=row["stage"]) for row in components] 202 | saved = [_parse_float(row["time_saved_us"], context=row["stage"]) for row in components] 203 | contributions = [_parse_float(row["contribution_pct"], context=row["stage"]) for row in components] 204 | 205 | overall_speedup = _parse_float(overall["speedup"], context="overall speedup") 206 | overall_scalar = _parse_float(overall["scalar_total_us"], context="overall scalar_total_us") 207 | overall_simd = _parse_float(overall["simd_total_us"], context="overall simd_total_us") 208 | overall_count = int(_parse_float(overall["count"], context="overall count")) 209 | 210 | fig, axes = plt.subplots(2, 2, figsize=(14, 9)) 211 | fig.suptitle(f"Tiny GPT Decoder Block Breakdown (overall {overall_speedup:.2f}×)", fontsize=14) 212 | 213 | x_pos = list(range(len(names))) 214 | 215 | ax_speed = axes[0, 0] 216 | ax_speed.bar(x_pos, speedups, color=["#d62728" if sp < 1.0 else "#1f77b4" for sp in speedups]) 217 | ax_speed.set_ylabel("Speedup (scalar / SIMD)") 218 | ax_speed.set_title("Component Speedups") 219 | ax_speed.set_xticks(x_pos) 220 | ax_speed.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 221 | for idx, val in enumerate(speedups): 222 | ax_speed.text(x_pos[idx], val + 0.05, f"{val:.2f}×", ha="center", va="bottom", fontsize=7) 223 | 224 | ax_latency = axes[0, 1] 225 | width = 0.38 226 | ax_latency.bar([x - width / 2 for x in x_pos], scalar_vals, width=width, label="Scalar", color="#d62728") 227 | ax_latency.bar([x + width / 2 for x in x_pos], simd_vals, width=width, label="SIMD", color="#2ca02c") 228 | ax_latency.set_title("Latency by Stage") 229 | ax_latency.set_ylabel("Microseconds") 230 | ax_latency.set_xticks(x_pos) 231 | ax_latency.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 232 | 
ax_latency.legend(fontsize=8) 233 | 234 | ax_saved = axes[1, 0] 235 | ax_saved.bar(x_pos, saved, color="#ff7f0e") 236 | ax_saved.set_ylabel("Time Saved (μs)") 237 | ax_saved.set_title("Absolute Time Saved") 238 | ax_saved.set_xticks(x_pos) 239 | ax_saved.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 240 | 241 | ax_contrib = axes[1, 1] 242 | ax_contrib.barh(display_names, contributions, color="#9467bd") 243 | ax_contrib.set_xlabel("% of Total Speedup") 244 | ax_contrib.set_title("Contribution Share") 245 | for y, val in enumerate(contributions): 246 | ax_contrib.text(val + 0.5, y, f"{val:.1f}%", va="center", fontsize=8) 247 | ax_contrib.set_xlim(0, max(contributions + [10]) * 1.2) 248 | 249 | fig.text(0.02, 0.02, f"Overall scalar: {overall_scalar:.1f} μs\nOverall SIMD: {overall_simd:.1f} μs\nDecoder blocks: {overall_count}", fontsize=9) 250 | 251 | output_path.parent.mkdir(parents=True, exist_ok=True) 252 | fig.tight_layout(rect=[0, 0.05, 1, 0.95]) 253 | fig.savefig(output_path, dpi=dpi) 254 | plt.close(fig) 255 | 256 | 257 | 258 | 259 | # --- CLI ------------------------------------------------------------------- 260 | 261 | def parse_args() -> argparse.Namespace: 262 | parser = argparse.ArgumentParser(description=__doc__) 263 | parser.add_argument( 264 | "--benchmarks-csv", 265 | type=Path, 266 | default=DEFAULT_ARTIFACT_DIR / "benchmark_results.csv", 267 | help="Path to benchmark results CSV.", 268 | ) 269 | parser.add_argument( 270 | "--benchmarks-output", 271 | type=Path, 272 | default=DEFAULT_ARTIFACT_DIR / "benchmark_speedups.png", 273 | help="Output path for the benchmark overview plot.", 274 | ) 275 | parser.add_argument( 276 | "--attention-csv", 277 | type=Path, 278 | default=DEFAULT_ARTIFACT_DIR / "attention_components.csv", 279 | help="Path to attention components CSV.", 280 | ) 281 | parser.add_argument( 282 | "--attention-output", 283 | type=Path, 284 | default=DEFAULT_ARTIFACT_DIR / "attention_speedups.png", 285 | help="Output path for the attention breakdown plot.", 286 | ) 287 | parser.add_argument( 288 | "--tiny-gpt-csv", 289 | type=Path, 290 | default=DEFAULT_ARTIFACT_DIR / "tiny_gpt_components.csv", 291 | help="Path to tiny GPT component CSV.", 292 | ) 293 | parser.add_argument( 294 | "--tiny-gpt-output", 295 | type=Path, 296 | default=DEFAULT_ARTIFACT_DIR / "tiny_gpt_speedups.png", 297 | help="Output path for the tiny GPT breakdown plot.", 298 | ) 299 | parser.add_argument("--dpi", type=int, default=180, help="Figure DPI") 300 | parser.add_argument( 301 | "--skip-attention", 302 | action="store_true", 303 | help="Skip plotting the attention breakdown (benchmark overview still generated).", 304 | ) 305 | parser.add_argument( 306 | "--skip-tiny-gpt", 307 | action="store_true", 308 | help="Skip plotting the tiny GPT breakdown.", 309 | ) 310 | return parser.parse_args() 311 | 312 | 313 | def main() -> None: 314 | args = parse_args() 315 | plot_benchmarks(args.benchmarks_csv, args.benchmarks_output, args.dpi) 316 | if not args.skip_attention: 317 | plot_attention(args.attention_csv, args.attention_output, args.dpi) 318 | if not args.skip_tiny_gpt: 319 | plot_tiny_gpt(args.tiny_gpt_csv, args.tiny_gpt_output, args.dpi) 320 | 321 | 322 | if __name__ == "__main__": # pragma: no cover 323 | main() 324 | -------------------------------------------------------------------------------- /src/03_Examples/05_mha_block/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 
| #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace { 13 | 14 | constexpr int SEQ_LEN = 8; 15 | constexpr int EMBED_DIM = 64; 16 | constexpr int NUM_HEADS = 4; 17 | constexpr int HEAD_DIM = EMBED_DIM / NUM_HEADS; // 16 18 | constexpr int FF_DIM = 128; 19 | constexpr float EPS = 1e-5f; 20 | 21 | using MatMulFn = void(*)(const float*, const float*, float*, int, int, int); 22 | using RMSNormFn = void(*)(const float*, const float*, float*, int, float); 23 | using ActivationFn = void(*)(float*, int); 24 | 25 | float horizontal_sum(__m256 v) { 26 | __m128 low = _mm256_castps256_ps128(v); 27 | __m128 high = _mm256_extractf128_ps(v, 1); 28 | __m128 sum = _mm_add_ps(low, high); 29 | sum = _mm_hadd_ps(sum, sum); 30 | sum = _mm_hadd_ps(sum, sum); 31 | return _mm_cvtss_f32(sum); 32 | } 33 | 34 | void transpose_matrix(const float* src, float* dst, int rows, int cols) { 35 | for (int r = 0; r < rows; ++r) { 36 | for (int c = 0; c < cols; ++c) { 37 | dst[c * rows + r] = src[r * cols + c]; 38 | } 39 | } 40 | } 41 | 42 | void split_heads(const float* src, float* dst) { 43 | for (int s = 0; s < SEQ_LEN; ++s) { 44 | for (int h = 0; h < NUM_HEADS; ++h) { 45 | const float* from = src + s * EMBED_DIM + h * HEAD_DIM; 46 | float* to = dst + (h * SEQ_LEN + s) * HEAD_DIM; 47 | std::copy(from, from + HEAD_DIM, to); 48 | } 49 | } 50 | } 51 | 52 | void combine_heads(const float* src, float* dst) { 53 | for (int s = 0; s < SEQ_LEN; ++s) { 54 | for (int h = 0; h < NUM_HEADS; ++h) { 55 | const float* from = src + (h * SEQ_LEN + s) * HEAD_DIM; 56 | float* to = dst + s * EMBED_DIM + h * HEAD_DIM; 57 | std::copy(from, from + HEAD_DIM, to); 58 | } 59 | } 60 | } 61 | 62 | void softmax_inplace(float* row, int len) { 63 | float max_val = row[0]; 64 | for (int i = 1; i < len; ++i) max_val = std::max(max_val, row[i]); 65 | float sum = 0.0f; 66 | for (int i = 0; i < len; ++i) { 67 | row[i] = std::exp(row[i] - max_val); 68 | sum += row[i]; 69 | } 70 | float inv = 1.0f / sum; 71 | for (int i = 0; i < len; ++i) row[i] *= inv; 72 | } 73 | 74 | void rmsnorm_scalar(const float* input, const float* gamma, float* output, int length, float eps) { 75 | float sum_sq = 0.0f; 76 | for (int i = 0; i < length; ++i) { 77 | sum_sq += input[i] * input[i]; 78 | } 79 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 80 | for (int i = 0; i < length; ++i) { 81 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 82 | } 83 | } 84 | 85 | void rmsnorm_simd(const float* input, const float* gamma, float* output, int length, float eps) { 86 | __m256 acc = _mm256_setzero_ps(); 87 | int i = 0; 88 | for (; i <= length - 8; i += 8) { 89 | __m256 v = _mm256_loadu_ps(input + i); 90 | acc = _mm256_fmadd_ps(v, v, acc); 91 | } 92 | float sum_sq = horizontal_sum(acc); 93 | for (; i < length; ++i) sum_sq += input[i] * input[i]; 94 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 95 | __m256 scale_vec = _mm256_set1_ps(scale); 96 | for (i = 0; i <= length - 8; i += 8) { 97 | __m256 v = _mm256_loadu_ps(input + i); 98 | __m256 g = _mm256_loadu_ps(gamma + (i % EMBED_DIM)); 99 | __m256 result = _mm256_mul_ps(_mm256_mul_ps(v, g), scale_vec); 100 | _mm256_storeu_ps(output + i, result); 101 | } 102 | for (; i < length; ++i) { 103 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 104 | } 105 | } 106 | 107 | void relu_scalar(float* data, int length) { 108 | for (int i = 0; i < length; ++i) data[i] = std::max(0.0f, data[i]); 109 | } 110 | 111 | void relu_simd(float* data, int length) { 112 
| __m256 zero = _mm256_setzero_ps(); 113 | int i = 0; 114 | for (; i <= length - 8; i += 8) { 115 | __m256 v = _mm256_loadu_ps(data + i); 116 | _mm256_storeu_ps(data + i, _mm256_max_ps(zero, v)); 117 | } 118 | for (; i < length; ++i) data[i] = std::max(0.0f, data[i]); 119 | } 120 | 121 | void matmul_scalar(const float* A, const float* B_T, float* C, int M, int K, int N) { 122 | for (int i = 0; i < M; ++i) { 123 | for (int j = 0; j < N; ++j) { 124 | float sum = 0.0f; 125 | const float* a_ptr = A + i * K; 126 | const float* b_ptr = B_T + j * K; 127 | for (int k = 0; k < K; ++k) { 128 | sum += a_ptr[k] * b_ptr[k]; 129 | } 130 | C[i * N + j] = sum; 131 | } 132 | } 133 | } 134 | 135 | void matmul_simd(const float* A, const float* B_T, float* C, int M, int K, int N) { 136 | for (int i = 0; i < M; ++i) { 137 | for (int j = 0; j < N; ++j) { 138 | const float* a_ptr = A + i * K; 139 | const float* b_ptr = B_T + j * K; 140 | __m256 acc = _mm256_setzero_ps(); 141 | int k = 0; 142 | for (; k <= K - 8; k += 8) { 143 | __m256 a = _mm256_loadu_ps(a_ptr + k); 144 | __m256 b = _mm256_loadu_ps(b_ptr + k); 145 | acc = _mm256_fmadd_ps(a, b, acc); 146 | } 147 | float sum = horizontal_sum(acc); 148 | for (; k < K; ++k) sum += a_ptr[k] * b_ptr[k]; 149 | C[i * N + j] = sum; 150 | } 151 | } 152 | } 153 | 154 | struct ModelWeights { 155 | std::vector Wq_T, Wk_T, Wv_T, Wo_T, Wff1_T, Wff2_T; 156 | std::vector gamma1, gamma2; 157 | }; 158 | 159 | struct ModelInputs { 160 | std::vector tokens; 161 | }; 162 | 163 | double attention_scale() { 164 | return 1.0 / std::sqrt(static_cast(HEAD_DIM)); 165 | } 166 | 167 | void initialize(ModelWeights& weights, ModelInputs& inputs) { 168 | std::mt19937 rng(42); 169 | std::uniform_real_distribution dist(-0.5f, 0.5f); 170 | 171 | auto fill_and_transpose = [&](int rows, int cols, std::vector& storage) { 172 | std::vector tmp(rows * cols); 173 | for (float& v : tmp) v = dist(rng); 174 | storage.resize(cols * rows); 175 | transpose_matrix(tmp.data(), storage.data(), rows, cols); 176 | }; 177 | 178 | weights.Wq_T.reserve(EMBED_DIM * EMBED_DIM); 179 | weights.Wk_T.reserve(EMBED_DIM * EMBED_DIM); 180 | weights.Wv_T.reserve(EMBED_DIM * EMBED_DIM); 181 | weights.Wo_T.reserve(EMBED_DIM * EMBED_DIM); 182 | weights.Wff1_T.reserve(FF_DIM * EMBED_DIM); 183 | weights.Wff2_T.reserve(EMBED_DIM * FF_DIM); 184 | 185 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wq_T); 186 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wk_T); 187 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wv_T); 188 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wo_T); 189 | fill_and_transpose(EMBED_DIM, FF_DIM, weights.Wff1_T); 190 | fill_and_transpose(FF_DIM, EMBED_DIM, weights.Wff2_T); 191 | 192 | weights.gamma1.assign(EMBED_DIM, 1.0f); 193 | weights.gamma2.assign(EMBED_DIM, 1.0f); 194 | 195 | inputs.tokens.resize(SEQ_LEN * EMBED_DIM); 196 | for (float& v : inputs.tokens) v = dist(rng); 197 | } 198 | 199 | struct StageTimes { 200 | double rms1 = 0.0; 201 | double qkv = 0.0; 202 | double attn_scores = 0.0; 203 | double attn_context = 0.0; 204 | double attn_proj = 0.0; 205 | double rms2 = 0.0; 206 | double ff1 = 0.0; 207 | double activation = 0.0; 208 | double ff2 = 0.0; 209 | double total = 0.0; 210 | }; 211 | 212 | using Clock = std::chrono::high_resolution_clock; 213 | using Microseconds = std::chrono::microseconds; 214 | 215 | void run_block(const ModelWeights& weights, 216 | const ModelInputs& inputs, 217 | std::vector& output, 218 | MatMulFn matmul_fn, 219 | RMSNormFn rms_fn, 220 | ActivationFn 
activation_fn, 221 | StageTimes* times = nullptr) { 222 | const int token_dim = SEQ_LEN * EMBED_DIM; 223 | output.assign(token_dim, 0.0f); 224 | 225 | auto add_duration = [](StageTimes* st, double& field, 226 | const Clock::time_point& start, 227 | const Clock::time_point& end) { 228 | if (st) { 229 | field += std::chrono::duration_cast(end - start).count(); 230 | } 231 | }; 232 | 233 | Clock::time_point block_start; 234 | if (times) { 235 | block_start = Clock::now(); 236 | } 237 | 238 | std::vector norm1(token_dim); 239 | Clock::time_point t0 = Clock::now(); 240 | rms_fn(inputs.tokens.data(), weights.gamma1.data(), norm1.data(), token_dim, EPS); 241 | add_duration(times, times->rms1, t0, Clock::now()); 242 | 243 | std::vector Q(token_dim), K(token_dim), V(token_dim); 244 | t0 = Clock::now(); 245 | matmul_fn(norm1.data(), weights.Wq_T.data(), Q.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 246 | matmul_fn(norm1.data(), weights.Wk_T.data(), K.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 247 | matmul_fn(norm1.data(), weights.Wv_T.data(), V.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 248 | add_duration(times, times->qkv, t0, Clock::now()); 249 | 250 | std::vector Q_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 251 | std::vector K_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 252 | std::vector V_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 253 | split_heads(Q.data(), Q_heads.data()); 254 | split_heads(K.data(), K_heads.data()); 255 | split_heads(V.data(), V_heads.data()); 256 | 257 | std::vector context_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM, 0.0f); 258 | std::vector K_heads_T(NUM_HEADS * HEAD_DIM * SEQ_LEN); 259 | std::vector V_heads_T(NUM_HEADS * HEAD_DIM * SEQ_LEN); 260 | std::vector scores(SEQ_LEN * SEQ_LEN); 261 | const float scale = static_cast(attention_scale()); 262 | 263 | for (int h = 0; h < NUM_HEADS; ++h) { 264 | const float* q_head = Q_heads.data() + h * SEQ_LEN * HEAD_DIM; 265 | const float* k_head = K_heads.data() + h * SEQ_LEN * HEAD_DIM; 266 | const float* v_head = V_heads.data() + h * SEQ_LEN * HEAD_DIM; 267 | float* k_t = K_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 268 | float* v_t = V_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 269 | transpose_matrix(k_head, k_t, SEQ_LEN, HEAD_DIM); 270 | transpose_matrix(v_head, v_t, SEQ_LEN, HEAD_DIM); 271 | 272 | t0 = Clock::now(); 273 | matmul_fn(q_head, k_t, scores.data(), SEQ_LEN, HEAD_DIM, SEQ_LEN); 274 | add_duration(times, times->attn_scores, t0, Clock::now()); 275 | for (float& s : scores) s *= scale; 276 | for (int row = 0; row < SEQ_LEN; ++row) { 277 | softmax_inplace(scores.data() + row * SEQ_LEN, SEQ_LEN); 278 | } 279 | float* ctx = context_heads.data() + h * SEQ_LEN * HEAD_DIM; 280 | t0 = Clock::now(); 281 | matmul_fn(scores.data(), v_t, ctx, SEQ_LEN, SEQ_LEN, HEAD_DIM); 282 | add_duration(times, times->attn_context, t0, Clock::now()); 283 | } 284 | 285 | std::vector context(token_dim); 286 | combine_heads(context_heads.data(), context.data()); 287 | 288 | std::vector attn_proj(token_dim); 289 | t0 = Clock::now(); 290 | matmul_fn(context.data(), weights.Wo_T.data(), attn_proj.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 291 | add_duration(times, times->attn_proj, t0, Clock::now()); 292 | 293 | std::vector residual1(token_dim); 294 | for (int i = 0; i < token_dim; ++i) { 295 | residual1[i] = inputs.tokens[i] + attn_proj[i]; 296 | } 297 | 298 | std::vector norm2(token_dim); 299 | t0 = Clock::now(); 300 | rms_fn(residual1.data(), weights.gamma2.data(), norm2.data(), token_dim, EPS); 301 | add_duration(times, times->rms2, t0, Clock::now()); 302 | 303 | std::vector 
ff1(SEQ_LEN * FF_DIM); 304 | t0 = Clock::now(); 305 | matmul_fn(norm2.data(), weights.Wff1_T.data(), ff1.data(), SEQ_LEN, EMBED_DIM, FF_DIM); 306 | add_duration(times, times->ff1, t0, Clock::now()); 307 | 308 | t0 = Clock::now(); 309 | activation_fn(ff1.data(), static_cast(ff1.size())); 310 | add_duration(times, times->activation, t0, Clock::now()); 311 | 312 | std::vector ff2(token_dim); 313 | t0 = Clock::now(); 314 | matmul_fn(ff1.data(), weights.Wff2_T.data(), ff2.data(), SEQ_LEN, FF_DIM, EMBED_DIM); 315 | add_duration(times, times->ff2, t0, Clock::now()); 316 | 317 | for (int i = 0; i < token_dim; ++i) { 318 | output[i] = residual1[i] + ff2[i]; 319 | } 320 | 321 | if (times) { 322 | times->total += std::chrono::duration_cast(Clock::now() - block_start).count(); 323 | } 324 | } 325 | 326 | float max_abs_diff(const std::vector& a, const std::vector& b) { 327 | float diff = 0.0f; 328 | for (size_t i = 0; i < a.size(); ++i) { 329 | diff = std::max(diff, std::abs(a[i] - b[i])); 330 | } 331 | return diff; 332 | } 333 | 334 | } // namespace 335 | 336 | int main() { 337 | ModelWeights weights; 338 | ModelInputs inputs; 339 | initialize(weights, inputs); 340 | 341 | std::vector scalar_output, simd_output; 342 | 343 | StageTimes scalar_stage{}, simd_stage{}; 344 | auto scalar_block_times = [&]() { 345 | run_block(weights, inputs, scalar_output, matmul_scalar, rmsnorm_scalar, relu_scalar, &scalar_stage); 346 | }; 347 | auto simd_block_times = [&]() { 348 | run_block(weights, inputs, simd_output, matmul_simd, rmsnorm_simd, relu_simd, &simd_stage); 349 | }; 350 | 351 | constexpr int stage_iterations = 10; 352 | for (int i = 0; i < stage_iterations; ++i) { 353 | scalar_block_times(); 354 | } 355 | for (int i = 0; i < stage_iterations; ++i) { 356 | simd_block_times(); 357 | } 358 | 359 | auto normalize = [&](StageTimes& st) { 360 | st.rms1 /= stage_iterations; 361 | st.qkv /= stage_iterations; 362 | st.attn_scores /= stage_iterations; 363 | st.attn_context /= stage_iterations; 364 | st.attn_proj /= stage_iterations; 365 | st.rms2 /= stage_iterations; 366 | st.ff1 /= stage_iterations; 367 | st.activation /= stage_iterations; 368 | st.ff2 /= stage_iterations; 369 | st.total /= stage_iterations; 370 | }; 371 | normalize(scalar_stage); 372 | normalize(simd_stage); 373 | 374 | auto scalar_block = [&]() { 375 | run_block(weights, inputs, scalar_output, matmul_scalar, rmsnorm_scalar, relu_scalar); 376 | }; 377 | auto simd_block = [&]() { 378 | run_block(weights, inputs, simd_output, matmul_simd, rmsnorm_simd, relu_simd); 379 | }; 380 | 381 | scalar_block(); 382 | simd_block(); 383 | 384 | float diff = max_abs_diff(scalar_output, simd_output); 385 | std::cout << "Max |scalar - simd| difference: " << diff << "\n"; 386 | 387 | struct ComponentRow { 388 | std::string name; 389 | int count; 390 | double scalar_us; 391 | double simd_us; 392 | }; 393 | 394 | std::vector components = { 395 | {"rmsnorm", 2, scalar_stage.rms1 + scalar_stage.rms2, simd_stage.rms1 + simd_stage.rms2}, 396 | {"qkv_projections", 1, scalar_stage.qkv, simd_stage.qkv}, 397 | {"attention_scores", 1, scalar_stage.attn_scores, simd_stage.attn_scores}, 398 | {"context_projection", 1, scalar_stage.attn_context, simd_stage.attn_context}, 399 | {"output_projection", 1, scalar_stage.attn_proj, simd_stage.attn_proj}, 400 | {"ffn_expand", 1, scalar_stage.ff1, simd_stage.ff1}, 401 | {"activation", 1, scalar_stage.activation, simd_stage.activation}, 402 | {"ffn_contract", 1, scalar_stage.ff2, simd_stage.ff2} 403 | }; 404 | 405 | double sum_scalar = 0.0; 
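// The accumulation below folds the per-stage averages into attention_components.csv: each
// component's speedup is scalar_us / simd_us, and its contribution_pct is its share of the total
// time saved, (scalar_us - simd_us) / (total_scalar - total_simd) * 100. For example, a stage
// that drops from 120 us (scalar) to 40 us (SIMD) saves 80 us; if the whole block saves 400 us,
// that stage's contribution_pct is 80 / 400 * 100 = 20%. Stages that are not timed individually
// (transposes, head split/merge, softmax, residual adds) fall into the residual "others" bucket
// derived from the measured block total.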
406 | double sum_simd = 0.0; 407 | for (const auto& c : components) { 408 | sum_scalar += c.scalar_us; 409 | sum_simd += c.simd_us; 410 | } 411 | double others_scalar = std::max(0.0, scalar_stage.total - sum_scalar); 412 | double others_simd = std::max(0.0, simd_stage.total - sum_simd); 413 | components.push_back({"others", 1, others_scalar, others_simd}); 414 | 415 | double total_scalar = scalar_stage.total; 416 | double total_simd = simd_stage.total; 417 | double total_saved = total_scalar - total_simd; 418 | 419 | namespace fs = std::filesystem; 420 | fs::path out_path = fs::current_path().parent_path().parent_path().parent_path() / "artifacts" / "attention_components.csv"; 421 | fs::create_directories(out_path.parent_path()); 422 | std::ofstream file(out_path); 423 | if (file) { 424 | file << "component,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct\n"; 425 | for (const auto& c : components) { 426 | double saved = c.scalar_us - c.simd_us; 427 | double speedup = c.simd_us > 0.0 ? c.scalar_us / c.simd_us : 0.0; 428 | double pct = (total_saved > 0.0) ? (saved / total_saved * 100.0) : 0.0; 429 | file << c.name << ',' << c.count << ',' << c.scalar_us << ',' << c.simd_us << ',' 430 | << speedup << ',' << saved << ',' << pct << '\n'; 431 | } 432 | double overall_speedup = total_simd > 0.0 ? total_scalar / total_simd : 0.0; 433 | file << "overall,1," << total_scalar << ',' << total_simd << ',' 434 | << overall_speedup << ',' << total_saved << ',' << 100.0 << '\n'; 435 | } else { 436 | std::cerr << "Failed to write attention_components.csv" << std::endl; 437 | } 438 | 439 | set_benchmark_suite("03_Examples/05_mha_block"); 440 | benchmark_comparison("attention_block", scalar_block, simd_block, 50); 441 | 442 | std::cout << "First token (scalar vs SIMD):\n"; 443 | for (int d = 0; d < EMBED_DIM; ++d) { 444 | if (d && d % 8 == 0) std::cout << "\n"; 445 | std::cout << scalar_output[d] << " / " << simd_output[d] << " "; 446 | } 447 | std::cout << "\n"; 448 | 449 | return 0; 450 | } 451 | -------------------------------------------------------------------------------- /src/03_Examples/06_tiny_gpt/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace { 17 | 18 | constexpr int SEQ_LEN = 8; 19 | constexpr int EMBED_DIM = 64; 20 | constexpr int NUM_HEADS = 4; 21 | constexpr int HEAD_DIM = EMBED_DIM / NUM_HEADS; // 16 22 | constexpr int FF_DIM = 128; 23 | constexpr int VOCAB_SIZE = 64; 24 | constexpr int NUM_BLOCKS = 61; 25 | constexpr float EPS = 1e-5f; 26 | 27 | using Clock = std::chrono::high_resolution_clock; 28 | using Microseconds = std::chrono::microseconds; 29 | 30 | float horizontal_sum(__m256 v) { 31 | __m128 low = _mm256_castps256_ps128(v); 32 | __m128 high = _mm256_extractf128_ps(v, 1); 33 | __m128 sum = _mm_add_ps(low, high); 34 | sum = _mm_hadd_ps(sum, sum); 35 | sum = _mm_hadd_ps(sum, sum); 36 | return _mm_cvtss_f32(sum); 37 | } 38 | 39 | void transpose_matrix(const float* src, float* dst, int rows, int cols) { 40 | for (int r = 0; r < rows; ++r) { 41 | for (int c = 0; c < cols; ++c) { 42 | dst[c * rows + r] = src[r * cols + c]; 43 | } 44 | } 45 | } 46 | 47 | void split_heads(const float* src, float* dst) { 48 | for (int s = 0; s < SEQ_LEN; ++s) { 49 | for (int h = 0; h < NUM_HEADS; ++h) { 50 | 
const float* from = src + s * EMBED_DIM + h * HEAD_DIM; 51 | float* to = dst + (h * SEQ_LEN + s) * HEAD_DIM; 52 | std::copy(from, from + HEAD_DIM, to); 53 | } 54 | } 55 | } 56 | 57 | void combine_heads(const float* src, float* dst) { 58 | for (int s = 0; s < SEQ_LEN; ++s) { 59 | for (int h = 0; h < NUM_HEADS; ++h) { 60 | const float* from = src + (h * SEQ_LEN + s) * HEAD_DIM; 61 | float* to = dst + s * EMBED_DIM + h * HEAD_DIM; 62 | std::copy(from, from + HEAD_DIM, to); 63 | } 64 | } 65 | } 66 | 67 | void softmax_inplace(float* row, int len) { 68 | float max_val = row[0]; 69 | for (int i = 1; i < len; ++i) { 70 | max_val = std::max(max_val, row[i]); 71 | } 72 | float sum = 0.0f; 73 | for (int i = 0; i < len; ++i) { 74 | row[i] = std::exp(row[i] - max_val); 75 | sum += row[i]; 76 | } 77 | float inv = 1.0f / sum; 78 | for (int i = 0; i < len; ++i) { 79 | row[i] *= inv; 80 | } 81 | } 82 | 83 | void rmsnorm_scalar(const float* input, const float* gamma, float* output, int length, float eps) { 84 | float sum_sq = 0.0f; 85 | for (int i = 0; i < length; ++i) { 86 | sum_sq += input[i] * input[i]; 87 | } 88 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 89 | for (int i = 0; i < length; ++i) { 90 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 91 | } 92 | } 93 | 94 | void rmsnorm_simd(const float* input, const float* gamma, float* output, int length, float eps) { 95 | __m256 acc = _mm256_setzero_ps(); 96 | int i = 0; 97 | for (; i <= length - 8; i += 8) { 98 | __m256 v = _mm256_loadu_ps(input + i); 99 | acc = _mm256_fmadd_ps(v, v, acc); 100 | } 101 | float sum_sq = horizontal_sum(acc); 102 | for (; i < length; ++i) { 103 | sum_sq += input[i] * input[i]; 104 | } 105 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 106 | __m256 scale_vec = _mm256_set1_ps(scale); 107 | i = 0; 108 | for (; i <= length - 8; i += 8) { 109 | __m256 v = _mm256_loadu_ps(input + i); 110 | __m256 g = _mm256_loadu_ps(gamma + (i % EMBED_DIM)); 111 | __m256 result = _mm256_mul_ps(_mm256_mul_ps(v, g), scale_vec); 112 | _mm256_storeu_ps(output + i, result); 113 | } 114 | for (; i < length; ++i) { 115 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 116 | } 117 | } 118 | 119 | void relu_scalar(float* data, int length) { 120 | for (int i = 0; i < length; ++i) { 121 | data[i] = std::max(0.0f, data[i]); 122 | } 123 | } 124 | 125 | void relu_simd(float* data, int length) { 126 | __m256 zero = _mm256_setzero_ps(); 127 | int i = 0; 128 | for (; i <= length - 8; i += 8) { 129 | __m256 v = _mm256_loadu_ps(data + i); 130 | _mm256_storeu_ps(data + i, _mm256_max_ps(zero, v)); 131 | } 132 | for (; i < length; ++i) { 133 | data[i] = std::max(0.0f, data[i]); 134 | } 135 | } 136 | 137 | void residual_add_scalar(const float* a, const float* b, float* out, int length) { 138 | for (int i = 0; i < length; ++i) { 139 | out[i] = a[i] + b[i]; 140 | } 141 | } 142 | 143 | void residual_add_simd(const float* a, const float* b, float* out, int length) { 144 | int i = 0; 145 | for (; i <= length - 8; i += 8) { 146 | __m256 va = _mm256_loadu_ps(a + i); 147 | __m256 vb = _mm256_loadu_ps(b + i); 148 | _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb)); 149 | } 150 | for (; i < length; ++i) { 151 | out[i] = a[i] + b[i]; 152 | } 153 | } 154 | 155 | void matmul_scalar(const float* A, const float* B_T, float* C, int M, int K, int N) { 156 | for (int i = 0; i < M; ++i) { 157 | for (int j = 0; j < N; ++j) { 158 | float sum = 0.0f; 159 | const float* a_ptr = A + i * K; 160 | const float* b_ptr = B_T + j * K; 161 | for (int k = 0; k < K; 
++k) { 162 | sum += a_ptr[k] * b_ptr[k]; 163 | } 164 | C[i * N + j] = sum; 165 | } 166 | } 167 | } 168 | 169 | void matmul_simd(const float* A, const float* B_T, float* C, int M, int K, int N) { 170 | for (int i = 0; i < M; ++i) { 171 | for (int j = 0; j < N; ++j) { 172 | const float* a_ptr = A + i * K; 173 | const float* b_ptr = B_T + j * K; 174 | __m256 acc = _mm256_setzero_ps(); 175 | int k = 0; 176 | for (; k <= K - 8; k += 8) { 177 | __m256 a = _mm256_loadu_ps(a_ptr + k); 178 | __m256 b = _mm256_loadu_ps(b_ptr + k); 179 | acc = _mm256_fmadd_ps(a, b, acc); 180 | } 181 | float sum = horizontal_sum(acc); 182 | for (; k < K; ++k) { 183 | sum += a_ptr[k] * b_ptr[k]; 184 | } 185 | C[i * N + j] = sum; 186 | } 187 | } 188 | } 189 | 190 | struct FloatLinear { 191 | int out_dim = 0; 192 | int in_dim = 0; 193 | std::vector weights_T; 194 | }; 195 | 196 | struct QuantizedLinear { 197 | int out_dim = 0; 198 | int in_dim = 0; 199 | std::vector weights_T; 200 | std::vector scales; 201 | }; 202 | 203 | struct LinearPair { 204 | FloatLinear fp32; 205 | QuantizedLinear q8; 206 | }; 207 | 208 | struct BlockWeights { 209 | LinearPair wq; 210 | LinearPair wk; 211 | LinearPair wv; 212 | LinearPair wo; 213 | LinearPair wff1; 214 | LinearPair wff2; 215 | std::vector gamma1; 216 | std::vector gamma2; 217 | }; 218 | 219 | struct ModelWeights { 220 | std::vector embedding; 221 | std::vector blocks; 222 | LinearPair logits; 223 | }; 224 | 225 | float attention_scale() { 226 | return 1.0f / std::sqrt(static_cast(HEAD_DIM)); 227 | } 228 | 229 | void quantize_into(const FloatLinear& src, QuantizedLinear& dst) { 230 | dst.out_dim = src.out_dim; 231 | dst.in_dim = src.in_dim; 232 | dst.weights_T.resize(static_cast(dst.out_dim) * dst.in_dim); 233 | dst.scales.resize(dst.out_dim); 234 | for (int row = 0; row < dst.out_dim; ++row) { 235 | const float* src_row = src.weights_T.data() + row * dst.in_dim; 236 | float max_abs = 0.0f; 237 | for (int col = 0; col < dst.in_dim; ++col) { 238 | max_abs = std::max(max_abs, std::abs(src_row[col])); 239 | } 240 | float scale = max_abs > 0.0f ? (max_abs / 127.0f) : 1.0f; 241 | dst.scales[row] = scale; 242 | float inv_scale = scale > 0.0f ? 
(1.0f / scale) : 0.0f; 243 | int8_t* dst_row = dst.weights_T.data() + row * dst.in_dim; 244 | for (int col = 0; col < dst.in_dim; ++col) { 245 | float scaled = src_row[col] * inv_scale; 246 | int value = static_cast(std::round(scaled)); 247 | value = std::max(-127, std::min(127, value)); 248 | dst_row[col] = static_cast(value); 249 | } 250 | } 251 | } 252 | 253 | float dot_q8_simd(const int8_t* w_row, const float* x, int length, float scale) { 254 | __m256 acc = _mm256_setzero_ps(); 255 | __m256 scale_vec = _mm256_set1_ps(scale); 256 | int i = 0; 257 | for (; i <= length - 16; i += 16) { 258 | __m128i packed = _mm_loadu_si128(reinterpret_cast(w_row + i)); 259 | __m128i lo_bytes = packed; 260 | __m128i hi_bytes = _mm_srli_si128(packed, 8); 261 | __m128i lo_i16 = _mm_cvtepi8_epi16(lo_bytes); 262 | __m128i hi_i16 = _mm_cvtepi8_epi16(hi_bytes); 263 | __m256i lo_i32 = _mm256_cvtepi16_epi32(lo_i16); 264 | __m256i hi_i32 = _mm256_cvtepi16_epi32(hi_i16); 265 | __m256 w_lo = _mm256_mul_ps(_mm256_cvtepi32_ps(lo_i32), scale_vec); 266 | __m256 w_hi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi_i32), scale_vec); 267 | __m256 x_lo = _mm256_loadu_ps(x + i); 268 | __m256 x_hi = _mm256_loadu_ps(x + i + 8); 269 | acc = _mm256_fmadd_ps(w_lo, x_lo, acc); 270 | acc = _mm256_fmadd_ps(w_hi, x_hi, acc); 271 | } 272 | for (; i <= length - 8; i += 8) { 273 | __m128i packed8 = _mm_loadl_epi64(reinterpret_cast(w_row + i)); 274 | __m128i i16 = _mm_cvtepi8_epi16(packed8); 275 | __m256i i32 = _mm256_cvtepi16_epi32(i16); 276 | __m256 w_vec = _mm256_mul_ps(_mm256_cvtepi32_ps(i32), scale_vec); 277 | __m256 x_vec = _mm256_loadu_ps(x + i); 278 | acc = _mm256_fmadd_ps(w_vec, x_vec, acc); 279 | } 280 | float sum = horizontal_sum(acc); 281 | for (; i < length; ++i) { 282 | sum += static_cast(w_row[i]) * scale * x[i]; 283 | } 284 | return sum; 285 | } 286 | 287 | void matmul_q8_simd(const float* A, const QuantizedLinear& W_T, float* C, int M, int K, int N) { 288 | for (int i = 0; i < M; ++i) { 289 | const float* a_ptr = A + i * K; 290 | for (int j = 0; j < N; ++j) { 291 | const int8_t* w_row = W_T.weights_T.data() + j * K; 292 | float scale = W_T.scales[j]; 293 | C[i * N + j] = dot_q8_simd(w_row, a_ptr, K, scale); 294 | } 295 | } 296 | } 297 | 298 | struct StageTimes { 299 | double embed = 0.0; 300 | double rms1 = 0.0; 301 | double qkv = 0.0; 302 | double attn_scores = 0.0; 303 | double attn_softmax = 0.0; 304 | double attn_context = 0.0; 305 | double attn_proj = 0.0; 306 | double residual1 = 0.0; 307 | double rms2 = 0.0; 308 | double ffn_expand = 0.0; 309 | double activation = 0.0; 310 | double ffn_contract = 0.0; 311 | double residual2 = 0.0; 312 | double logits = 0.0; 313 | double sampling = 0.0; 314 | double total = 0.0; 315 | 316 | StageTimes& accumulate(const StageTimes& other) { 317 | embed += other.embed; 318 | rms1 += other.rms1; 319 | qkv += other.qkv; 320 | attn_scores += other.attn_scores; 321 | attn_softmax += other.attn_softmax; 322 | attn_context += other.attn_context; 323 | attn_proj += other.attn_proj; 324 | residual1 += other.residual1; 325 | rms2 += other.rms2; 326 | ffn_expand += other.ffn_expand; 327 | activation += other.activation; 328 | ffn_contract += other.ffn_contract; 329 | residual2 += other.residual2; 330 | logits += other.logits; 331 | sampling += other.sampling; 332 | total += other.total; 333 | return *this; 334 | } 335 | 336 | StageTimes& scale(double factor) { 337 | embed *= factor; 338 | rms1 *= factor; 339 | qkv *= factor; 340 | attn_scores *= factor; 341 | attn_softmax *= factor; 342 | attn_context 
*= factor; 343 | attn_proj *= factor; 344 | residual1 *= factor; 345 | rms2 *= factor; 346 | ffn_expand *= factor; 347 | activation *= factor; 348 | ffn_contract *= factor; 349 | residual2 *= factor; 350 | logits *= factor; 351 | sampling *= factor; 352 | total *= factor; 353 | return *this; 354 | } 355 | }; 356 | 357 | struct ScalarKernels { 358 | static void rmsnorm(const float* input, const std::vector& gamma, float* output, int length, float eps) { 359 | rmsnorm_scalar(input, gamma.data(), output, length, eps); 360 | } 361 | 362 | static void apply_linear(const LinearPair& weight, const float* input, float* output, int M, int K, int N) { 363 | matmul_scalar(input, weight.fp32.weights_T.data(), output, M, K, N); 364 | } 365 | 366 | static void add_residual(const float* a, const float* b, float* out, int length) { 367 | residual_add_scalar(a, b, out, length); 368 | } 369 | 370 | static void activation(float* data, int length) { 371 | relu_scalar(data, length); 372 | } 373 | 374 | static void matmul_float(const float* A, const float* B_T, float* C, int M, int K, int N) { 375 | matmul_scalar(A, B_T, C, M, K, N); 376 | } 377 | }; 378 | 379 | struct SimdKernels { 380 | static void rmsnorm(const float* input, const std::vector& gamma, float* output, int length, float eps) { 381 | rmsnorm_simd(input, gamma.data(), output, length, eps); 382 | } 383 | 384 | static void apply_linear(const LinearPair& weight, const float* input, float* output, int M, int K, int N) { 385 | matmul_q8_simd(input, weight.q8, output, M, K, N); 386 | } 387 | 388 | static void add_residual(const float* a, const float* b, float* out, int length) { 389 | residual_add_simd(a, b, out, length); 390 | } 391 | 392 | static void activation(float* data, int length) { 393 | relu_simd(data, length); 394 | } 395 | 396 | static void matmul_float(const float* A, const float* B_T, float* C, int M, int K, int N) { 397 | matmul_simd(A, B_T, C, M, K, N); 398 | } 399 | }; 400 | 401 | std::vector random_matrix_T(int rows, int cols, std::mt19937& rng, std::uniform_real_distribution& dist) { 402 | std::vector original(static_cast(rows) * cols); 403 | for (float& v : original) { 404 | v = dist(rng); 405 | } 406 | std::vector transposed(static_cast(rows) * cols); 407 | transpose_matrix(original.data(), transposed.data(), rows, cols); 408 | return transposed; 409 | } 410 | 411 | LinearPair make_linear_pair(int out_dim, int in_dim, std::mt19937& rng, std::uniform_real_distribution& dist) { 412 | LinearPair pair; 413 | pair.fp32.out_dim = out_dim; 414 | pair.fp32.in_dim = in_dim; 415 | pair.fp32.weights_T = random_matrix_T(out_dim, in_dim, rng, dist); 416 | pair.q8.out_dim = out_dim; 417 | pair.q8.in_dim = in_dim; 418 | quantize_into(pair.fp32, pair.q8); 419 | return pair; 420 | } 421 | 422 | void initialize(ModelWeights& weights) { 423 | std::mt19937 rng(1337); 424 | std::uniform_real_distribution dist(-0.8f, 0.8f); 425 | 426 | weights.embedding.resize(static_cast(VOCAB_SIZE) * EMBED_DIM); 427 | for (float& v : weights.embedding) { 428 | v = dist(rng); 429 | } 430 | 431 | weights.blocks.clear(); 432 | weights.blocks.reserve(NUM_BLOCKS); 433 | for (int i = 0; i < NUM_BLOCKS; ++i) { 434 | BlockWeights block; 435 | block.wq = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 436 | block.wk = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 437 | block.wv = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 438 | block.wo = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 439 | block.wff1 = make_linear_pair(FF_DIM, EMBED_DIM, rng, dist); 440 | 
block.wff2 = make_linear_pair(EMBED_DIM, FF_DIM, rng, dist); 441 | block.gamma1.assign(EMBED_DIM, 1.0f); 442 | block.gamma2.assign(EMBED_DIM, 1.0f); 443 | weights.blocks.push_back(std::move(block)); 444 | } 445 | 446 | weights.logits = make_linear_pair(VOCAB_SIZE, EMBED_DIM, rng, dist); 447 | } 448 | 449 | inline double elapsed_us(const Clock::time_point& start, const Clock::time_point& end) { 450 | return static_cast(std::chrono::duration_cast(end - start).count()); 451 | } 452 | 453 | float max_abs_diff(const std::vector& a, const std::vector& b) { 454 | float diff = 0.0f; 455 | for (size_t i = 0; i < a.size(); ++i) { 456 | diff = std::max(diff, std::abs(a[i] - b[i])); 457 | } 458 | return diff; 459 | } 460 | 461 | template 462 | void decode_impl(const ModelWeights& weights, 463 | const std::vector& tokens, 464 | std::vector& block_output, 465 | std::vector& logits_out, 466 | int& next_token, 467 | StageTimes* times) { 468 | StageTimes local; 469 | 470 | Clock::time_point total_start; 471 | if (times) { 472 | total_start = Clock::now(); 473 | } 474 | 475 | constexpr int token_dim = SEQ_LEN * EMBED_DIM; 476 | constexpr int ff_dim = SEQ_LEN * FF_DIM; 477 | 478 | std::array hidden{}; 479 | std::array norm1{}; 480 | std::array Q{}; 481 | std::array K{}; 482 | std::array V{}; 483 | std::array Q_heads{}; 484 | std::array K_heads{}; 485 | std::array V_heads{}; 486 | std::array K_heads_T{}; 487 | std::array V_heads_T{}; 488 | std::array context_heads{}; 489 | std::array context{}; 490 | std::array attn_proj{}; 491 | std::array residual1{}; 492 | std::array norm2{}; 493 | std::array ff1{}; 494 | std::array ff2{}; 495 | std::array residual2{}; 496 | std::array scores{}; 497 | std::array logits{}; 498 | 499 | auto record = [&](double& field, const Clock::time_point& start_tp, const Clock::time_point& end_tp) { 500 | if (times) { 501 | field += elapsed_us(start_tp, end_tp); 502 | } 503 | }; 504 | 505 | auto t0 = Clock::now(); 506 | for (int t = 0; t < SEQ_LEN; ++t) { 507 | const float* src = weights.embedding.data() + tokens[t] * EMBED_DIM; 508 | std::copy(src, src + EMBED_DIM, hidden.data() + t * EMBED_DIM); 509 | } 510 | record(local.embed, t0, Clock::now()); 511 | 512 | const float scale = static_cast(attention_scale()); 513 | for (std::size_t block_idx = 0; block_idx < weights.blocks.size(); ++block_idx) { 514 | const BlockWeights& block = weights.blocks[block_idx]; 515 | 516 | t0 = Clock::now(); 517 | Kernels::rmsnorm(hidden.data(), block.gamma1, norm1.data(), token_dim, EPS); 518 | record(local.rms1, t0, Clock::now()); 519 | 520 | t0 = Clock::now(); 521 | Kernels::apply_linear(block.wq, norm1.data(), Q.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 522 | Kernels::apply_linear(block.wk, norm1.data(), K.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 523 | Kernels::apply_linear(block.wv, norm1.data(), V.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 524 | record(local.qkv, t0, Clock::now()); 525 | 526 | split_heads(Q.data(), Q_heads.data()); 527 | split_heads(K.data(), K_heads.data()); 528 | split_heads(V.data(), V_heads.data()); 529 | 530 | for (int h = 0; h < NUM_HEADS; ++h) { 531 | const float* q_head = Q_heads.data() + h * SEQ_LEN * HEAD_DIM; 532 | const float* k_head = K_heads.data() + h * SEQ_LEN * HEAD_DIM; 533 | const float* v_head = V_heads.data() + h * SEQ_LEN * HEAD_DIM; 534 | float* k_t = K_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 535 | float* v_t = V_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 536 | transpose_matrix(k_head, k_t, SEQ_LEN, HEAD_DIM); 537 | transpose_matrix(v_head, v_t, SEQ_LEN, HEAD_DIM); 538 
| 539 | t0 = Clock::now(); 540 | Kernels::matmul_float(q_head, k_t, scores.data(), SEQ_LEN, HEAD_DIM, SEQ_LEN); 541 | record(local.attn_scores, t0, Clock::now()); 542 | 543 | auto softmax_start = Clock::now(); 544 | for (float& s : scores) { 545 | s *= scale; 546 | } 547 | for (int row = 0; row < SEQ_LEN; ++row) { 548 | softmax_inplace(scores.data() + row * SEQ_LEN, SEQ_LEN); 549 | } 550 | record(local.attn_softmax, softmax_start, Clock::now()); 551 | 552 | float* ctx = context_heads.data() + h * SEQ_LEN * HEAD_DIM; 553 | t0 = Clock::now(); 554 | Kernels::matmul_float(scores.data(), v_t, ctx, SEQ_LEN, SEQ_LEN, HEAD_DIM); 555 | record(local.attn_context, t0, Clock::now()); 556 | } 557 | 558 | combine_heads(context_heads.data(), context.data()); 559 | 560 | t0 = Clock::now(); 561 | Kernels::apply_linear(block.wo, context.data(), attn_proj.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 562 | record(local.attn_proj, t0, Clock::now()); 563 | 564 | t0 = Clock::now(); 565 | Kernels::add_residual(hidden.data(), attn_proj.data(), residual1.data(), token_dim); 566 | record(local.residual1, t0, Clock::now()); 567 | 568 | t0 = Clock::now(); 569 | Kernels::rmsnorm(residual1.data(), block.gamma2, norm2.data(), token_dim, EPS); 570 | record(local.rms2, t0, Clock::now()); 571 | 572 | t0 = Clock::now(); 573 | Kernels::apply_linear(block.wff1, norm2.data(), ff1.data(), SEQ_LEN, EMBED_DIM, FF_DIM); 574 | record(local.ffn_expand, t0, Clock::now()); 575 | 576 | t0 = Clock::now(); 577 | Kernels::activation(ff1.data(), ff_dim); 578 | record(local.activation, t0, Clock::now()); 579 | 580 | t0 = Clock::now(); 581 | Kernels::apply_linear(block.wff2, ff1.data(), ff2.data(), SEQ_LEN, FF_DIM, EMBED_DIM); 582 | record(local.ffn_contract, t0, Clock::now()); 583 | 584 | t0 = Clock::now(); 585 | Kernels::add_residual(residual1.data(), ff2.data(), residual2.data(), token_dim); 586 | record(local.residual2, t0, Clock::now()); 587 | 588 | std::copy(residual2.begin(), residual2.end(), hidden.begin()); 589 | } 590 | 591 | block_output.assign(hidden.begin(), hidden.end()); 592 | 593 | const float* last_token = hidden.data() + (SEQ_LEN - 1) * EMBED_DIM; 594 | t0 = Clock::now(); 595 | Kernels::apply_linear(weights.logits, last_token, logits.data(), 1, EMBED_DIM, VOCAB_SIZE); 596 | record(local.logits, t0, Clock::now()); 597 | 598 | logits_out.assign(logits.begin(), logits.end()); 599 | 600 | std::vector probs(logits.begin(), logits.end()); 601 | auto samp_start = Clock::now(); 602 | softmax_inplace(probs.data(), VOCAB_SIZE); 603 | next_token = static_cast(std::distance(probs.begin(), std::max_element(probs.begin(), probs.end()))); 604 | record(local.sampling, samp_start, Clock::now()); 605 | 606 | if (times) { 607 | local.total += elapsed_us(total_start, Clock::now()); 608 | times->accumulate(local); 609 | } 610 | } 611 | 612 | 613 | } // namespace 614 | 615 | int main() { 616 | ModelWeights weights; 617 | initialize(weights); 618 | 619 | std::vector prompt = {3, 17, 12, 8, 5, 9, 2, 0}; 620 | 621 | std::vector scalar_hidden; 622 | std::vector scalar_logits; 623 | std::vector simd_hidden; 624 | std::vector simd_logits; 625 | int scalar_next = -1; 626 | int simd_next = -1; 627 | 628 | constexpr int warmup = 5; 629 | for (int i = 0; i < warmup; ++i) { 630 | decode_impl(weights, prompt, scalar_hidden, scalar_logits, scalar_next, nullptr); 631 | decode_impl(weights, prompt, simd_hidden, simd_logits, simd_next, nullptr); 632 | } 633 | 634 | StageTimes scalar_accum; 635 | StageTimes simd_accum; 636 | constexpr int iterations = 20; 637 | for 
(int i = 0; i < iterations; ++i) {
638 | decode_impl<ScalarKernels>(weights, prompt, scalar_hidden, scalar_logits, scalar_next, &scalar_accum);
639 | }
640 | for (int i = 0; i < iterations; ++i) {
641 | decode_impl<SimdKernels>(weights, prompt, simd_hidden, simd_logits, simd_next, &simd_accum);
642 | }
643 |
644 | scalar_accum.scale(1.0 / iterations);
645 | simd_accum.scale(1.0 / iterations);
646 |
647 | float hidden_diff = max_abs_diff(scalar_hidden, simd_hidden);
648 | float logits_diff = max_abs_diff(scalar_logits, simd_logits);
649 |
650 | std::cout << "Tiny GPT block (scalar vs SIMD quantized)\n";
651 | std::cout << "Decoder blocks: " << weights.blocks.size() << "\n";
652 | std::cout << "Prompt tokens: ";
653 | for (size_t i = 0; i < prompt.size(); ++i) {
654 | if (i) std::cout << ", ";
655 | std::cout << prompt[i];
656 | }
657 | std::cout << "\n";
658 | std::cout << "Scalar next token: " << scalar_next << "\n";
659 | std::cout << "SIMD next token: " << simd_next << "\n";
660 | std::cout << "Max hidden diff: " << hidden_diff << "\n";
661 | std::cout << "Max logits diff: " << logits_diff << "\n";
662 |
663 | struct ComponentRow {
664 | std::string name;
665 | int count;
666 | double StageTimes::*field;
667 | };
668 |
669 | const int block_count = static_cast<int>(weights.blocks.size());
670 | const int per_head = block_count * NUM_HEADS;
671 |
672 | std::vector<ComponentRow> components = {
673 | {"embedding", 1, &StageTimes::embed},
674 | {"rmsnorm_1", block_count, &StageTimes::rms1},
675 | {"qkv_linear", block_count, &StageTimes::qkv},
676 | {"attention_scores", per_head, &StageTimes::attn_scores},
677 | {"attention_softmax", per_head, &StageTimes::attn_softmax},
678 | {"attention_context", per_head, &StageTimes::attn_context},
679 | {"attention_projection", block_count, &StageTimes::attn_proj},
680 | {"residual_1", block_count, &StageTimes::residual1},
681 | {"rmsnorm_2", block_count, &StageTimes::rms2},
682 | {"ffn_expand", block_count, &StageTimes::ffn_expand},
683 | {"activation", block_count, &StageTimes::activation},
684 | {"ffn_contract", block_count, &StageTimes::ffn_contract},
685 | {"residual_2", block_count, &StageTimes::residual2},
686 | {"logits_projection", 1, &StageTimes::logits},
687 | {"sampling", 1, &StageTimes::sampling}
688 | };
689 |
690 | double total_scalar = scalar_accum.total;
691 | double total_simd = simd_accum.total;
692 | double total_saved = total_scalar - total_simd;
693 |
694 | namespace fs = std::filesystem;
695 | fs::path out_dir = fs::current_path().parent_path().parent_path().parent_path() / "artifacts";
696 | fs::create_directories(out_dir);
697 |
698 | fs::path csv_path = out_dir / "tiny_gpt_components.csv";
699 | std::ofstream csv(csv_path);
700 | if (csv) {
701 | csv << "stage,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct\n";
702 | for (const auto& comp : components) {
703 | double scalar_val = scalar_accum.*(comp.field);
704 | double simd_val = simd_accum.*(comp.field);
705 | double saved = scalar_val - simd_val;
706 | double speedup = simd_val > 0.0 ? scalar_val / simd_val : 0.0;
707 | double pct = (total_saved != 0.0) ? (saved / total_saved * 100.0) : 0.0;
708 | csv << comp.name << ',' << comp.count << ',' << scalar_val << ',' << simd_val << ','
709 | << speedup << ',' << saved << ',' << pct << '\n';
710 | }
711 | double overall_speedup = total_simd > 0.0 ? total_scalar / total_simd : 0.0;
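// The trailing "overall" row reuses the count column for the decoder block count and pins
// contribution_pct at 100%; scripts/plot_results.py (plot_tiny_gpt) looks this row up by the
// literal stage name "overall" for the figure title and the scalar/SIMD/block-count footer.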
712 | csv << "overall," << block_count << ',' << total_scalar << ',' << total_simd << ','
713 | << overall_speedup << ',' << total_saved << ',' << 100.0 << '\n';
714 | } else {
715 | std::cerr << "Failed to write " << csv_path << "\n";
716 | }
717 |
718 | set_benchmark_suite("03_Examples/06_tiny_gpt");
719 | auto scalar_decode = [&]() {
720 | std::vector<float> hidden;
721 | std::vector<float> logits;
722 | int next = -1;
723 | decode_impl<ScalarKernels>(weights, prompt, hidden, logits, next, nullptr);
724 | };
725 | auto simd_decode = [&]() {
726 | std::vector<float> hidden;
727 | std::vector<float> logits;
728 | int next = -1;
729 | decode_impl<SimdKernels>(weights, prompt, hidden, logits, next, nullptr);
730 | };
731 |
732 | benchmark_comparison("tiny_gpt_decode", scalar_decode, simd_decode, 50);
733 |
734 | std::cout << "Scalar average total: " << total_scalar << " us\n";
735 | std::cout << "SIMD average total: " << total_simd << " us\n";
736 | std::cout << "Overall speedup: " << (total_simd > 0.0 ? total_scalar / total_simd : 0.0) << "x\n";
737 |
738 | return 0;
739 | }
740 | --------------------------------------------------------------------------------
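For reference, the int8 path in 06_tiny_gpt uses a simple per-row quantization scheme: each weight row gets a single scale max|w| / 127, entries are rounded and clamped to [-127, 127], and dot_q8_simd dequantizes on the fly as q[i] * scale * x[i]. The standalone sketch below (a hypothetical check program, quant_check.cpp, not part of this repository; sizes and seeds are illustrative) reproduces that arithmetic in plain scalar code so the SIMD kernel's output can be compared against an easily audited reference.

// quant_check.cpp -- hypothetical standalone sketch (not part of this repo).
// Mirrors the per-row int8 scheme used by quantize_into() / dot_q8_simd():
//   scale = max|w| / 127, q = clamp(round(w / scale), -127, 127),
//   dot  ~= sum_i q[i] * scale * x[i].
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
    constexpr int K = 64;  // illustrative row length, matches EMBED_DIM above
    std::mt19937 rng(7);
    std::uniform_real_distribution<float> dist(-0.8f, 0.8f);

    std::vector<float> w(K), x(K);
    for (float& v : w) v = dist(rng);
    for (float& v : x) v = dist(rng);

    // Per-row quantization: one scale for the whole row.
    float max_abs = 0.0f;
    for (float v : w) max_abs = std::max(max_abs, std::abs(v));
    float scale = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;

    std::vector<int8_t> q(K);
    for (int i = 0; i < K; ++i) {
        int v = static_cast<int>(std::round(w[i] / scale));
        q[i] = static_cast<int8_t>(std::max(-127, std::min(127, v)));
    }

    // Reference fp32 dot product vs. dequantized int8 dot product.
    float dot_fp32 = 0.0f, dot_q8 = 0.0f;
    for (int i = 0; i < K; ++i) {
        dot_fp32 += w[i] * x[i];
        dot_q8 += static_cast<float>(q[i]) * scale * x[i];
    }

    std::cout << "fp32 dot: " << dot_fp32 << "\n"
              << "int8 dot: " << dot_q8 << "\n"
              << "abs err : " << std::abs(dot_fp32 - dot_q8) << "\n";
    return 0;
}

The absolute error printed at the end gives a quick feel for how much accuracy the per-row scheme gives up, which is the same quantity the example surfaces as "Max hidden diff" / "Max logits diff".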