├── .gitignore ├── artifacts ├── tiny_gpt_speedups.png ├── attention_speedups.png ├── benchmark_speedups.png ├── attention_components.csv ├── tiny_gpt_components.csv └── benchmark_results.csv ├── assets └── intel_isa_families.jpeg ├── src ├── 03_Examples │ ├── 03_data_types │ │ ├── Makefile │ │ └── main.cpp │ ├── 01_conditional_code │ │ ├── Makefile │ │ └── main.cpp │ ├── 02_quadratic_equations │ │ ├── Makefile │ │ └── main.cpp │ ├── 04_image_processing │ │ ├── Makefile │ │ └── main.cpp │ ├── 05_mha_block │ │ ├── Makefile │ │ └── main.cpp │ └── 06_tiny_gpt │ │ ├── Makefile │ │ └── main.cpp ├── 01_Basics │ ├── 01_importing_simd │ │ ├── Makefile │ │ └── main.cpp │ ├── 04_loading_data │ │ ├── Makefile │ │ └── main.cpp │ ├── 02_initializing_data │ │ ├── Makefile │ │ └── main.cpp │ └── 03_binding_with_unions │ │ ├── Makefile │ │ └── main.cpp ├── 02_Computations │ ├── 01_simple_maths │ │ ├── Makefile │ │ └── main.cpp │ └── 02_dot_product │ │ ├── Makefile │ │ └── main.cpp └── include │ └── simd_utils.h ├── LICENSE ├── README.md ├── runme.sh └── scripts └── plot_results.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode -------------------------------------------------------------------------------- /artifacts/tiny_gpt_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/tiny_gpt_speedups.png -------------------------------------------------------------------------------- /assets/intel_isa_families.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/assets/intel_isa_families.jpeg -------------------------------------------------------------------------------- /artifacts/attention_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/attention_speedups.png -------------------------------------------------------------------------------- /artifacts/benchmark_speedups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuninxia/hands-on-simd-programming/HEAD/artifacts/benchmark_speedups.png -------------------------------------------------------------------------------- /src/03_Examples/03_data_types/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) -------------------------------------------------------------------------------- /src/01_Basics/01_importing_simd/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | 
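Note: every example Makefile (this one and the ones that follow) builds with -mavx2, so the resulting simd_program executes AVX2 instructions unconditionally and will crash with SIGILL on a CPU without AVX2. The repository does not ship such a guard; a minimal, hypothetical pre-flight check using GCC/Clang's __builtin_cpu_supports could look like the sketch below.

#include <cstdio>
#include <cstdlib>

int main() {
    // GCC/Clang builtin: non-zero when the running CPU reports AVX2 support.
    if (!__builtin_cpu_supports("avx2")) {
        std::fprintf(stderr, "Built with -mavx2, but this CPU lacks AVX2; aborting.\n");
        return EXIT_FAILURE;
    }
    std::puts("AVX2 available: safe to run the SIMD examples.");
    return EXIT_SUCCESS;
}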
-------------------------------------------------------------------------------- /src/01_Basics/04_loading_data/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/01_Basics/02_initializing_data/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/01_Basics/03_binding_with_unions/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/01_conditional_code/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/02_Computations/01_simple_maths/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/02_Computations/02_dot_product/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/02_quadratic_equations/Makefile: 
-------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++11 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/04_image_processing/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -masm=att -std=c++11 $(EXTRA_FLAGS) 3 | TARGET=simd_program 4 | ASMFILE=main.s 5 | SRCFILE=main.cpp 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(SRCFILE) 10 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) 11 | 12 | asm: $(SRCFILE) 13 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 14 | 15 | clean: 16 | rm -f $(TARGET) $(ASMFILE) 17 | -------------------------------------------------------------------------------- /src/03_Examples/05_mha_block/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++17 3 | LDFLAGS=-lstdc++fs 4 | TARGET=simd_program 5 | ASMFILE=main.s 6 | SRCFILE=main.cpp 7 | 8 | all: $(TARGET) 9 | 10 | $(TARGET): $(SRCFILE) 11 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) $(LDFLAGS) 12 | 13 | asm: $(SRCFILE) 14 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 15 | 16 | clean: 17 | rm -f $(TARGET) $(ASMFILE) 18 | -------------------------------------------------------------------------------- /src/03_Examples/06_tiny_gpt/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-mavx2 -mfma -masm=att -std=c++17 3 | LDFLAGS=-lstdc++fs 4 | TARGET=simd_program 5 | ASMFILE=main.s 6 | SRCFILE=main.cpp 7 | 8 | all: $(TARGET) 9 | 10 | $(TARGET): $(SRCFILE) 11 | $(CXX) $(CXXFLAGS) $(SRCFILE) -o $(TARGET) $(LDFLAGS) 12 | 13 | asm: $(SRCFILE) 14 | $(CXX) $(CXXFLAGS) -S $(SRCFILE) -o $(ASMFILE) 15 | 16 | clean: 17 | rm -f $(TARGET) $(ASMFILE) 18 | -------------------------------------------------------------------------------- /artifacts/attention_components.csv: -------------------------------------------------------------------------------- 1 | component,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct 2 | rmsnorm,2,18.9,4,4.725,14.9,1.06634 3 | qkv_projections,1,733.2,233.7,3.13736,499.5,35.7475 4 | attention_scores,1,32.6,16.4,1.9878,16.2,1.15938 5 | context_projection,1,36.2,25.4,1.4252,10.8,0.772919 6 | output_projection,1,245.7,80.8,3.04084,164.9,11.8013 7 | ffn_expand,1,487.4,158.2,3.08091,329.2,23.5597 8 | activation,1,14.9,1.6,9.3125,13.3,0.951836 9 | ffn_contract,1,484.3,138.4,3.49928,345.9,24.7549 10 | others,1,111.8,109.2,1.02381,2.6,0.186073 11 | overall,1,2165,767.7,2.82011,1397.3,100 12 | -------------------------------------------------------------------------------- /artifacts/tiny_gpt_components.csv: -------------------------------------------------------------------------------- 1 | stage,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct 2 | embedding,1,0.1,0.1,1,0,0 3 | rmsnorm_1,61,557.25,122.3,4.55642,434.95,0.663332 4 | qkv_linear,61,44915.1,22530.9,1.99349,22384.2,34.1376 5 | attention_scores,244,1989.7,992.15,2.00544,997.55,1.52134 6 | attention_softmax,244,749.55,744.65,1.00658,4.9,0.00747288 7 | 
attention_context,244,2234.05,1495.8,1.49355,738.25,1.12589 8 | attention_projection,61,14978.7,7483.55,2.00154,7495.1,11.4306 9 | residual_1,61,249.3,61.3,4.06688,188,0.286715 10 | rmsnorm_2,61,557.45,122.7,4.54319,434.75,0.663027 11 | ffn_expand,61,29886.5,14959.3,1.99785,14927.1,22.7651 12 | activation,61,1128.6,63.95,17.6482,1064.65,1.62367 13 | ffn_contract,61,29661.8,12946.5,2.29111,16715.3,25.4921 14 | residual_2,61,247.8,61.6,4.02273,186.2,0.283969 15 | logits_projection,1,30,15.05,1.99336,14.95,0.0227999 16 | sampling,1,5,5,1,0,0 17 | overall,61,129142,63571.6,2.03144,65570.4,100 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Yuning Xia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /artifacts/benchmark_results.csv: -------------------------------------------------------------------------------- 1 | suite,label,iterations,scalar_us,simd_us,speedup 2 | "01_Basics/04_loading_data","Load Operations",10,4504,5414,0.831917 3 | "01_Basics/04_loading_data","Store Operations",10,6309,7212,0.874792 4 | "02_Computations/01_simple_maths","Addition",1000,64,18,3.55556 5 | "02_Computations/01_simple_maths","Subtraction",1000,64,10,6.4 6 | "02_Computations/01_simple_maths","Multiplication",1000,64,10,6.4 7 | "02_Computations/01_simple_maths","Division",1000,64,10,6.4 8 | "02_Computations/01_simple_maths","Fused Multiply-Add",1000,64,14,4.57143 9 | "02_Computations/01_simple_maths","Square Root",1000,127,9,14.1111 10 | "02_Computations/01_simple_maths","Minimum",1000,123,10,12.3 11 | "02_Computations/02_dot_product","Dot Product (1024 vectors)",50,1757,514,3.41829 12 | "02_Computations/02_dot_product","AoS vs SoA",50,12837,513,25.0234 13 | "02_Computations/02_dot_product","Single Dot Product (1000 iterations)",10,134,400,0.335 14 | "03_Examples/01_conditional_code","Clamping",200,39,4,9.75 15 | "03_Examples/01_conditional_code","Filtering",200,18,3,6 16 | "03_Examples/01_conditional_code","Complex Filtering",200,22,5,4.4 17 | "03_Examples/02_quadratic_equations","Quadratic Equation Solver",50,12,3,4 18 | "03_Examples/04_image_processing","Brightness Adjustment",10,129963,3211,40.4743 19 | "03_Examples/04_image_processing","Contrast Enhancement",10,159323,41794,3.8121 20 | "03_Examples/04_image_processing","Grayscale Conversion",10,43116,31358,1.37496 21 | "03_Examples/05_mha_block","attention_block",50,107658,37903,2.84036 22 | "03_Examples/06_tiny_gpt","tiny_gpt_decode",50,6441320,3174565,2.02904 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Intel Logo](https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/Intel_logo_%282006-2020%29.svg/200px-Intel_logo_%282006-2020%29.svg.png) 2 | 3 | # Hands-on SIMD Programming with C++ 4 | 5 | From “what is SIMD?” to “how do I speed up transformer layers?”—this repository walks through reproducible AVX2 microbenchmarks, tuning tricks, and a quantised decoder block. 6 | 7 | ![Intel ISA Families and Features](./assets/intel_isa_families.jpeg) 8 | ![SIMD Speedups](artifacts/benchmark_speedups.png) 9 | ![Attention Breakdown](artifacts/attention_speedups.png) 10 | ![Tiny GPT Breakdown](artifacts/tiny_gpt_speedups.png) 11 | 12 | ## Quick Start 13 | 14 | ```bash 15 | ./runme.sh 16 | # optional: tweak CSVs then regenerate the figures 17 | python scripts/plot_results.py 18 | ``` 19 | 20 | `runme.sh` rebuilds every sample, refreshes `artifacts/*.csv`, and redraws all figures (kept under [`artifacts/`](artifacts) so the root stays clean). 
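Each sample follows the same pattern: a scalar kernel and an AVX2 kernel are handed to `benchmark_comparison()` from [`src/include/simd_utils.h`](src/include/simd_utils.h), which warms up, times both, prints the speedup, and appends a row to the benchmark CSV. A minimal sketch of that pattern (illustrative only — the suite name and values below are not one of the shipped examples):

```cpp
#include "../../include/simd_utils.h"  // float8 union, aligned_alloc, benchmark helpers

int main() {
    set_benchmark_suite("99_Sketches/readme_example");  // hypothetical suite label

    alignas(32) float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    alignas(32) float y[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    alignas(32) float out[8];

    auto scalar_add = [&]() {                       // one float per loop iteration
        for (int i = 0; i < 8; ++i) out[i] = x[i] + y[i];
    };
    auto simd_add = [&]() {                         // eight floats per instruction
        __m256 a = _mm256_load_ps(x);
        __m256 b = _mm256_load_ps(y);
        _mm256_store_ps(out, _mm256_add_ps(a, b));
    };

    // Times both kernels and appends a CSV row when SIMD_BENCHMARK_CSV is set
    // (runme.sh exports it automatically).
    benchmark_comparison("Vector Addition (sketch)", scalar_add, simd_add, 1000);
    return 0;
}
```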
21 | 22 | ## Highlights by Module 23 | 24 | | Module | Highlights | Use Cases / Benchmarks | 25 | | --- | --- | --- | 26 | | **01_Basics** | Loads, alignment, data initialisation, intrinsics setup | `01_importing_simd`, `04_loading_data` | 27 | | **02_Computations** | Vector arithmetic, FMA, AoS→SoA dot products | `01_simple_maths`, `02_dot_product` | 28 | | **03_Examples** | Conditional masks, quadratic solver, image ops, quantised attention, 61-block decoder | `01_conditional_code`, `04_image_processing`, `05_mha_block`, `06_tiny_gpt` | 29 | 30 | Every example ships with scalar **vs.** SIMD implementations and an embedded benchmark so you can quantify the payoff. 31 | 32 | ## Reading the Figures 33 | 34 | 1. **SIMD Speedups** – six canonical kernels showing alignment, arithmetic, SoA wins, mask-driven control flow, equation solving, and image transforms (speedups from 0.8× to 40×). 35 | 2. **Attention Breakdown** – RMSNorm + MHA + FFN block with component speedups, end-to-end latency, and contribution share (≈2.8× faster overall). 36 | 3. **Tiny GPT Breakdown** – 61-block decoder with int8 weight stores and SIMD dequantisation; the 2× end-to-end gain is unpacked by stage, absolute savings, and contribution percentages. 37 | 38 | ## Key Takeaways 39 | 40 | - Memory layout matters: we transpose matrices and lean on SoA buffers so AVX2 loads stay contiguous. 41 | - Quantised linear layers use per-channel scales plus `_mm256_cvtepi16_epi32` / `_mm256_fmadd_ps` to recover float outputs without leaving vector code. 42 | - Accuracy is always checked—SIMD activations are compared against scalar references, and quantised logits agree on the predicted token. 43 | - Automation keeps results fresh: rerunning `runme.sh` recompiles, re-benchmarks, and redraws the conference-style plots. 44 | - The tiny GPT demo stacks 61 decoder blocks, so the CSV/plot counts capture how repeated kernels dominate end-to-end latency. 45 | 46 | ## License 47 | 48 | MIT 49 | -------------------------------------------------------------------------------- /runme.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -uo pipefail 3 | 4 | ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 5 | ARTIFACT_DIR="${ROOT_DIR}/artifacts" 6 | mkdir -p "${ARTIFACT_DIR}" 7 | 8 | if [[ -z "${SIMD_BENCHMARK_CSV:-}" ]]; then 9 | SIMD_BENCHMARK_CSV="${ARTIFACT_DIR}/benchmark_results.csv" 10 | else 11 | csv_path="${SIMD_BENCHMARK_CSV}" 12 | if [[ "${csv_path}" != /* ]]; then 13 | csv_path="${ROOT_DIR}/${csv_path}" 14 | fi 15 | SIMD_BENCHMARK_CSV="${csv_path}" 16 | fi 17 | rm -f "${SIMD_BENCHMARK_CSV}" 18 | export SIMD_BENCHMARK_CSV 19 | 20 | mapfile -t examples < <(cd "$ROOT_DIR" && find src -type f -name 'main.cpp' -printf '%h\n' | sort) 21 | 22 | if (( ${#examples[@]} == 0 )); then 23 | echo "No example directories with main.cpp found." >&2 24 | exit 1 25 | fi 26 | 27 | failures=() 28 | 29 | for example in "${examples[@]}"; do 30 | echo 31 | echo "=== Building and running ${example} ===" 32 | pushd "$ROOT_DIR/$example" > /dev/null 33 | 34 | if [ ! -f Makefile ]; then 35 | echo "Skipping ${example}: Makefile not found." >&2 36 | failures+=("${example}: missing Makefile") 37 | popd > /dev/null 38 | continue 39 | fi 40 | 41 | if ! make clean >/dev/null 2>&1; then 42 | echo "make clean failed for ${example}" >&2 43 | failures+=("${example}: make clean failed") 44 | popd > /dev/null 45 | continue 46 | fi 47 | 48 | if ! 
make; then 49 | echo "make failed for ${example}" >&2 50 | failures+=("${example}: make failed") 51 | popd > /dev/null 52 | continue 53 | fi 54 | 55 | if [ ! -x ./simd_program ]; then 56 | echo "Executable simd_program not produced in ${example}" >&2 57 | failures+=("${example}: missing simd_program") 58 | popd > /dev/null 59 | continue 60 | fi 61 | 62 | if ! ./simd_program; then 63 | echo "Execution failed for ${example}" >&2 64 | failures+=("${example}: execution failed") 65 | popd > /dev/null 66 | continue 67 | fi 68 | 69 | popd > /dev/null 70 | echo "--- Completed ${example} ---" 71 | 72 | if [[ -n "${KEEP_BUILD_ARTIFACTS:-}" ]]; then 73 | continue 74 | fi 75 | 76 | pushd "$ROOT_DIR/$example" > /dev/null 77 | make clean >/dev/null 2>&1 || true 78 | popd > /dev/null 79 | echo "Cleaned ${example} artifacts." 80 | 81 | done 82 | 83 | if (( ${#failures[@]} )); then 84 | echo 85 | echo "Failures detected:" >&2 86 | for entry in "${failures[@]}"; do 87 | echo " - ${entry}" >&2 88 | done 89 | exit 1 90 | fi 91 | 92 | echo 93 | echo "All SIMD examples built and ran successfully." 94 | if [[ -n "${SIMD_BENCHMARK_CSV:-}" && -f "${SIMD_BENCHMARK_CSV}" ]]; then 95 | echo "Benchmark CSV saved to ${SIMD_BENCHMARK_CSV}" 96 | fi 97 | 98 | echo 99 | echo "Generating plots via Python scripts..." 100 | python3 "${ROOT_DIR}/scripts/plot_results.py" \ 101 | --benchmarks-csv "${SIMD_BENCHMARK_CSV}" \ 102 | --benchmarks-output "${ARTIFACT_DIR}/benchmark_speedups.png" \ 103 | --attention-csv "${ARTIFACT_DIR}/attention_components.csv" \ 104 | --attention-output "${ARTIFACT_DIR}/attention_speedups.png" \ 105 | --tiny-gpt-csv "${ARTIFACT_DIR}/tiny_gpt_components.csv" \ 106 | --tiny-gpt-output "${ARTIFACT_DIR}/tiny_gpt_speedups.png" 107 | echo "Plots saved to ${ARTIFACT_DIR}" 108 | -------------------------------------------------------------------------------- /src/01_Basics/01_importing_simd/main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * 01_Basics/01_importing_simd - Introduction to SIMD headers and basic operations 3 | * 4 | * This example demonstrates: 5 | * 1. How to include SIMD headers in your C/C++ programs 6 | * 2. The hierarchy of SIMD instruction sets 7 | * 3. Basic SIMD vector operations 8 | */ 9 | 10 | // SIMD operations can be included in C/C++ programs via specific header files. 11 | // Below is a hierarchy of headers provided by Intel, grouped by the instruction sets they implement. 12 | 13 | #include "../../include/simd_utils.h" // Our utility header that includes 14 | 15 | // If you're not using our utility header, you would typically include: 16 | // #include // The all-encompassing header for Intel SIMD: AVX, AVX2, FMA, AVX-512, etc. 17 | 18 | /** 19 | * SIMD Instruction Set Hierarchy: 20 | * 21 | * 1. MMX (MultiMedia eXtensions) - 64-bit operations on integers 22 | * - Header: 23 | * - Introduced in 1997 with Intel Pentium MMX 24 | * 25 | * 2. SSE (Streaming SIMD Extensions) - 128-bit operations on 4 floats 26 | * - Header: 27 | * - Introduced in 1999 with Intel Pentium III 28 | * 29 | * 3. SSE2 - Added support for integers and doubles in 128-bit registers 30 | * - Header: 31 | * - Introduced in 2001 with Intel Pentium 4 32 | * 33 | * 4. SSE3 - Added horizontal operations and better handling of unaligned data 34 | * - Header: 35 | * - Introduced in 2004 with Intel Pentium 4 (Prescott) 36 | * 37 | * 5. 
SSSE3 (Supplemental SSE3) - Added more integer instructions 38 | * - Header: 39 | * - Introduced in 2006 with Intel Core 2 40 | * 41 | * 6. SSE4.1 and SSE4.2 - Added dot product, string processing, etc. 42 | * - Headers: and 43 | * - Introduced in 2007-2008 with Intel Core i7 44 | * 45 | * 7. AVX (Advanced Vector Extensions) - 256-bit operations (8 floats) 46 | * - Header: 47 | * - Introduced in 2011 with Intel Sandy Bridge 48 | * 49 | * 8. AVX2 - Added 256-bit integer operations and more instructions 50 | * - Header: 51 | * - Introduced in 2013 with Intel Haswell 52 | * 53 | * 9. AVX-512 - 512-bit operations (16 floats) 54 | * - Header: 55 | * - Introduced in 2016 with Intel Xeon Phi 56 | */ 57 | 58 | // Generally, "immintrin.h" is sufficient for most modern SIMD operations as it includes all the above. 59 | 60 | #include 61 | 62 | int main() { 63 | set_benchmark_suite("01_Basics/01_importing_simd"); 64 | 65 | std::cout << "=== SIMD Header Introduction ===" << std::endl; 66 | std::cout << "This example demonstrates basic SIMD vector operations." << std::endl; 67 | std::cout << std::endl; 68 | 69 | // Example 1: Basic vector addition with AVX2 70 | std::cout << "Example 1: Vector Addition" << std::endl; 71 | 72 | // Initialize two SIMD vectors with 8 float values each 73 | __m256 a = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); 74 | __m256 b = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 75 | 76 | // Add the vectors element-wise 77 | __m256 c = _mm256_add_ps(a, b); 78 | 79 | // Print the vectors using our utility function 80 | print_m256(a, "Vector A"); 81 | print_m256(b, "Vector B"); 82 | print_m256(c, "A + B"); 83 | 84 | // Example 2: Storing SIMD results back to memory 85 | std::cout << std::endl; 86 | std::cout << "Example 2: Storing SIMD Results" << std::endl; 87 | 88 | // Allocate aligned memory for results 89 | float* result = aligned_alloc(8); 90 | 91 | // Store the SIMD vector to memory 92 | _mm256_store_ps(result, c); 93 | 94 | // Print the results from memory 95 | std::cout << "Result array: ["; 96 | for (int i = 0; i < 7; i++) { 97 | std::cout << result[i] << ", "; 98 | } 99 | std::cout << result[7] << "]" << std::endl; 100 | 101 | // Example 3: Different data types 102 | std::cout << std::endl; 103 | std::cout << "Example 3: Different Data Types" << std::endl; 104 | 105 | // Integer SIMD operations 106 | __m256i int_a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); 107 | __m256i int_b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 108 | __m256i int_sum = _mm256_add_epi32(int_a, int_b); 109 | 110 | print_m256i(int_a, "Integer Vector A"); 111 | print_m256i(int_b, "Integer Vector B"); 112 | print_m256i(int_sum, "A + B (Integer)"); 113 | 114 | // Double precision SIMD operations (4 doubles in a 256-bit register) 115 | __m256d double_a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0); 116 | __m256d double_b = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 117 | __m256d double_sum = _mm256_add_pd(double_a, double_b); 118 | 119 | print_m256d(double_a, "Double Vector A"); 120 | print_m256d(double_b, "Double Vector B"); 121 | print_m256d(double_sum, "A + B (Double)"); 122 | 123 | // Clean up 124 | free(result); 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /src/03_Examples/02_quadratic_equations/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates solving multiple 
quadratic equations in parallel using SIMD. 7 | * 8 | * For each quadratic equation ax² + bx + c = 0, we compute the discriminant b² - 4ac 9 | * and then calculate the solution using the quadratic formula: x = (-b ± √(b² - 4ac)) / 2a 10 | * 11 | * We'll solve 8 different quadratic equations simultaneously using AVX2 instructions. 12 | */ 13 | 14 | int main() { 15 | set_benchmark_suite("03_Examples/02_quadratic_equations"); 16 | 17 | std::cout << "=== Solving Quadratic Equations with SIMD ===" << std::endl; 18 | std::cout << "This example solves 8 quadratic equations in parallel." << std::endl; 19 | std::cout << "For each equation ax² + bx + c = 0, we find the smaller root." << std::endl; 20 | std::cout << std::endl; 21 | 22 | // Allocate aligned memory for coefficients 23 | float* a = aligned_alloc(8); 24 | float* b = aligned_alloc(8); 25 | float* c = aligned_alloc(8); 26 | 27 | // Initialize coefficients for 8 different quadratic equations 28 | // Equation 1: 5x² + 3x - 1 = 0 29 | a[0] = 5.0f; b[0] = 3.0f; c[0] = -1.0f; 30 | 31 | // Equation 2: 12x² + 1x - 5 = 0 32 | a[1] = 12.0f; b[1] = 1.0f; c[1] = -5.0f; 33 | 34 | // Equation 3: 6x² + 4x - 6 = 0 35 | a[2] = 6.0f; b[2] = 4.0f; c[2] = -6.0f; 36 | 37 | // Equation 4: 7x² - 2x - 6 = 0 38 | a[3] = 7.0f; b[3] = -2.0f; c[3] = -6.0f; 39 | 40 | // Equation 5: 1x² + 2x + 5 = 0 (complex roots, will return NaN) 41 | a[4] = 1.0f; b[4] = 2.0f; c[4] = 5.0f; 42 | 43 | // Equation 6: 1x² + 1x + 30 = 0 (complex roots, will return NaN) 44 | a[5] = 1.0f; b[5] = 1.0f; c[5] = 30.0f; 45 | 46 | // Equation 7: 1x² + 1x + 35 = 0 (complex roots, will return NaN) 47 | a[6] = 1.0f; b[6] = 1.0f; c[6] = 35.0f; 48 | 49 | // Equation 8: 1x² + 1x - 40 = 0 50 | a[7] = 1.0f; b[7] = 1.0f; c[7] = -40.0f; 51 | 52 | // Print the equations we're solving 53 | std::cout << "Equations to solve:" << std::endl; 54 | for (int i = 0; i < 8; i++) { 55 | std::cout << "Equation " << (i+1) << ": " 56 | << a[i] << "x² + " << b[i] << "x + " << c[i] << " = 0" << std::endl; 57 | } 58 | std::cout << std::endl; 59 | 60 | // Union to access SIMD results 61 | float8 result; 62 | 63 | // -------- Standard scalar approach --------------- 64 | std::cout << "----------- Standard scalar approach -----------" << std::endl; 65 | 66 | // Initialize result array with placeholder values 67 | for (int lane = 0; lane < 8; ++lane) { 68 | result.a[lane] = std::numeric_limits::quiet_NaN(); 69 | } 70 | 71 | // Define the scalar implementation as a lambda for benchmarking 72 | auto scalar_func = [&]() { 73 | for (int lane = 0; lane < 8; ++lane) { 74 | float discriminant = b[lane] * b[lane] - 4.0f * a[lane] * c[lane]; 75 | if (discriminant >= 0) { 76 | // Calculate the smaller root: (-b - sqrt(discriminant)) / (2*a) 77 | result.a[lane] = (-b[lane] - sqrtf(discriminant)) / (2.0f * a[lane]); 78 | } else { 79 | // Complex roots, set to NaN 80 | result.a[lane] = std::numeric_limits::quiet_NaN(); 81 | } 82 | } 83 | }; 84 | 85 | // Run the scalar implementation once to get the results 86 | scalar_func(); 87 | 88 | // Print scalar results 89 | std::cout << "Scalar solutions (smaller root):" << std::endl; 90 | for (int lane = 0; lane < 8; ++lane) { 91 | std::cout << "Equation " << (lane+1) << ": "; 92 | if (std::isnan(result.a[lane])) { 93 | std::cout << "Complex roots" << std::endl; 94 | } else { 95 | std::cout << result.a[lane] << std::endl; 96 | } 97 | } 98 | std::cout << std::endl; 99 | 100 | // -------- SIMD approach --------------- 101 | std::cout << "----------- SIMD approach -----------" << std::endl; 102 | 103 
| // Define the SIMD implementation as a lambda for benchmarking 104 | auto simd_func = [&]() { 105 | // Load coefficients into SIMD registers 106 | __m256 aCoeffs = _mm256_loadu_ps(a); 107 | __m256 bCoeffs = _mm256_loadu_ps(b); 108 | __m256 cCoeffs = _mm256_loadu_ps(c); 109 | 110 | // Calculate discriminant: b² - 4ac 111 | // Using fused multiply-add for better precision: b*b - 4*a*c 112 | __m256 four = _mm256_set1_ps(4.0f); 113 | __m256 ac = _mm256_mul_ps(aCoeffs, cCoeffs); 114 | __m256 four_ac = _mm256_mul_ps(four, ac); 115 | __m256 b_squared = _mm256_mul_ps(bCoeffs, bCoeffs); 116 | __m256 discriminant = _mm256_sub_ps(b_squared, four_ac); 117 | 118 | // Create mask for discriminant >= 0 (real roots) 119 | __m256 zero = _mm256_setzero_ps(); 120 | __m256 mask = _mm256_cmp_ps(discriminant, zero, _CMP_GE_OQ); 121 | 122 | // Calculate sqrt(discriminant) where discriminant >= 0 123 | __m256 sqrt_discriminant = _mm256_sqrt_ps(discriminant); 124 | 125 | // Calculate -b 126 | __m256 neg_b = _mm256_sub_ps(zero, bCoeffs); 127 | 128 | // Calculate numerator: -b - sqrt(discriminant) 129 | __m256 numerator = _mm256_sub_ps(neg_b, sqrt_discriminant); 130 | 131 | // Calculate denominator: 2*a 132 | __m256 two = _mm256_set1_ps(2.0f); 133 | __m256 denominator = _mm256_mul_ps(two, aCoeffs); 134 | 135 | // Calculate result: (-b - sqrt(discriminant)) / (2*a) 136 | __m256 solution = _mm256_div_ps(numerator, denominator); 137 | 138 | // Set NaN for complex roots (discriminant < 0) 139 | __m256 nan = _mm256_set1_ps(std::numeric_limits::quiet_NaN()); 140 | __m256 final_result = _mm256_blendv_ps(nan, solution, mask); 141 | 142 | // Store result 143 | result.v = final_result; 144 | }; 145 | 146 | // Run the SIMD implementation once to get the results 147 | simd_func(); 148 | 149 | // Print SIMD results 150 | std::cout << "SIMD solutions (smaller root):" << std::endl; 151 | for (int lane = 0; lane < 8; ++lane) { 152 | std::cout << "Equation " << (lane+1) << ": "; 153 | if (std::isnan(result.a[lane])) { 154 | std::cout << "Complex roots" << std::endl; 155 | } else { 156 | std::cout << result.a[lane] << std::endl; 157 | } 158 | } 159 | std::cout << std::endl; 160 | 161 | // Benchmark comparison 162 | benchmark_comparison( 163 | "Quadratic Equation Solver", 164 | scalar_func, 165 | simd_func, 166 | 50 167 | ); 168 | 169 | // Free allocated memory 170 | free(a); 171 | free(b); 172 | free(c); 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /src/03_Examples/03_data_types/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates SIMD operations with different data types. 7 | * 8 | * We'll explore: 9 | * 1. Working with different numeric types (float, double, int, short) 10 | * 2. Converting between different SIMD types 11 | * 3. Handling different vector widths 12 | * 4. Performing operations specific to certain data types 13 | */ 14 | 15 | int main() { 16 | set_benchmark_suite("03_Examples/03_data_types"); 17 | 18 | std::cout << "=== SIMD Operations with Different Data Types ===" << std::endl; 19 | std::cout << std::endl; 20 | 21 | // -------- 1. Float operations (32-bit) -------- 22 | std::cout << "1. 
Float Operations (32-bit, 8 elements per vector)" << std::endl; 23 | 24 | // Initialize float vector 25 | __m256 float_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 26 | __m256 float_vec2 = _mm256_set1_ps(2.0f); // Set all elements to 2.0 27 | 28 | // Perform operations 29 | __m256 float_sum = _mm256_add_ps(float_vec1, float_vec2); 30 | __m256 float_product = _mm256_mul_ps(float_vec1, float_vec2); 31 | 32 | // Print results 33 | print_m256(float_vec1, "Float Vector 1"); 34 | print_m256(float_vec2, "Float Vector 2"); 35 | print_m256(float_sum, "Sum (float_vec1 + float_vec2)"); 36 | print_m256(float_product, "Product (float_vec1 * float_vec2)"); 37 | std::cout << std::endl; 38 | 39 | // -------- 2. Double operations (64-bit) -------- 40 | std::cout << "2. Double Operations (64-bit, 4 elements per vector)" << std::endl; 41 | 42 | // Initialize double vector 43 | __m256d double_vec1 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 44 | __m256d double_vec2 = _mm256_set1_pd(3.0); // Set all elements to 3.0 45 | 46 | // Perform operations 47 | __m256d double_sum = _mm256_add_pd(double_vec1, double_vec2); 48 | __m256d double_product = _mm256_mul_pd(double_vec1, double_vec2); 49 | 50 | // Print results 51 | print_m256d(double_vec1, "Double Vector 1"); 52 | print_m256d(double_vec2, "Double Vector 2"); 53 | print_m256d(double_sum, "Sum (double_vec1 + double_vec2)"); 54 | print_m256d(double_product, "Product (double_vec1 * double_vec2)"); 55 | std::cout << std::endl; 56 | 57 | // -------- 3. Integer operations (32-bit) -------- 58 | std::cout << "3. Integer Operations (32-bit, 8 elements per vector)" << std::endl; 59 | 60 | // Initialize integer vector 61 | __m256i int_vec1 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 62 | __m256i int_vec2 = _mm256_set1_epi32(10); // Set all elements to 10 63 | 64 | // Perform operations 65 | __m256i int_sum = _mm256_add_epi32(int_vec1, int_vec2); 66 | __m256i int_sub = _mm256_sub_epi32(int_vec1, int_vec2); 67 | 68 | // Print results 69 | print_m256i(int_vec1, "Int Vector 1"); 70 | print_m256i(int_vec2, "Int Vector 2"); 71 | print_m256i(int_sum, "Sum (int_vec1 + int_vec2)"); 72 | print_m256i(int_sub, "Difference (int_vec1 - int_vec2)"); 73 | std::cout << std::endl; 74 | 75 | // -------- 4. Type Conversions -------- 76 | std::cout << "4. Type Conversions" << std::endl; 77 | 78 | // Convert float to integer (truncation) 79 | __m256i float_to_int = _mm256_cvttps_epi32(float_vec1); 80 | print_m256i(float_to_int, "Float to Int (truncated)"); 81 | 82 | // Convert integer to float 83 | __m256 int_to_float = _mm256_cvtepi32_ps(int_vec1); 84 | print_m256(int_to_float, "Int to Float"); 85 | 86 | // Convert between float and double (need to split/combine) 87 | // Extract lower 4 floats and convert to double 88 | __m128 float_low = _mm256_extractf128_ps(float_vec1, 0); 89 | __m256d float_to_double_low = _mm256_cvtps_pd(float_low); 90 | print_m256d(float_to_double_low, "Lower 4 Floats to Double"); 91 | 92 | // Extract upper 4 floats and convert to double 93 | __m128 float_high = _mm256_extractf128_ps(float_vec1, 1); 94 | __m256d float_to_double_high = _mm256_cvtps_pd(float_high); 95 | print_m256d(float_to_double_high, "Upper 4 Floats to Double"); 96 | std::cout << std::endl; 97 | 98 | // -------- 5. Bitwise Operations -------- 99 | std::cout << "5. 
Bitwise Operations" << std::endl; 100 | 101 | // Create test vectors 102 | __m256i bits1 = _mm256_set1_epi32(0x0F0F0F0F); // 00001111 00001111 00001111 00001111 103 | __m256i bits2 = _mm256_set1_epi32(0x33333333); // 00110011 00110011 00110011 00110011 104 | 105 | // Perform bitwise operations 106 | __m256i bit_and = _mm256_and_si256(bits1, bits2); 107 | __m256i bit_or = _mm256_or_si256(bits1, bits2); 108 | __m256i bit_xor = _mm256_xor_si256(bits1, bits2); 109 | 110 | // Print results in hex format 111 | std::cout << "Bits1 (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 112 | << reinterpret_cast(&bits1)[0] << std::endl; 113 | std::cout << "Bits2 (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 114 | << reinterpret_cast(&bits2)[0] << std::endl; 115 | std::cout << "AND (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 116 | << reinterpret_cast(&bit_and)[0] << std::endl; 117 | std::cout << "OR (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 118 | << reinterpret_cast(&bit_or)[0] << std::endl; 119 | std::cout << "XOR (hex): 0x" << std::hex << std::setfill('0') << std::setw(8) 120 | << reinterpret_cast(&bit_xor)[0] << std::endl; 121 | std::cout << std::dec << std::endl; // Reset to decimal 122 | 123 | // -------- 6. Specialized Operations -------- 124 | std::cout << "6. Specialized Operations" << std::endl; 125 | 126 | // Horizontal addition (add adjacent pairs) 127 | __m256 hadd_result = _mm256_hadd_ps(float_vec1, float_vec2); 128 | print_m256(hadd_result, "Horizontal Add (pairs from float_vec1, float_vec2)"); 129 | 130 | // Permute (rearrange elements) 131 | __m256 permute_result = _mm256_permute_ps(float_vec1, 0b10010011); 132 | print_m256(permute_result, "Permuted float_vec1"); 133 | 134 | // Blend (select elements from two vectors based on mask) 135 | __m256 blend_result = _mm256_blend_ps(float_vec1, float_vec2, 0b10101010); 136 | print_m256(blend_result, "Blend of float_vec1 and float_vec2"); 137 | 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /src/01_Basics/03_binding_with_unions/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * 01_Basics/03_binding_with_unions - Techniques for accessing SIMD data 7 | * 8 | * This example demonstrates different ways to access and manipulate data in SIMD vectors: 9 | * 1. Using pointer conversion (reinterpret_cast) 10 | * 2. Using unions to create an alias between SIMD types and arrays 11 | * 3. Using the _mm256_store_* and _mm256_load_* functions 12 | * 4. Using the extract and insert element functions 13 | * 14 | * Each method has its advantages and use cases. 15 | */ 16 | 17 | int main() { 18 | set_benchmark_suite("01_Basics/03_binding_with_unions"); 19 | 20 | std::cout << "=== Accessing SIMD Data ===" << std::endl; 21 | std::cout << std::endl; 22 | 23 | // --------- 1. Pointer Conversion ------------- 24 | std::cout << "1. Pointer Conversion" << std::endl; 25 | std::cout << "---------------------------------------------------" << std::endl; 26 | std::cout << "Using reinterpret_cast to convert between SIMD types and arrays." << std::endl; 27 | std::cout << "This is a simple but potentially unsafe method." 
<< std::endl; 28 | std::cout << std::endl; 29 | 30 | // Initialize a SIMD vector with ascending values 31 | __m256 simd_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 32 | 33 | // Access the data using pointer conversion 34 | float* float_ptr = reinterpret_cast(&simd_vec1); 35 | 36 | // Print the data 37 | std::cout << "SIMD vector values via pointer: ["; 38 | for (int i = 0; i < 7; i++) { 39 | std::cout << float_ptr[i] << ", "; 40 | } 41 | std::cout << float_ptr[7] << "]" << std::endl; 42 | 43 | // Modify the data through the pointer 44 | std::cout << "Modifying values via pointer..." << std::endl; 45 | float_ptr[0] = 100.0f; 46 | float_ptr[4] = 200.0f; 47 | 48 | // Print the modified SIMD vector 49 | print_m256(simd_vec1, "Modified SIMD vector"); 50 | std::cout << std::endl; 51 | 52 | // --------- 2. Using Unions ------------- 53 | std::cout << "2. Using Unions" << std::endl; 54 | std::cout << "---------------------------------------------------" << std::endl; 55 | std::cout << "Using unions to create an alias between SIMD types and arrays." << std::endl; 56 | std::cout << "This is a cleaner and safer approach than pointer conversion." << std::endl; 57 | std::cout << std::endl; 58 | 59 | // Define a union for float SIMD vector 60 | union FloatSIMD { 61 | __m256 v; 62 | float a[8]; 63 | }; 64 | 65 | // Initialize the union with a SIMD vector 66 | FloatSIMD float_union; 67 | float_union.v = _mm256_set_ps(16.0f, 14.0f, 12.0f, 10.0f, 8.0f, 6.0f, 4.0f, 2.0f); 68 | 69 | // Access the data through the array 70 | std::cout << "SIMD vector values via union: ["; 71 | for (int i = 0; i < 7; i++) { 72 | std::cout << float_union.a[i] << ", "; 73 | } 74 | std::cout << float_union.a[7] << "]" << std::endl; 75 | 76 | // Modify the data through the array 77 | std::cout << "Modifying values via union..." << std::endl; 78 | float_union.a[1] = 42.0f; 79 | float_union.a[6] = 99.0f; 80 | 81 | // Print the modified SIMD vector 82 | print_m256(float_union.v, "Modified SIMD vector (union)"); 83 | 84 | // Using our utility union from simd_utils.h 85 | float8 float8_union; 86 | float8_union.v = _mm256_set1_ps(5.0f); 87 | float8_union.a[2] = 10.0f; 88 | float8_union.a[5] = 20.0f; 89 | 90 | print_m256(float8_union.v, "Using float8 union from simd_utils.h"); 91 | std::cout << std::endl; 92 | 93 | // --------- 3. Store and Load Functions ------------- 94 | std::cout << "3. Store and Load Functions" << std::endl; 95 | std::cout << "---------------------------------------------------" << std::endl; 96 | std::cout << "Using _mm256_store_* and _mm256_load_* functions to transfer data." << std::endl; 97 | std::cout << "This is the recommended approach for most situations." << std::endl; 98 | std::cout << std::endl; 99 | 100 | // Initialize a SIMD vector 101 | __m256 simd_vec3 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 102 | 103 | // Allocate aligned memory for the array 104 | float* aligned_array = aligned_alloc(8); 105 | 106 | // Store the SIMD vector to the array 107 | _mm256_store_ps(aligned_array, simd_vec3); 108 | 109 | // Print the array 110 | std::cout << "SIMD vector values via store: ["; 111 | for (int i = 0; i < 7; i++) { 112 | std::cout << aligned_array[i] << ", "; 113 | } 114 | std::cout << aligned_array[7] << "]" << std::endl; 115 | 116 | // Modify the array 117 | std::cout << "Modifying values in the array..." 
<< std::endl; 118 | aligned_array[3] = 30.0f; 119 | aligned_array[7] = 80.0f; 120 | 121 | // Load the modified array back to a SIMD vector 122 | __m256 modified_vec = _mm256_load_ps(aligned_array); 123 | 124 | // Print the modified SIMD vector 125 | print_m256(modified_vec, "Modified SIMD vector (store/load)"); 126 | std::cout << std::endl; 127 | 128 | // --------- 4. Extract and Insert Elements ------------- 129 | std::cout << "4. Extract and Insert Elements" << std::endl; 130 | std::cout << "---------------------------------------------------" << std::endl; 131 | std::cout << "Using _mm256_extract_* and _mm256_insert_* functions to access individual elements." << std::endl; 132 | std::cout << "This is useful when you only need to access a few elements." << std::endl; 133 | std::cout << std::endl; 134 | 135 | // Initialize a SIMD vector with integers 136 | __m256i simd_int_vec = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 137 | 138 | // Extract individual elements 139 | // Note: For AVX2, we need to extract 128-bit lanes first, then extract from those 140 | __m128i low_lane = _mm256_extracti128_si256(simd_int_vec, 0); // Extract lower 128 bits 141 | __m128i high_lane = _mm256_extracti128_si256(simd_int_vec, 1); // Extract upper 128 bits 142 | 143 | int element0 = _mm_extract_epi32(low_lane, 0); // Extract element 0 144 | int element3 = _mm_extract_epi32(low_lane, 3); // Extract element 3 145 | int element4 = _mm_extract_epi32(high_lane, 0); // Extract element 4 146 | int element7 = _mm_extract_epi32(high_lane, 3); // Extract element 7 147 | 148 | std::cout << "Extracted elements: " << element0 << ", " << element3 << ", " 149 | << element4 << ", " << element7 << std::endl; 150 | 151 | // Insert elements 152 | // For inserting, we need to create new 128-bit vectors and then combine them 153 | __m128i new_low = _mm_insert_epi32(low_lane, 100, 1); // Replace element 1 154 | __m128i new_high = _mm_insert_epi32(high_lane, 200, 2); // Replace element 6 155 | 156 | // Combine the lanes back into a 256-bit vector 157 | __m256i modified_int_vec = _mm256_setr_m128i(new_low, new_high); 158 | 159 | // Print the modified vector 160 | print_m256i(modified_int_vec, "Modified integer vector (extract/insert)"); 161 | 162 | // Clean up 163 | free(aligned_array); 164 | 165 | return 0; 166 | } 167 | -------------------------------------------------------------------------------- /src/include/simd_utils.h: -------------------------------------------------------------------------------- 1 | /** 2 | * simd_utils.h - Utility functions and macros for SIMD programming 3 | * 4 | * This header provides common utilities for SIMD programming, including: 5 | * - Type definitions for SIMD vectors 6 | * - Helper macros for alignment 7 | * - Utility functions for printing SIMD vectors 8 | * - Performance measurement utilities 9 | */ 10 | 11 | #ifndef SIMD_UTILS_H 12 | #define SIMD_UTILS_H 13 | 14 | #include // AVX2, 256-bit operations 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | // Alignment macros 26 | #define SIMD_ALIGN_32 alignas(32) 27 | #define SIMD_ALIGN_64 alignas(64) 28 | 29 | // Helper union for accessing SIMD vector elements 30 | union float8 { 31 | __m256 v; 32 | float a[8]; 33 | 34 | float8(__m256 _v) : v(_v) {} 35 | float8() : v(_mm256_setzero_ps()) {} 36 | }; 37 | 38 | union double4 { 39 | __m256d v; 40 | double a[4]; 41 | 42 | double4(__m256d _v) : v(_v) {} 43 | double4() : v(_mm256_setzero_pd()) {} 44 | }; 45 | 46 | 
union int8 { 47 | __m256i v; 48 | int a[8]; 49 | 50 | int8(__m256i _v) : v(_v) {} 51 | int8() : v(_mm256_setzero_si256()) {} 52 | }; 53 | 54 | // Print utilities 55 | inline void print_m256(const __m256& v, const std::string& label = "") { 56 | float8 tmp(v); 57 | if (!label.empty()) { 58 | std::cout << label << ": "; 59 | } 60 | std::cout << "["; 61 | for (int i = 0; i < 7; i++) { 62 | std::cout << tmp.a[i] << ", "; 63 | } 64 | std::cout << tmp.a[7] << "]" << std::endl; 65 | } 66 | 67 | inline void print_m256d(const __m256d& v, const std::string& label = "") { 68 | double4 tmp(v); 69 | if (!label.empty()) { 70 | std::cout << label << ": "; 71 | } 72 | std::cout << "["; 73 | for (int i = 0; i < 3; i++) { 74 | std::cout << tmp.a[i] << ", "; 75 | } 76 | std::cout << tmp.a[3] << "]" << std::endl; 77 | } 78 | 79 | inline void print_m256i(const __m256i& v, const std::string& label = "") { 80 | int8 tmp(v); 81 | if (!label.empty()) { 82 | std::cout << label << ": "; 83 | } 84 | std::cout << "["; 85 | for (int i = 0; i < 7; i++) { 86 | std::cout << tmp.a[i] << ", "; 87 | } 88 | std::cout << tmp.a[7] << "]" << std::endl; 89 | } 90 | 91 | // Performance measurement utilities 92 | class Timer { 93 | private: 94 | std::chrono::high_resolution_clock::time_point start_time; 95 | std::string label; 96 | 97 | public: 98 | Timer(const std::string& _label = "Operation") : label(_label) { 99 | start_time = std::chrono::high_resolution_clock::now(); 100 | } 101 | 102 | ~Timer() { 103 | auto end_time = std::chrono::high_resolution_clock::now(); 104 | auto duration = std::chrono::duration_cast(end_time - start_time); 105 | std::cout << label << " took " << duration.count() << " microseconds" << std::endl; 106 | } 107 | }; 108 | 109 | namespace simd_bench_detail { 110 | 111 | inline std::string& suite_label() { 112 | static std::string label = "unspecified_suite"; 113 | return label; 114 | } 115 | 116 | inline std::string& csv_path_store() { 117 | static std::string path; 118 | return path; 119 | } 120 | 121 | inline std::mutex& csv_mutex() { 122 | static std::mutex m; 123 | return m; 124 | } 125 | 126 | inline std::string csv_escape(const std::string& input) { 127 | std::string out; 128 | out.reserve(input.size() + 2); 129 | out.push_back('"'); 130 | for (char ch : input) { 131 | if (ch == '"') { 132 | out.push_back('"'); 133 | out.push_back('"'); 134 | } else { 135 | out.push_back(ch); 136 | } 137 | } 138 | out.push_back('"'); 139 | return out; 140 | } 141 | 142 | inline std::string effective_csv_path() { 143 | const char* env_path = std::getenv("SIMD_BENCHMARK_CSV"); 144 | if (env_path && *env_path) { 145 | return std::string(env_path); 146 | } 147 | return csv_path_store(); 148 | } 149 | 150 | } // namespace simd_bench_detail 151 | 152 | inline void set_benchmark_suite(const std::string& suite) { 153 | simd_bench_detail::suite_label() = suite; 154 | } 155 | 156 | inline void set_benchmark_csv_path(const std::string& path) { 157 | simd_bench_detail::csv_path_store() = path; 158 | } 159 | 160 | // Benchmark function to compare scalar vs SIMD implementations 161 | template 162 | void benchmark_comparison( 163 | const std::string& label, 164 | ScalarFunc scalar_func, 165 | SimdFunc simd_func, 166 | int iterations = 1000 167 | ) { 168 | // Warm-up 169 | scalar_func(); 170 | simd_func(); 171 | 172 | // Benchmark scalar implementation 173 | auto scalar_start = std::chrono::high_resolution_clock::now(); 174 | for (int i = 0; i < iterations; i++) { 175 | scalar_func(); 176 | } 177 | auto scalar_end = 
std::chrono::high_resolution_clock::now(); 178 | auto scalar_duration = std::chrono::duration_cast(scalar_end - scalar_start); 179 | 180 | // Benchmark SIMD implementation 181 | auto simd_start = std::chrono::high_resolution_clock::now(); 182 | for (int i = 0; i < iterations; i++) { 183 | simd_func(); 184 | } 185 | auto simd_end = std::chrono::high_resolution_clock::now(); 186 | auto simd_duration = std::chrono::duration_cast(simd_end - simd_start); 187 | 188 | // Print results 189 | std::cout << "===== " << label << " Benchmark =====" << std::endl; 190 | std::cout << "Scalar implementation: " << scalar_duration.count() << " microseconds" << std::endl; 191 | std::cout << "SIMD implementation: " << simd_duration.count() << " microseconds" << std::endl; 192 | 193 | double speedup = static_cast(scalar_duration.count()) / simd_duration.count(); 194 | std::cout << "Speedup: " << std::fixed << std::setprecision(2) << speedup << "x" << std::endl; 195 | std::cout << "===============================" << std::endl; 196 | 197 | const std::string csv_path = simd_bench_detail::effective_csv_path(); 198 | if (!csv_path.empty()) { 199 | std::lock_guard lock(simd_bench_detail::csv_mutex()); 200 | bool need_header = false; 201 | { 202 | std::ifstream existing(csv_path); 203 | if (!existing.good() || existing.peek() == std::ifstream::traits_type::eof()) { 204 | need_header = true; 205 | } 206 | } 207 | 208 | std::ofstream csv(csv_path, std::ios::app); 209 | if (csv) { 210 | if (need_header) { 211 | csv << "suite,label,iterations,scalar_us,simd_us,speedup" << '\n'; 212 | } 213 | csv << simd_bench_detail::csv_escape(simd_bench_detail::suite_label()) << ',' 214 | << simd_bench_detail::csv_escape(label) << ',' 215 | << iterations << ',' 216 | << scalar_duration.count() << ',' 217 | << simd_duration.count() << ',' 218 | << std::setprecision(6) << speedup << '\n'; 219 | } else { 220 | std::cerr << "Failed to write benchmark CSV at " << csv_path << std::endl; 221 | } 222 | } 223 | } 224 | 225 | // Allocate aligned memory 226 | template 227 | T* aligned_alloc(size_t size, size_t alignment = 32) { 228 | void* ptr = nullptr; 229 | if (posix_memalign(&ptr, alignment, size * sizeof(T)) != 0) { 230 | throw std::bad_alloc(); 231 | } 232 | return static_cast(ptr); 233 | } 234 | 235 | #endif // SIMD_UTILS_H 236 | -------------------------------------------------------------------------------- /src/01_Basics/04_loading_data/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 01_Basics/04_loading_data - Loading and storing SIMD data 8 | * 9 | * This example demonstrates different ways to load data into SIMD vectors: 10 | * 1. Aligned load (_mm256_load_ps) - Requires 32-byte aligned memory 11 | * 2. Unaligned load (_mm256_loadu_ps) - Works with any memory address 12 | * 3. Masked load (_mm256_maskload_ps) - Selectively loads elements based on a mask 13 | * 4. Stream load (_mm256_stream_load_si256) - Non-temporal load that bypasses cache 14 | * 15 | * And different ways to store SIMD data: 16 | * 1. Aligned store (_mm256_store_ps) - Requires 32-byte aligned memory 17 | * 2. Unaligned store (_mm256_storeu_ps) - Works with any memory address 18 | * 3. Masked store (_mm256_maskstore_ps) - Selectively stores elements based on a mask 19 | * 4. Stream store (_mm256_stream_ps) - Non-temporal store that bypasses cache 20 | * 21 | * We'll also compare the performance of these methods. 
22 | */ 23 | 24 | const int ARRAY_SIZE = 8; 25 | // Fewer iterations keep the demo responsive while still highlighting relative costs. 26 | const int TEST_ITERATIONS = 100000; 27 | 28 | int main() { 29 | set_benchmark_suite("01_Basics/04_loading_data"); 30 | 31 | std::cout << "=== SIMD Data Loading and Storing ===" << std::endl; 32 | std::cout << std::endl; 33 | 34 | // --------- 1. Aligned vs. Unaligned Load ------------- 35 | std::cout << "1. Aligned vs. Unaligned Load" << std::endl; 36 | std::cout << "---------------------------------------------------" << std::endl; 37 | std::cout << "Comparing aligned and unaligned memory access." << std::endl; 38 | std::cout << std::endl; 39 | 40 | // Allocate aligned and unaligned memory 41 | float* aligned_data = aligned_alloc(ARRAY_SIZE, 32); // 32-byte alignment for AVX 42 | float* unaligned_data = new float[ARRAY_SIZE + 1]; // +1 to ensure we can create unaligned pointer 43 | float* unaligned_ptr = unaligned_data + 1; // Offset by 1 to ensure unalignment 44 | 45 | // Initialize data 46 | for (int i = 0; i < ARRAY_SIZE; i++) { 47 | aligned_data[i] = static_cast(i + 1); 48 | unaligned_ptr[i] = static_cast(i + 1); 49 | } 50 | 51 | // Demonstrate aligned load 52 | __m256 aligned_vec = _mm256_load_ps(aligned_data); 53 | print_m256(aligned_vec, "Aligned load result"); 54 | 55 | // Demonstrate unaligned load 56 | __m256 unaligned_vec = _mm256_loadu_ps(unaligned_ptr); 57 | print_m256(unaligned_vec, "Unaligned load result"); 58 | 59 | // Performance comparison 60 | Timer timer("Aligned vs. Unaligned Load Performance"); 61 | 62 | // Benchmark aligned load 63 | auto aligned_load = [&]() { 64 | __m256 result; 65 | for (int i = 0; i < TEST_ITERATIONS; i++) { 66 | result = _mm256_load_ps(aligned_data); 67 | } 68 | return result; 69 | }; 70 | 71 | // Benchmark unaligned load 72 | auto unaligned_load = [&]() { 73 | __m256 result; 74 | for (int i = 0; i < TEST_ITERATIONS; i++) { 75 | result = _mm256_loadu_ps(unaligned_ptr); 76 | } 77 | return result; 78 | }; 79 | 80 | benchmark_comparison("Load Operations", aligned_load, unaligned_load, 10); 81 | std::cout << std::endl; 82 | 83 | // --------- 2. Masked Load ------------- 84 | std::cout << "2. Masked Load" << std::endl; 85 | std::cout << "---------------------------------------------------" << std::endl; 86 | std::cout << "Selectively loading elements based on a mask." << std::endl; 87 | std::cout << std::endl; 88 | 89 | // Create a mask to load only elements 0, 2, 4, and 6 90 | __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); 91 | 92 | // Perform masked load (elements not selected by mask will be zero) 93 | __m256 masked_vec = _mm256_maskload_ps(aligned_data, mask); 94 | print_m256(masked_vec, "Masked load result (even indices only)"); 95 | std::cout << std::endl; 96 | 97 | // --------- 3. Aligned vs. Unaligned Store ------------- 98 | std::cout << "3. Aligned vs. Unaligned Store" << std::endl; 99 | std::cout << "---------------------------------------------------" << std::endl; 100 | std::cout << "Comparing aligned and unaligned store operations." 
<< std::endl; 101 | std::cout << std::endl; 102 | 103 | // Create a test vector 104 | __m256 test_vec = _mm256_set_ps(16.0f, 14.0f, 12.0f, 10.0f, 8.0f, 6.0f, 4.0f, 2.0f); 105 | 106 | // Perform aligned store 107 | _mm256_store_ps(aligned_data, test_vec); 108 | 109 | std::cout << "Aligned store result: ["; 110 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 111 | std::cout << aligned_data[i] << ", "; 112 | } 113 | std::cout << aligned_data[ARRAY_SIZE - 1] << "]" << std::endl; 114 | 115 | // Perform unaligned store 116 | _mm256_storeu_ps(unaligned_ptr, test_vec); 117 | 118 | std::cout << "Unaligned store result: ["; 119 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 120 | std::cout << unaligned_ptr[i] << ", "; 121 | } 122 | std::cout << unaligned_ptr[ARRAY_SIZE - 1] << "]" << std::endl; 123 | 124 | // Performance comparison 125 | Timer timer2("Aligned vs. Unaligned Store Performance"); 126 | 127 | // Benchmark aligned store 128 | auto aligned_store = [&]() { 129 | for (int i = 0; i < TEST_ITERATIONS; i++) { 130 | _mm256_store_ps(aligned_data, test_vec); 131 | } 132 | }; 133 | 134 | // Benchmark unaligned store 135 | auto unaligned_store = [&]() { 136 | for (int i = 0; i < TEST_ITERATIONS; i++) { 137 | _mm256_storeu_ps(unaligned_ptr, test_vec); 138 | } 139 | }; 140 | 141 | benchmark_comparison("Store Operations", aligned_store, unaligned_store, 10); 142 | std::cout << std::endl; 143 | 144 | // --------- 4. Masked Store ------------- 145 | std::cout << "4. Masked Store" << std::endl; 146 | std::cout << "---------------------------------------------------" << std::endl; 147 | std::cout << "Selectively storing elements based on a mask." << std::endl; 148 | std::cout << std::endl; 149 | 150 | // Reset aligned data 151 | for (int i = 0; i < ARRAY_SIZE; i++) { 152 | aligned_data[i] = 0.0f; 153 | } 154 | 155 | // Create a mask to store only elements 1, 3, 5, and 7 156 | __m256i mask2 = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); 157 | 158 | // Perform masked store 159 | _mm256_maskstore_ps(aligned_data, mask2, test_vec); 160 | 161 | std::cout << "Masked store result (odd indices only): ["; 162 | for (int i = 0; i < ARRAY_SIZE - 1; i++) { 163 | std::cout << aligned_data[i] << ", "; 164 | } 165 | std::cout << aligned_data[ARRAY_SIZE - 1] << "]" << std::endl; 166 | std::cout << std::endl; 167 | 168 | // --------- 5. Stream Load/Store (Non-temporal) ------------- 169 | std::cout << "5. Stream Load/Store (Non-temporal)" << std::endl; 170 | std::cout << "---------------------------------------------------" << std::endl; 171 | std::cout << "Using non-temporal loads and stores that bypass the cache." << std::endl; 172 | std::cout << "Useful for large data sets that won't be reused soon." 
<< std::endl; 173 | std::cout << std::endl; 174 | 175 | // Allocate a large array to demonstrate streaming operations 176 | const int LARGE_SIZE = 1024; 177 | float* large_array = aligned_alloc(LARGE_SIZE, 32); 178 | 179 | // Initialize the array 180 | for (int i = 0; i < LARGE_SIZE; i++) { 181 | large_array[i] = static_cast(i); 182 | } 183 | 184 | // Perform stream load and store 185 | for (int i = 0; i < LARGE_SIZE; i += 8) { 186 | // Stream load (using _mm256_stream_load_si256 which requires casting) 187 | __m256 loaded = _mm256_loadu_ps(&large_array[i]); 188 | 189 | // Process the data (simple multiplication by 2) 190 | __m256 processed = _mm256_mul_ps(loaded, _mm256_set1_ps(2.0f)); 191 | 192 | // Stream store (non-temporal store that bypasses cache) 193 | _mm256_stream_ps(&large_array[i], processed); 194 | } 195 | 196 | // Ensure all streaming stores are visible 197 | _mm_sfence(); 198 | 199 | // Print a small section of the result 200 | std::cout << "Stream store result (first 16 elements): ["; 201 | for (int i = 0; i < 15; i++) { 202 | std::cout << large_array[i] << ", "; 203 | } 204 | std::cout << large_array[15] << "]" << std::endl; 205 | 206 | // Clean up 207 | free(aligned_data); 208 | delete[] unaligned_data; 209 | free(large_array); 210 | 211 | return 0; 212 | } 213 | -------------------------------------------------------------------------------- /src/03_Examples/01_conditional_code/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | /** 12 | * 03_Examples/01_conditional_code - Implementing conditional operations with SIMD 13 | * 14 | * This example demonstrates how to implement conditional logic using SIMD: 15 | * 1. Clamping values to a range 16 | * 2. Filtering positive values 17 | * 3. Complex conditional operations (multiple conditions) 18 | * 4. Using masks and blending for conditional selection 19 | * 20 | * Conditional operations are challenging in SIMD because traditional branching 21 | * (if/else statements) doesn't work well with vector operations. Instead, we use 22 | * comparison operations to create masks, and then use those masks to select values. 23 | */ 24 | 25 | int main() { 26 | set_benchmark_suite("03_Examples/01_conditional_code"); 27 | 28 | std::cout << "=== SIMD Conditional Operations ===" << std::endl; 29 | std::cout << std::endl; 30 | 31 | // Initialize test data 32 | // Allocate aligned memory for better performance 33 | float* data1 = aligned_alloc(8); 34 | float* data2 = aligned_alloc(8); 35 | float* result_scalar = aligned_alloc(8); 36 | float* result_simd = aligned_alloc(8); 37 | 38 | // Initialize data1 with ascending values 39 | data1[0] = 5.0f; data1[1] = 10.0f; data1[2] = 15.0f; data1[3] = 20.0f; 40 | data1[4] = 25.0f; data1[5] = 30.0f; data1[6] = 35.0f; data1[7] = 40.0f; 41 | 42 | // Initialize data2 with mixed positive and negative values 43 | data2[0] = -1.0f; data2[1] = 4.0f; data2[2] = 9.0f; data2[3] = -16.0f; 44 | data2[4] = 25.0f; data2[5] = -36.0f; data2[6] = 49.0f; data2[7] = -64.0f; 45 | 46 | // Load data into SIMD registers 47 | __m256 vector1 = _mm256_load_ps(data1); 48 | __m256 vector2 = _mm256_load_ps(data2); 49 | 50 | // Print input data 51 | print_m256(vector1, "Vector 1"); 52 | print_m256(vector2, "Vector 2"); 53 | std::cout << std::endl; 54 | 55 | // --------- 1. Clamping Values ------------- 56 | std::cout << "1. 
Clamping Values" << std::endl; 57 | std::cout << "---------------------------------------------------" << std::endl; 58 | std::cout << "Clamping values in Vector 2 to the range [5, 30]" << std::endl; 59 | std::cout << std::endl; 60 | 61 | // Scalar implementation of clamping 62 | auto scalar_clamp = [&]() { 63 | for (int i = 0; i < 8; i++) { 64 | result_scalar[i] = std::max(5.0f, std::min(30.0f, data2[i])); 65 | } 66 | }; 67 | 68 | // SIMD implementation of clamping 69 | auto simd_clamp = [&]() { 70 | __m256 min_val = _mm256_set1_ps(5.0f); 71 | __m256 max_val = _mm256_set1_ps(30.0f); 72 | 73 | // First, clamp to upper bound (min operation) 74 | __m256 upper_clamped = _mm256_min_ps(vector2, max_val); 75 | 76 | // Then, clamp to lower bound (max operation) 77 | __m256 result = _mm256_max_ps(upper_clamped, min_val); 78 | 79 | _mm256_store_ps(result_simd, result); 80 | }; 81 | 82 | // Execute both implementations 83 | scalar_clamp(); 84 | simd_clamp(); 85 | 86 | // Print results 87 | std::cout << "Scalar clamping result: ["; 88 | for (int i = 0; i < 7; i++) { 89 | std::cout << result_scalar[i] << ", "; 90 | } 91 | std::cout << result_scalar[7] << "]" << std::endl; 92 | 93 | std::cout << "SIMD clamping result: ["; 94 | for (int i = 0; i < 7; i++) { 95 | std::cout << result_simd[i] << ", "; 96 | } 97 | std::cout << result_simd[7] << "]" << std::endl; 98 | 99 | // Benchmark comparison 100 | benchmark_comparison("Clamping", scalar_clamp, simd_clamp, 200); 101 | std::cout << std::endl; 102 | 103 | // --------- 2. Filtering Positive Values ------------- 104 | std::cout << "2. Filtering Positive Values" << std::endl; 105 | std::cout << "---------------------------------------------------" << std::endl; 106 | std::cout << "Creating a mask for positive values in Vector 2" << std::endl; 107 | std::cout << std::endl; 108 | 109 | // Create a mask for positive values 110 | __m256 zero = _mm256_setzero_ps(); 111 | __m256 positive_mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 112 | 113 | // Print the mask (all bits set for true, all bits clear for false) 114 | float8 mask_values(positive_mask); 115 | std::cout << "Positive mask (as floats): ["; 116 | for (int i = 0; i < 7; i++) { 117 | std::cout << mask_values.a[i] << ", "; 118 | } 119 | std::cout << mask_values.a[7] << "]" << std::endl; 120 | 121 | // Convert the mask to a bitmask (one bit per element) 122 | int bitmask = _mm256_movemask_ps(positive_mask); 123 | std::cout << "Positive mask (as bitmask): " << std::bitset<8>(bitmask) << " (decimal: " << bitmask << ")" << std::endl; 124 | 125 | // Explain the bitmask 126 | std::cout << "Explanation: Positions 1, 2, 4, and 6 have positive values," << std::endl; 127 | std::cout << "corresponding to bits 1, 2, 4, and 6 in the bitmask." 
<< std::endl; 128 | std::cout << "As a decimal: 2^1 + 2^2 + 2^4 + 2^6 = 2 + 4 + 16 + 64 = 86" << std::endl; 129 | std::cout << std::endl; 130 | 131 | // Scalar implementation of filtering 132 | auto scalar_filter = [&]() { 133 | for (int i = 0; i < 8; i++) { 134 | if (data2[i] > 0) { 135 | result_scalar[i] = data2[i]; 136 | } else { 137 | result_scalar[i] = 0.0f; 138 | } 139 | } 140 | }; 141 | 142 | // SIMD implementation of filtering 143 | auto simd_filter = [&]() { 144 | __m256 mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 145 | __m256 result = _mm256_and_ps(vector2, mask); // Keep only positive values 146 | _mm256_store_ps(result_simd, result); 147 | }; 148 | 149 | // Execute both implementations 150 | scalar_filter(); 151 | simd_filter(); 152 | 153 | // Print results 154 | std::cout << "Scalar filtering result: ["; 155 | for (int i = 0; i < 7; i++) { 156 | std::cout << result_scalar[i] << ", "; 157 | } 158 | std::cout << result_scalar[7] << "]" << std::endl; 159 | 160 | std::cout << "SIMD filtering result: ["; 161 | for (int i = 0; i < 7; i++) { 162 | std::cout << result_simd[i] << ", "; 163 | } 164 | std::cout << result_simd[7] << "]" << std::endl; 165 | 166 | // Benchmark comparison 167 | benchmark_comparison("Filtering", scalar_filter, simd_filter, 200); 168 | std::cout << std::endl; 169 | 170 | // --------- 3. Complex Conditional Operations ------------- 171 | std::cout << "3. Complex Conditional Operations" << std::endl; 172 | std::cout << "---------------------------------------------------" << std::endl; 173 | std::cout << "Finding values in Vector 2 that are both positive and greater than Vector 1" << std::endl; 174 | std::cout << std::endl; 175 | 176 | // Create masks for both conditions 177 | __m256 positive_mask2 = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 178 | __m256 greater_mask = _mm256_cmp_ps(vector2, vector1, _CMP_GT_OQ); 179 | 180 | // Combine masks with logical AND 181 | __m256 combined_mask = _mm256_and_ps(positive_mask2, greater_mask); 182 | 183 | // Print the combined mask 184 | float8 combined_mask_values(combined_mask); 185 | std::cout << "Combined mask (as floats): ["; 186 | for (int i = 0; i < 7; i++) { 187 | std::cout << combined_mask_values.a[i] << ", "; 188 | } 189 | std::cout << combined_mask_values.a[7] << "]" << std::endl; 190 | 191 | // Convert the combined mask to a bitmask 192 | int combined_bitmask = _mm256_movemask_ps(combined_mask); 193 | std::cout << "Combined mask (as bitmask): " << std::bitset<8>(combined_bitmask) << " (decimal: " << combined_bitmask << ")" << std::endl; 194 | std::cout << std::endl; 195 | 196 | // Scalar implementation of complex filtering 197 | auto scalar_complex = [&]() { 198 | for (int i = 0; i < 8; i++) { 199 | if (data2[i] > 0 && data2[i] > data1[i]) { 200 | result_scalar[i] = data2[i]; 201 | } else { 202 | result_scalar[i] = 0.0f; 203 | } 204 | } 205 | }; 206 | 207 | // SIMD implementation of complex filtering using blendv 208 | auto simd_complex = [&]() { 209 | __m256 pos_mask = _mm256_cmp_ps(vector2, zero, _CMP_GT_OQ); 210 | __m256 gt_mask = _mm256_cmp_ps(vector2, vector1, _CMP_GT_OQ); 211 | __m256 combined = _mm256_and_ps(pos_mask, gt_mask); 212 | 213 | // Use blendv to select values: if mask is true, take from vector2, else take 0 214 | __m256 result = _mm256_blendv_ps(zero, vector2, combined); 215 | _mm256_store_ps(result_simd, result); 216 | }; 217 | 218 | // Execute both implementations 219 | scalar_complex(); 220 | simd_complex(); 221 | 222 | // Print results 223 | std::cout << "Scalar complex filtering 
result: ["; 224 | for (int i = 0; i < 7; i++) { 225 | std::cout << result_scalar[i] << ", "; 226 | } 227 | std::cout << result_scalar[7] << "]" << std::endl; 228 | 229 | std::cout << "SIMD complex filtering result: ["; 230 | for (int i = 0; i < 7; i++) { 231 | std::cout << result_simd[i] << ", "; 232 | } 233 | std::cout << result_simd[7] << "]" << std::endl; 234 | 235 | // Benchmark comparison 236 | benchmark_comparison("Complex Filtering", scalar_complex, simd_complex, 200); 237 | std::cout << std::endl; 238 | 239 | // --------- 4. Conditional Selection with Blending ------------- 240 | std::cout << "4. Conditional Selection with Blending" << std::endl; 241 | std::cout << "---------------------------------------------------" << std::endl; 242 | std::cout << "Using _mm256_blendv_ps for conditional selection" << std::endl; 243 | std::cout << std::endl; 244 | 245 | // Create a new vector with different values 246 | __m256 vector3 = _mm256_set_ps(80.0f, 70.0f, 60.0f, 50.0f, 40.0f, 30.0f, 20.0f, 10.0f); 247 | print_m256(vector3, "Vector 3"); 248 | 249 | // Create a mask based on a condition (e.g., values > 50) 250 | __m256 threshold = _mm256_set1_ps(50.0f); 251 | __m256 blend_mask = _mm256_cmp_ps(vector3, threshold, _CMP_GT_OQ); 252 | 253 | // Use blendv to select values from vector1 or vector2 based on the mask 254 | __m256 blended = _mm256_blendv_ps(vector1, vector2, blend_mask); 255 | print_m256(blended, "Blended Result (Vector 2 if > 50, else Vector 1)"); 256 | 257 | // Explain the blending operation 258 | std::cout << "Explanation: For each element, if Vector 3 > 50, we take the value from Vector 2," << std::endl; 259 | std::cout << "otherwise we take the value from Vector 1." << std::endl; 260 | std::cout << std::endl; 261 | 262 | // Clean up 263 | free(data1); 264 | free(data2); 265 | free(result_scalar); 266 | free(result_simd); 267 | 268 | return 0; 269 | } 270 | -------------------------------------------------------------------------------- /src/02_Computations/01_simple_maths/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 02_Computations/01_simple_maths - Basic SIMD mathematical operations 8 | * 9 | * This example demonstrates various mathematical operations using SIMD: 10 | * 1. Addition (_mm256_add_ps) 11 | * 2. Subtraction (_mm256_sub_ps) 12 | * 3. Multiplication (_mm256_mul_ps) 13 | * 4. Division (_mm256_div_ps) 14 | * 5. Fused Multiply-Add (_mm256_fmadd_ps) 15 | * 6. Square Root (_mm256_sqrt_ps) 16 | * 7. Minimum/Maximum (_mm256_min_ps, _mm256_max_ps) 17 | * 8. Horizontal operations (_mm256_hadd_ps, _mm256_hsub_ps) 18 | * 19 | * For each operation, we compare the performance of SIMD vs. scalar implementation. 20 | */ 21 | 22 | int main() { 23 | set_benchmark_suite("02_Computations/01_simple_maths"); 24 | 25 | std::cout << "=== SIMD Mathematical Operations ===" << std::endl; 26 | std::cout << std::endl; 27 | 28 | // Initialize test data 29 | float data1[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; 30 | float data2[8] = {8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}; 31 | 32 | // Load data into SIMD vectors 33 | __m256 vector1 = _mm256_loadu_ps(data1); 34 | __m256 vector2 = _mm256_loadu_ps(data2); 35 | 36 | // --------- 1. Addition ------------- 37 | std::cout << "1. 
Addition (_mm256_add_ps)" << std::endl; 38 | std::cout << "---------------------------------------------------" << std::endl; 39 | std::cout << "Adds corresponding elements of two vectors." << std::endl; 40 | std::cout << std::endl; 41 | 42 | // Print input vectors 43 | print_m256(vector1, "Vector 1"); 44 | print_m256(vector2, "Vector 2"); 45 | 46 | // Perform addition 47 | __m256 add_result = _mm256_add_ps(vector1, vector2); 48 | print_m256(add_result, "Addition Result (Vector 1 + Vector 2)"); 49 | 50 | // Compare performance: scalar vs. SIMD 51 | auto scalar_add = [&]() { 52 | float result[8]; 53 | for (int i = 0; i < 8; i++) { 54 | result[i] = data1[i] + data2[i]; 55 | } 56 | }; 57 | 58 | auto simd_add = [&]() { 59 | __m256 result = _mm256_add_ps(vector1, vector2); 60 | }; 61 | 62 | benchmark_comparison("Addition", scalar_add, simd_add); 63 | std::cout << std::endl; 64 | 65 | // --------- 2. Subtraction ------------- 66 | std::cout << "2. Subtraction (_mm256_sub_ps)" << std::endl; 67 | std::cout << "---------------------------------------------------" << std::endl; 68 | std::cout << "Subtracts corresponding elements of two vectors." << std::endl; 69 | std::cout << std::endl; 70 | 71 | // Perform subtraction 72 | __m256 sub_result = _mm256_sub_ps(vector1, vector2); 73 | print_m256(sub_result, "Subtraction Result (Vector 1 - Vector 2)"); 74 | 75 | // Compare performance: scalar vs. SIMD 76 | auto scalar_sub = [&]() { 77 | float result[8]; 78 | for (int i = 0; i < 8; i++) { 79 | result[i] = data1[i] - data2[i]; 80 | } 81 | }; 82 | 83 | auto simd_sub = [&]() { 84 | __m256 result = _mm256_sub_ps(vector1, vector2); 85 | }; 86 | 87 | benchmark_comparison("Subtraction", scalar_sub, simd_sub); 88 | std::cout << std::endl; 89 | 90 | // --------- 3. Multiplication ------------- 91 | std::cout << "3. Multiplication (_mm256_mul_ps)" << std::endl; 92 | std::cout << "---------------------------------------------------" << std::endl; 93 | std::cout << "Multiplies corresponding elements of two vectors." << std::endl; 94 | std::cout << std::endl; 95 | 96 | // Perform multiplication 97 | __m256 mul_result = _mm256_mul_ps(vector1, vector2); 98 | print_m256(mul_result, "Multiplication Result (Vector 1 * Vector 2)"); 99 | 100 | // Compare performance: scalar vs. SIMD 101 | auto scalar_mul = [&]() { 102 | float result[8]; 103 | for (int i = 0; i < 8; i++) { 104 | result[i] = data1[i] * data2[i]; 105 | } 106 | }; 107 | 108 | auto simd_mul = [&]() { 109 | __m256 result = _mm256_mul_ps(vector1, vector2); 110 | }; 111 | 112 | benchmark_comparison("Multiplication", scalar_mul, simd_mul); 113 | std::cout << std::endl; 114 | 115 | // --------- 4. Division ------------- 116 | std::cout << "4. Division (_mm256_div_ps)" << std::endl; 117 | std::cout << "---------------------------------------------------" << std::endl; 118 | std::cout << "Divides corresponding elements of two vectors." << std::endl; 119 | std::cout << std::endl; 120 | 121 | // Perform division 122 | __m256 div_result = _mm256_div_ps(vector1, vector2); 123 | print_m256(div_result, "Division Result (Vector 1 / Vector 2)"); 124 | 125 | // Compare performance: scalar vs. SIMD 126 | auto scalar_div = [&]() { 127 | float result[8]; 128 | for (int i = 0; i < 8; i++) { 129 | result[i] = data1[i] / data2[i]; 130 | } 131 | }; 132 | 133 | auto simd_div = [&]() { 134 | __m256 result = _mm256_div_ps(vector1, vector2); 135 | }; 136 | 137 | benchmark_comparison("Division", scalar_div, simd_div); 138 | std::cout << std::endl; 139 | 140 | // --------- 5. 
Fused Multiply-Add ------------- 141 | std::cout << "5. Fused Multiply-Add (_mm256_fmadd_ps)" << std::endl; 142 | std::cout << "---------------------------------------------------" << std::endl; 143 | std::cout << "Performs a fused multiply-add operation: a*b + c" << std::endl; 144 | std::cout << "This is more accurate and faster than separate multiply and add." << std::endl; 145 | std::cout << std::endl; 146 | 147 | // Create a third vector for FMA 148 | __m256 vector3 = _mm256_set1_ps(2.0f); 149 | print_m256(vector3, "Vector 3"); 150 | 151 | // Perform FMA: vector1 * vector2 + vector3 152 | __m256 fma_result = _mm256_fmadd_ps(vector1, vector2, vector3); 153 | print_m256(fma_result, "FMA Result (Vector 1 * Vector 2 + Vector 3)"); 154 | 155 | // Compare performance: scalar vs. SIMD 156 | auto scalar_fma = [&]() { 157 | float result[8]; 158 | for (int i = 0; i < 8; i++) { 159 | result[i] = data1[i] * data2[i] + 2.0f; 160 | } 161 | }; 162 | 163 | auto simd_fma = [&]() { 164 | __m256 result = _mm256_fmadd_ps(vector1, vector2, vector3); 165 | }; 166 | 167 | benchmark_comparison("Fused Multiply-Add", scalar_fma, simd_fma); 168 | std::cout << std::endl; 169 | 170 | // --------- 6. Square Root ------------- 171 | std::cout << "6. Square Root (_mm256_sqrt_ps)" << std::endl; 172 | std::cout << "---------------------------------------------------" << std::endl; 173 | std::cout << "Computes the square root of each element in a vector." << std::endl; 174 | std::cout << std::endl; 175 | 176 | // Create a vector of positive values 177 | __m256 pos_vector = _mm256_set_ps(64.0f, 49.0f, 36.0f, 25.0f, 16.0f, 9.0f, 4.0f, 1.0f); 178 | print_m256(pos_vector, "Input Vector"); 179 | 180 | // Compute square root 181 | __m256 sqrt_result = _mm256_sqrt_ps(pos_vector); 182 | print_m256(sqrt_result, "Square Root Result"); 183 | 184 | // Compare performance: scalar vs. SIMD 185 | auto scalar_sqrt = [&]() { 186 | float result[8]; 187 | union { 188 | __m256 v; 189 | float a[8]; 190 | } u; 191 | u.v = pos_vector; 192 | for (int i = 0; i < 8; i++) { 193 | result[i] = std::sqrt(u.a[i]); 194 | } 195 | }; 196 | 197 | auto simd_sqrt = [&]() { 198 | __m256 result = _mm256_sqrt_ps(pos_vector); 199 | }; 200 | 201 | benchmark_comparison("Square Root", scalar_sqrt, simd_sqrt); 202 | std::cout << std::endl; 203 | 204 | // --------- 7. Min/Max Operations ------------- 205 | std::cout << "7. Min/Max Operations (_mm256_min_ps, _mm256_max_ps)" << std::endl; 206 | std::cout << "---------------------------------------------------" << std::endl; 207 | std::cout << "Computes the minimum or maximum of corresponding elements." << std::endl; 208 | std::cout << std::endl; 209 | 210 | // Print input vectors again 211 | print_m256(vector1, "Vector 1"); 212 | print_m256(vector2, "Vector 2"); 213 | 214 | // Compute min and max 215 | __m256 min_result = _mm256_min_ps(vector1, vector2); 216 | __m256 max_result = _mm256_max_ps(vector1, vector2); 217 | 218 | print_m256(min_result, "Minimum Result"); 219 | print_m256(max_result, "Maximum Result"); 220 | 221 | // Compare performance: scalar vs. SIMD for min 222 | auto scalar_min = [&]() { 223 | float result[8]; 224 | for (int i = 0; i < 8; i++) { 225 | result[i] = std::min(data1[i], data2[i]); 226 | } 227 | }; 228 | 229 | auto simd_min = [&]() { 230 | __m256 result = _mm256_min_ps(vector1, vector2); 231 | }; 232 | 233 | benchmark_comparison("Minimum", scalar_min, simd_min); 234 | std::cout << std::endl; 235 | 236 | // --------- 8. Horizontal Operations ------------- 237 | std::cout << "8. 
Horizontal Operations (_mm256_hadd_ps, _mm256_hsub_ps)" << std::endl; 238 | std::cout << "---------------------------------------------------" << std::endl; 239 | std::cout << "Performs horizontal addition or subtraction of adjacent elements." << std::endl; 240 | std::cout << std::endl; 241 | 242 | // Create test vectors 243 | __m256 hadd_vec1 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 244 | __m256 hadd_vec2 = _mm256_set_ps(16.0f, 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f); 245 | 246 | print_m256(hadd_vec1, "Vector A"); 247 | print_m256(hadd_vec2, "Vector B"); 248 | 249 | // Perform horizontal addition 250 | // This adds adjacent pairs: (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) 251 | __m256 hadd_result = _mm256_hadd_ps(hadd_vec1, hadd_vec2); 252 | print_m256(hadd_result, "Horizontal Addition Result"); 253 | 254 | // Perform horizontal subtraction 255 | // This subtracts adjacent pairs: (a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7) 256 | __m256 hsub_result = _mm256_hsub_ps(hadd_vec1, hadd_vec2); 257 | print_m256(hsub_result, "Horizontal Subtraction Result"); 258 | 259 | // Note: Horizontal operations are typically slower than vertical operations 260 | // They are useful for specific algorithms like dot products and matrix operations 261 | 262 | return 0; 263 | } 264 | -------------------------------------------------------------------------------- /src/01_Basics/02_initializing_data/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /** 7 | * 01_Basics/02_initializing_data - Different ways to initialize SIMD vectors 8 | * 9 | * This example demonstrates various methods to initialize SIMD vectors: 10 | * 1. _mm256_setzero_ps/pd/si256 - Initialize all elements to zero 11 | * 2. _mm256_set1_ps/pd/epi32/etc - Initialize all elements to the same value 12 | * 3. _mm256_set_ps/pd/epi32/etc - Initialize each element individually 13 | * 4. _mm256_setr_ps/pd/epi32/etc - Initialize each element in reverse order 14 | * 15 | * We'll also compare the performance of SIMD initialization vs. standard array initialization. 16 | */ 17 | 18 | // Constants 19 | // A lighter loop count keeps turnaround snappy while still magnifying the perf gap. 20 | constexpr int NUM_ITERATIONS = 100000; 21 | 22 | template 23 | void printArray(const T (&arr)[N], const std::string &description) { 24 | std::cout << description << ": "; 25 | for (size_t i = 0; i < N; ++i) { 26 | std::cout << arr[i] << ", "; 27 | } 28 | std::cout << std::endl; 29 | } 30 | 31 | void copyFromSIMD(float* dest, const __m256& src) { 32 | _mm256_storeu_ps(dest, src); 33 | } 34 | 35 | void copyFromSIMD(double* dest, const __m256d& src) { 36 | _mm256_storeu_pd(dest, src); 37 | } 38 | 39 | void copyFromSIMD(int* dest, const __m256i& src) { 40 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest), src); 41 | } 42 | 43 | void copyFromSIMD(short* dest, const __m256i& src) { 44 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest), src); 45 | } 46 | 47 | int main() { 48 | set_benchmark_suite("01_Basics/02_initializing_data"); 49 | 50 | std::cout << "=== SIMD Data Initialization Methods ===" << std::endl; 51 | std::cout << std::endl; 52 | 53 | // --------- 1. Zero Initialization (_mm256_setzero_*) ------------- 54 | std::cout << "1. 
Zero Initialization (_mm256_setzero_*)" << std::endl; 55 | std::cout << "---------------------------------------------------" << std::endl; 56 | std::cout << "Initializes all elements of a SIMD vector to zero." << std::endl; 57 | std::cout << std::endl; 58 | 59 | // Standard method for float array 60 | float std_float_array[8]; 61 | auto start = std::chrono::high_resolution_clock::now(); 62 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 63 | for (int lane = 0; lane < 8; ++lane) { 64 | std_float_array[lane] = 0.0f; 65 | } 66 | } 67 | auto stop = std::chrono::high_resolution_clock::now(); 68 | auto duration_std = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); 69 | 70 | // SIMD method for float vector 71 | __m256 simd_float_vec; 72 | start = std::chrono::high_resolution_clock::now(); 73 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 74 | simd_float_vec = _mm256_setzero_ps(); 75 | } 76 | stop = std::chrono::high_resolution_clock::now(); 77 | auto duration_simd = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); 78 | 79 | // Print results 80 | std::cout << "Float Zero Initialization:" << std::endl; 81 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 82 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 83 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 84 | << static_cast<double>(duration_std.count()) / duration_simd.count() << "x" << std::endl; 85 | 86 | // Print the SIMD vector 87 | print_m256(simd_float_vec, "Zero-initialized float vector"); 88 | 89 | // Also demonstrate zero initialization for integers and doubles 90 | __m256i simd_int_vec = _mm256_setzero_si256(); 91 | __m256d simd_double_vec = _mm256_setzero_pd(); 92 | 93 | print_m256i(simd_int_vec, "Zero-initialized integer vector"); 94 | print_m256d(simd_double_vec, "Zero-initialized double vector"); 95 | std::cout << std::endl; 96 | 97 | // --------- 2. Broadcast Initialization (_mm256_set1_*) ------------- 98 | std::cout << "2. Broadcast Initialization (_mm256_set1_*)" << std::endl; 99 | std::cout << "---------------------------------------------------" << std::endl; 100 | std::cout << "Initializes all elements of a SIMD vector to the same value."
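// In typical code the broadcast happens once, outside the hot loop, and the register is then
// reused for every block of data. A minimal sketch (hypothetical helper; assumes n is a
// multiple of 8):
//
//   void scale(float* data, int n, float factor) {
//       const __m256 f = _mm256_set1_ps(factor);   // broadcast the scalar once
//       for (int i = 0; i < n; i += 8) {
//           __m256 v = _mm256_loadu_ps(data + i);
//           _mm256_storeu_ps(data + i, _mm256_mul_ps(v, f));
//       }
//   }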
<< std::endl; 101 | std::cout << std::endl; 102 | 103 | // Standard method for double array 104 | double std_double_array[4]; 105 | start = std::chrono::high_resolution_clock::now(); 106 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 107 | for (int lane = 0; lane < 4; ++lane) { 108 | std_double_array[lane] = 10.0; 109 | } 110 | } 111 | stop = std::chrono::high_resolution_clock::now(); 112 | duration_std = std::chrono::duration_cast(stop - start); 113 | 114 | // SIMD method for double vector 115 | __m256d simd_double_vec2; 116 | start = std::chrono::high_resolution_clock::now(); 117 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 118 | simd_double_vec2 = _mm256_set1_pd(10.0); 119 | } 120 | stop = std::chrono::high_resolution_clock::now(); 121 | duration_simd = std::chrono::duration_cast(stop - start); 122 | 123 | // Print results 124 | std::cout << "Double Broadcast Initialization:" << std::endl; 125 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 126 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 127 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 128 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 129 | 130 | // Print the SIMD vector 131 | print_m256d(simd_double_vec2, "Broadcast-initialized double vector (10.0)"); 132 | 133 | // Also demonstrate broadcast initialization for floats and integers 134 | __m256 simd_float_vec2 = _mm256_set1_ps(42.0f); 135 | __m256i simd_int_vec2 = _mm256_set1_epi32(100); 136 | 137 | print_m256(simd_float_vec2, "Broadcast-initialized float vector (42.0)"); 138 | print_m256i(simd_int_vec2, "Broadcast-initialized integer vector (100)"); 139 | std::cout << std::endl; 140 | 141 | // --------- 3. Individual Element Initialization (_mm256_set_*) ------------- 142 | std::cout << "3. Individual Element Initialization (_mm256_set_*)" << std::endl; 143 | std::cout << "---------------------------------------------------" << std::endl; 144 | std::cout << "Initializes each element of a SIMD vector individually." << std::endl; 145 | std::cout << "Note: Elements are specified in reverse order (high to low)." 
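// The argument order only changes how the literal is written: the two calls below build the
// same register, and lane 0 always lands at the lowest address when the vector is stored.
// A small equivalence sketch (assuming a local float out[8]):
//
//   __m256 a = _mm256_set_ps (8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);  // high lane first
//   __m256 b = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);  // low lane first
//   float out[8];
//   _mm256_storeu_ps(out, a);   // out = {1, 2, 3, 4, 5, 6, 7, 8}; storing b gives the same bytes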
<< std::endl; 146 | std::cout << std::endl; 147 | 148 | // Standard method for int array 149 | int std_int_array[8]; 150 | start = std::chrono::high_resolution_clock::now(); 151 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 152 | for (int lane = 0; lane < 8; ++lane) { 153 | std_int_array[lane] = lane + 1; 154 | } 155 | } 156 | stop = std::chrono::high_resolution_clock::now(); 157 | duration_std = std::chrono::duration_cast(stop - start); 158 | 159 | // SIMD method for int vector 160 | __m256i simd_int_vec3; 161 | start = std::chrono::high_resolution_clock::now(); 162 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 163 | // Note: _mm256_set_epi32 takes arguments in reverse order (high to low) 164 | simd_int_vec3 = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); 165 | } 166 | stop = std::chrono::high_resolution_clock::now(); 167 | duration_simd = std::chrono::duration_cast(stop - start); 168 | 169 | // Print results 170 | std::cout << "Integer Individual Initialization:" << std::endl; 171 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 172 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 173 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 174 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 175 | 176 | // Print the SIMD vector 177 | print_m256i(simd_int_vec3, "Individually-initialized integer vector"); 178 | 179 | // Also demonstrate individual initialization for floats and doubles 180 | __m256 simd_float_vec3 = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f); 181 | __m256d simd_double_vec3 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); 182 | 183 | print_m256(simd_float_vec3, "Individually-initialized float vector"); 184 | print_m256d(simd_double_vec3, "Individually-initialized double vector"); 185 | std::cout << std::endl; 186 | 187 | // --------- 4. Reverse Order Initialization (_mm256_setr_*) ------------- 188 | std::cout << "4. Reverse Order Initialization (_mm256_setr_*)" << std::endl; 189 | std::cout << "---------------------------------------------------" << std::endl; 190 | std::cout << "Initializes each element of a SIMD vector individually in natural order." << std::endl; 191 | std::cout << "Note: Elements are specified in natural order (low to high)." 
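// When the sixteen values already sit in memory, spelling them out in _mm256_setr_epi16 is
// rarely necessary; a single load yields the same register. A brief sketch (assuming a
// populated short src[16]):
//
//   short src[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
//   __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
//
// The set/setr forms are most useful when the lanes are computed scalars or compile-time
// constants rather than data that is already contiguous.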
<< std::endl; 192 | std::cout << std::endl; 193 | 194 | // Standard method for short array 195 | short std_short_array[16]; 196 | start = std::chrono::high_resolution_clock::now(); 197 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 198 | for (int lane = 0; lane < 16; ++lane) { 199 | std_short_array[lane] = static_cast(lane + 1); 200 | } 201 | } 202 | stop = std::chrono::high_resolution_clock::now(); 203 | duration_std = std::chrono::duration_cast(stop - start); 204 | 205 | // SIMD method for short vector 206 | __m256i simd_short_vec; 207 | start = std::chrono::high_resolution_clock::now(); 208 | for (int i = 0; i < NUM_ITERATIONS; ++i) { 209 | // Note: _mm256_setr_epi16 takes arguments in natural order (low to high) 210 | simd_short_vec = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 211 | } 212 | stop = std::chrono::high_resolution_clock::now(); 213 | duration_simd = std::chrono::duration_cast(stop - start); 214 | 215 | // Print results 216 | std::cout << "Short Reverse Order Initialization:" << std::endl; 217 | std::cout << " Standard method: " << duration_std.count() << " microseconds" << std::endl; 218 | std::cout << " SIMD method: " << duration_simd.count() << " microseconds" << std::endl; 219 | std::cout << " Speedup: " << std::fixed << std::setprecision(2) 220 | << static_cast(duration_std.count()) / duration_simd.count() << "x" << std::endl; 221 | 222 | // Print the SIMD vector (first 8 elements) 223 | // Note: We need to extract the shorts from the __m256i 224 | short short_array[16]; 225 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(short_array), simd_short_vec); 226 | 227 | std::cout << "Reverse-initialized short vector: ["; 228 | for (int i = 0; i < 15; i++) { 229 | std::cout << short_array[i] << ", "; 230 | } 231 | std::cout << short_array[15] << "]" << std::endl; 232 | 233 | // Also demonstrate reverse initialization for floats 234 | __m256 simd_float_vec4 = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); 235 | print_m256(simd_float_vec4, "Reverse-initialized float vector"); 236 | 237 | return 0; 238 | } 239 | 240 | -------------------------------------------------------------------------------- /src/02_Computations/02_dot_product/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /** 8 | * 02_Computations/02_dot_product - Implementing vector dot products with SIMD 9 | * 10 | * This example demonstrates different ways to calculate dot products using SIMD: 11 | * 1. Scalar implementation (baseline) 12 | * 2. Basic SIMD implementation using separate vectors for x, y, z components 13 | * 3. SIMD implementation with Structure of Arrays (SoA) layout 14 | * 4. SIMD implementation with horizontal addition 15 | * 5. 
SIMD implementation for large arrays (batch processing) 16 | * 17 | * The dot product is a fundamental operation in many fields including: 18 | * - Computer graphics (lighting calculations, projections) 19 | * - Machine learning (neural networks, similarity measures) 20 | * - Physics simulations (force calculations) 21 | */ 22 | 23 | // 3D vector structure (Array of Structures layout) 24 | struct Vec3 { 25 | float x, y, z; 26 | 27 | Vec3(float x = 0.0f, float y = 0.0f, float z = 0.0f) : x(x), y(y), z(z) {} 28 | 29 | // Scalar dot product 30 | float dot(const Vec3& other) const { 31 | return x * other.x + y * other.y + z * other.z; 32 | } 33 | }; 34 | 35 | // Structure of Arrays layout for better SIMD performance 36 | struct Vec3Array { 37 | std::vector x; 38 | std::vector y; 39 | std::vector z; 40 | 41 | Vec3Array(size_t size) : x(size), y(size), z(size) {} 42 | 43 | void set(size_t index, float x_val, float y_val, float z_val) { 44 | x[index] = x_val; 45 | y[index] = y_val; 46 | z[index] = z_val; 47 | } 48 | 49 | void set(size_t index, const Vec3& vec) { 50 | x[index] = vec.x; 51 | y[index] = vec.y; 52 | z[index] = vec.z; 53 | } 54 | }; 55 | 56 | // Generate random 3D vectors 57 | std::vector generateRandomVectors(size_t count) { 58 | std::random_device rd; 59 | std::mt19937 gen(rd()); 60 | std::uniform_real_distribution dist(-1.0f, 1.0f); 61 | 62 | std::vector vectors; 63 | vectors.reserve(count); 64 | 65 | for (size_t i = 0; i < count; i++) { 66 | vectors.emplace_back(dist(gen), dist(gen), dist(gen)); 67 | } 68 | 69 | return vectors; 70 | } 71 | 72 | // Convert Array of Structures to Structure of Arrays 73 | Vec3Array convertToSoA(const std::vector& vectors) { 74 | Vec3Array result(vectors.size()); 75 | 76 | for (size_t i = 0; i < vectors.size(); i++) { 77 | result.set(i, vectors[i]); 78 | } 79 | 80 | return result; 81 | } 82 | 83 | // 1. Scalar dot product implementation 84 | float scalarDotProduct(const std::vector& vectors1, const std::vector& vectors2) { 85 | float sum = 0.0f; 86 | for (size_t i = 0; i < vectors1.size(); i++) { 87 | sum += vectors1[i].dot(vectors2[i]); 88 | } 89 | return sum; 90 | } 91 | 92 | // 2. Basic SIMD dot product implementation (for 8 vectors at a time) 93 | __m256 simdDotProduct8(const std::vector& vectors1, const std::vector& vectors2) { 94 | // Load x, y, z components into separate SIMD registers 95 | float x1[8], y1[8], z1[8]; 96 | float x2[8], y2[8], z2[8]; 97 | 98 | for (int i = 0; i < 8; i++) { 99 | x1[i] = vectors1[i].x; 100 | y1[i] = vectors1[i].y; 101 | z1[i] = vectors1[i].z; 102 | x2[i] = vectors2[i].x; 103 | y2[i] = vectors2[i].y; 104 | z2[i] = vectors2[i].z; 105 | } 106 | 107 | // Load data into SIMD registers 108 | __m256 vx1 = _mm256_loadu_ps(x1); 109 | __m256 vy1 = _mm256_loadu_ps(y1); 110 | __m256 vz1 = _mm256_loadu_ps(z1); 111 | __m256 vx2 = _mm256_loadu_ps(x2); 112 | __m256 vy2 = _mm256_loadu_ps(y2); 113 | __m256 vz2 = _mm256_loadu_ps(z2); 114 | 115 | // Compute dot products using FMA (Fused Multiply-Add) 116 | // (x1*x2 + y1*y2 + z1*z2) 117 | __m256 result = _mm256_mul_ps(vx1, vx2); // x1*x2 118 | result = _mm256_fmadd_ps(vy1, vy2, result); // x1*x2 + y1*y2 119 | result = _mm256_fmadd_ps(vz1, vz2, result); // x1*x2 + y1*y2 + z1*z2 120 | 121 | return result; 122 | } 123 | 124 | // 3. 
SIMD dot product with Structure of Arrays layout 125 | __m256 simdDotProductSoA8(const Vec3Array& vectors1, const Vec3Array& vectors2, size_t offset) { 126 | // Load data into SIMD registers directly from SoA structure 127 | __m256 vx1 = _mm256_loadu_ps(&vectors1.x[offset]); 128 | __m256 vy1 = _mm256_loadu_ps(&vectors1.y[offset]); 129 | __m256 vz1 = _mm256_loadu_ps(&vectors1.z[offset]); 130 | __m256 vx2 = _mm256_loadu_ps(&vectors2.x[offset]); 131 | __m256 vy2 = _mm256_loadu_ps(&vectors2.y[offset]); 132 | __m256 vz2 = _mm256_loadu_ps(&vectors2.z[offset]); 133 | 134 | // Compute dot products using FMA (Fused Multiply-Add) 135 | __m256 result = _mm256_mul_ps(vx1, vx2); 136 | result = _mm256_fmadd_ps(vy1, vy2, result); 137 | result = _mm256_fmadd_ps(vz1, vz2, result); 138 | 139 | return result; 140 | } 141 | 142 | // 4. SIMD dot product with horizontal addition (for a single dot product) 143 | float simdDotProductSingle(const Vec3& v1, const Vec3& v2) { 144 | // Load vector components into SIMD registers 145 | __m128 vec1 = _mm_setr_ps(v1.x, v1.y, v1.z, 0.0f); 146 | __m128 vec2 = _mm_setr_ps(v2.x, v2.y, v2.z, 0.0f); 147 | 148 | // Multiply components 149 | __m128 mul = _mm_mul_ps(vec1, vec2); 150 | 151 | // Horizontal addition to sum up components 152 | // First add pairs: (x+y, z+0, x+y, z+0) 153 | __m128 hadd1 = _mm_hadd_ps(mul, mul); 154 | // Then add pairs again: (x+y+z+0, x+y+z+0, x+y+z+0, x+y+z+0) 155 | __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); 156 | 157 | // Extract the result (first element) 158 | return _mm_cvtss_f32(hadd2); 159 | } 160 | 161 | // 5. SIMD dot product for large arrays 162 | float simdDotProductLarge(const Vec3Array& vectors1, const Vec3Array& vectors2) { 163 | size_t size = vectors1.x.size(); 164 | size_t blocks = size / 8; 165 | size_t remainder = size % 8; 166 | 167 | // Process 8 vectors at a time 168 | __m256 sum = _mm256_setzero_ps(); 169 | for (size_t i = 0; i < blocks; i++) { 170 | __m256 dot8 = simdDotProductSoA8(vectors1, vectors2, i * 8); 171 | sum = _mm256_add_ps(sum, dot8); 172 | } 173 | 174 | // Horizontal sum of the 8 dot products 175 | float result_array[8]; 176 | _mm256_storeu_ps(result_array, sum); 177 | float total = 0.0f; 178 | for (int i = 0; i < 8; i++) { 179 | total += result_array[i]; 180 | } 181 | 182 | // Process remaining vectors 183 | for (size_t i = blocks * 8; i < size; i++) { 184 | Vec3 v1(vectors1.x[i], vectors1.y[i], vectors1.z[i]); 185 | Vec3 v2(vectors2.x[i], vectors2.y[i], vectors2.z[i]); 186 | total += v1.dot(v2); 187 | } 188 | 189 | return total; 190 | } 191 | 192 | int main() { 193 | set_benchmark_suite("02_Computations/02_dot_product"); 194 | 195 | std::cout << "=== SIMD Dot Product Implementations ===" << std::endl; 196 | std::cout << std::endl; 197 | 198 | // Generate random test vectors 199 | const size_t NUM_VECTORS = 1024; 200 | std::vector vectors1 = generateRandomVectors(NUM_VECTORS); 201 | std::vector vectors2 = generateRandomVectors(NUM_VECTORS); 202 | 203 | // Convert to Structure of Arrays for more efficient SIMD processing 204 | Vec3Array soa_vectors1 = convertToSoA(vectors1); 205 | Vec3Array soa_vectors2 = convertToSoA(vectors2); 206 | 207 | // --------- 1. Basic Dot Product Comparison ------------- 208 | std::cout << "1. Basic Dot Product (8 vectors)" << std::endl; 209 | std::cout << "---------------------------------------------------" << std::endl; 210 | std::cout << "Comparing scalar vs. SIMD implementation for 8 vectors." 
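// The reduction at the end of simdDotProductLarge stores the partial sums and adds them with
// a scalar loop; the same fold can also be done entirely in registers. A hedged sketch of one
// common pattern (several equivalent shuffle sequences exist):
//
//   inline float hsum256(__m256 v) {
//       __m128 lo = _mm256_castps256_ps128(v);           // lanes 0..3
//       __m128 hi = _mm256_extractf128_ps(v, 1);         // lanes 4..7
//       __m128 s  = _mm_add_ps(lo, hi);                  // 4 partial sums
//       s = _mm_add_ps(s, _mm_movehl_ps(s, s));          // 2 partial sums
//       s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));   // total in lane 0
//       return _mm_cvtss_f32(s);
//   }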
<< std::endl; 211 | std::cout << std::endl; 212 | 213 | // Calculate dot products using scalar method 214 | float scalar_results[8]; 215 | for (int i = 0; i < 8; i++) { 216 | scalar_results[i] = vectors1[i].dot(vectors2[i]); 217 | } 218 | 219 | // Calculate dot products using SIMD 220 | __m256 simd_result = simdDotProduct8(vectors1, vectors2); 221 | float simd_results[8]; 222 | _mm256_storeu_ps(simd_results, simd_result); 223 | 224 | // Print and compare results 225 | std::cout << "Scalar results: ["; 226 | for (int i = 0; i < 7; i++) { 227 | std::cout << scalar_results[i] << ", "; 228 | } 229 | std::cout << scalar_results[7] << "]" << std::endl; 230 | 231 | std::cout << "SIMD results: ["; 232 | for (int i = 0; i < 7; i++) { 233 | std::cout << simd_results[i] << ", "; 234 | } 235 | std::cout << simd_results[7] << "]" << std::endl; 236 | std::cout << std::endl; 237 | 238 | // --------- 2. Performance Comparison ------------- 239 | std::cout << "2. Performance Comparison" << std::endl; 240 | std::cout << "---------------------------------------------------" << std::endl; 241 | std::cout << "Comparing performance of different dot product implementations." << std::endl; 242 | std::cout << std::endl; 243 | 244 | // Benchmark scalar implementation 245 | auto scalar_benchmark = [&]() { 246 | volatile float result = scalarDotProduct(vectors1, vectors2); 247 | }; 248 | 249 | // Benchmark SIMD implementation with AoS layout 250 | auto simd_aos_benchmark = [&]() { 251 | float total = 0.0f; 252 | for (size_t i = 0; i < NUM_VECTORS; i += 8) { 253 | size_t remaining = std::min(size_t(8), NUM_VECTORS - i); 254 | if (remaining < 8) break; // Skip incomplete blocks for simplicity 255 | 256 | std::vector block1(vectors1.begin() + i, vectors1.begin() + i + 8); 257 | std::vector block2(vectors2.begin() + i, vectors2.begin() + i + 8); 258 | 259 | __m256 result = simdDotProduct8(block1, block2); 260 | float results[8]; 261 | _mm256_storeu_ps(results, result); 262 | 263 | for (int j = 0; j < 8; j++) { 264 | total += results[j]; 265 | } 266 | } 267 | }; 268 | 269 | // Benchmark SIMD implementation with SoA layout 270 | auto simd_soa_benchmark = [&]() { 271 | volatile float result = simdDotProductLarge(soa_vectors1, soa_vectors2); 272 | }; 273 | 274 | // Run benchmarks 275 | benchmark_comparison("Dot Product (1024 vectors)", scalar_benchmark, simd_soa_benchmark, 50); 276 | std::cout << std::endl; 277 | 278 | // --------- 3. Structure of Arrays vs Array of Structures ------------- 279 | std::cout << "3. Structure of Arrays vs Array of Structures" << std::endl; 280 | std::cout << "---------------------------------------------------" << std::endl; 281 | std::cout << "Comparing AoS vs SoA memory layouts for SIMD processing." << std::endl; 282 | std::cout << std::endl; 283 | 284 | benchmark_comparison("AoS vs SoA", simd_aos_benchmark, simd_soa_benchmark, 50); 285 | std::cout << std::endl; 286 | 287 | // --------- 4. Single Vector Dot Product ------------- 288 | std::cout << "4. Single Vector Dot Product" << std::endl; 289 | std::cout << "---------------------------------------------------" << std::endl; 290 | std::cout << "Using SIMD for a single dot product with horizontal addition." 
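// SSE4.1 (already implied by this project's -mavx2 build flags) also provides a dedicated
// dot-product instruction. A hedged alternative sketch for the same 3-component case; the
// 0x71 immediate means "multiply lanes 0-2, write the sum to lane 0":
//
//   float dp3(const Vec3& a, const Vec3& b) {
//       __m128 va = _mm_setr_ps(a.x, a.y, a.z, 0.0f);
//       __m128 vb = _mm_setr_ps(b.x, b.y, b.z, 0.0f);
//       return _mm_cvtss_f32(_mm_dp_ps(va, vb, 0x71));
//   }
//
// Like the hadd version, this is mainly a convenience: dpps is not particularly fast, and for
// batches of vectors the SoA layout used above remains the better-performing approach.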
<< std::endl; 291 | std::cout << std::endl; 292 | 293 | Vec3 v1(0.5f, -0.3f, 0.8f); 294 | Vec3 v2(0.2f, 0.7f, -0.4f); 295 | 296 | float scalar_dot = v1.dot(v2); 297 | float simd_dot = simdDotProductSingle(v1, v2); 298 | 299 | std::cout << "Vector 1: (" << v1.x << ", " << v1.y << ", " << v1.z << ")" << std::endl; 300 | std::cout << "Vector 2: (" << v2.x << ", " << v2.y << ", " << v2.z << ")" << std::endl; 301 | std::cout << "Scalar dot product: " << scalar_dot << std::endl; 302 | std::cout << "SIMD dot product: " << simd_dot << std::endl; 303 | std::cout << std::endl; 304 | 305 | // Benchmark single vector dot product 306 | auto scalar_single_benchmark = [&]() { 307 | for (int i = 0; i < 1000; i++) { 308 | volatile float result = v1.dot(v2); 309 | } 310 | }; 311 | 312 | auto simd_single_benchmark = [&]() { 313 | for (int i = 0; i < 1000; i++) { 314 | volatile float result = simdDotProductSingle(v1, v2); 315 | } 316 | }; 317 | 318 | // For tiny workloads the SIMD setup costs outweigh the computation, so a lower 319 | // speedup (or even a slowdown) is expected and useful to point out to learners. 320 | benchmark_comparison("Single Dot Product (1000 iterations)", scalar_single_benchmark, simd_single_benchmark, 10); 321 | 322 | return 0; 323 | } 324 | -------------------------------------------------------------------------------- /src/03_Examples/04_image_processing/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | /** 9 | * This example demonstrates using SIMD for basic image processing operations. 10 | * 11 | * We'll implement: 12 | * 1. Brightness adjustment 13 | * 2. Contrast enhancement 14 | * 3. Image blurring (simple box filter) 15 | * 4. Grayscale conversion 16 | * 17 | * For simplicity, we'll use a simulated image represented as a 1D array of pixels, 18 | * where each pixel has R, G, B components (3 bytes per pixel). 19 | */ 20 | 21 | // Simulated image dimensions (kept modest so benchmarks finish quickly) 22 | const int WIDTH = 512; 23 | const int HEIGHT = 384; 24 | const int CHANNELS = 3; // RGB 25 | const int IMAGE_SIZE = WIDTH * HEIGHT * CHANNELS; 26 | 27 | // Utility function to initialize a test image 28 | void initialize_test_image(uint8_t* image, int width, int height, int channels) { 29 | for (int y = 0; y < height; y++) { 30 | for (int x = 0; x < width; x++) { 31 | int idx = (y * width + x) * channels; 32 | 33 | // Create a gradient pattern 34 | image[idx + 0] = static_cast(x * 255 / width); // R 35 | image[idx + 1] = static_cast(y * 255 / height); // G 36 | image[idx + 2] = static_cast(128); // B 37 | } 38 | } 39 | } 40 | 41 | // Print a small section of the image for verification 42 | void print_image_section(const uint8_t* image, int width, int channels, 43 | int start_x, int start_y, int section_width, int section_height) { 44 | std::cout << "Image section (" << start_x << "," << start_y << ") to (" 45 | << start_x + section_width - 1 << "," << start_y + section_height - 1 << "):" << std::endl; 46 | 47 | for (int y = start_y; y < start_y + section_height; y++) { 48 | for (int x = start_x; x < start_x + section_width; x++) { 49 | int idx = (y * width + x) * channels; 50 | std::cout << "(" << static_cast(image[idx + 0]) << "," 51 | << static_cast(image[idx + 1]) << "," 52 | << static_cast(image[idx + 2]) << ") "; 53 | } 54 | std::cout << std::endl; 55 | } 56 | std::cout << std::endl; 57 | } 58 | 59 | // 1. 
Brightness adjustment - Scalar implementation 60 | void adjust_brightness_scalar(uint8_t* image, int size, int brightness) { 61 | for (int i = 0; i < size; i++) { 62 | int value = static_cast(image[i]) + brightness; 63 | image[i] = static_cast(std::min(255, std::max(0, value))); 64 | } 65 | } 66 | 67 | // 1. Brightness adjustment - SIMD implementation 68 | void adjust_brightness_simd(uint8_t* image, int size, int brightness) { 69 | // Create a vector with the brightness value 70 | __m256i brightness_vec = _mm256_set1_epi8(static_cast(brightness)); 71 | __m256i zero_vec = _mm256_setzero_si256(); 72 | __m256i max_vec = _mm256_set1_epi8(static_cast(255)); 73 | 74 | // Process 32 bytes at a time (32 pixels) 75 | int i = 0; 76 | for (; i <= size - 32; i += 32) { 77 | // Load 32 bytes 78 | __m256i pixels = _mm256_loadu_si256(reinterpret_cast(&image[i])); 79 | 80 | // Add brightness 81 | __m256i result = _mm256_adds_epu8(pixels, brightness_vec); 82 | 83 | // Store result 84 | _mm256_storeu_si256(reinterpret_cast<__m256i*>(&image[i]), result); 85 | } 86 | 87 | // Handle remaining pixels 88 | for (; i < size; i++) { 89 | int value = static_cast(image[i]) + brightness; 90 | image[i] = static_cast(std::min(255, std::max(0, value))); 91 | } 92 | } 93 | 94 | // 2. Contrast enhancement - Scalar implementation 95 | void enhance_contrast_scalar(uint8_t* image, int size, float contrast) { 96 | // Apply contrast formula: (pixel - 128) * contrast + 128 97 | for (int i = 0; i < size; i++) { 98 | float value = (static_cast(image[i]) - 128.0f) * contrast + 128.0f; 99 | image[i] = static_cast(std::min(255.0f, std::max(0.0f, value))); 100 | } 101 | } 102 | 103 | // 2. Contrast enhancement - SIMD implementation 104 | void enhance_contrast_simd(uint8_t* image, int size, float contrast) { 105 | // We'll process 8 pixels at a time (converting to float for the calculation) 106 | __m256 contrast_vec = _mm256_set1_ps(contrast); 107 | __m256 offset_vec = _mm256_set1_ps(128.0f); 108 | __m256 min_vec = _mm256_setzero_ps(); 109 | __m256 max_vec = _mm256_set1_ps(255.0f); 110 | 111 | // Process 8 pixels at a time 112 | int i = 0; 113 | for (; i <= size - 8; i += 8) { 114 | // Load 8 bytes and convert to float 115 | __m128i pixels_epi8 = _mm_loadl_epi64(reinterpret_cast(&image[i])); 116 | __m256i pixels_epi32 = _mm256_cvtepu8_epi32(pixels_epi8); 117 | __m256 pixels_ps = _mm256_cvtepi32_ps(pixels_epi32); 118 | 119 | // Apply contrast formula: (pixel - 128) * contrast + 128 120 | __m256 centered = _mm256_sub_ps(pixels_ps, offset_vec); 121 | __m256 scaled = _mm256_mul_ps(centered, contrast_vec); 122 | __m256 result_ps = _mm256_add_ps(scaled, offset_vec); 123 | 124 | // Clamp to [0, 255] 125 | result_ps = _mm256_min_ps(_mm256_max_ps(result_ps, min_vec), max_vec); 126 | 127 | // Convert back to integers and store without requiring AVX-512 128 | __m256i result_epi32 = _mm256_cvtps_epi32(result_ps); 129 | __m128i result_low = _mm256_castsi256_si128(result_epi32); 130 | __m128i result_high = _mm256_extracti128_si256(result_epi32, 1); 131 | __m128i packed16 = _mm_packus_epi32(result_low, result_high); 132 | __m128i packed8 = _mm_packus_epi16(packed16, _mm_setzero_si128()); 133 | _mm_storel_epi64(reinterpret_cast<__m128i*>(&image[i]), packed8); 134 | } 135 | 136 | // Handle remaining pixels 137 | for (; i < size; i++) { 138 | float value = (static_cast(image[i]) - 128.0f) * contrast + 128.0f; 139 | image[i] = static_cast(std::min(255.0f, std::max(0.0f, value))); 140 | } 141 | } 142 | 143 | // 3. 
Grayscale conversion - Scalar implementation 144 | void convert_to_grayscale_scalar(const uint8_t* src, uint8_t* dst, int width, int height) { 145 | for (int y = 0; y < height; y++) { 146 | for (int x = 0; x < width; x++) { 147 | int src_idx = (y * width + x) * CHANNELS; 148 | int dst_idx = y * width + x; 149 | 150 | // Standard grayscale conversion weights 151 | uint8_t gray = static_cast( 152 | 0.299f * src[src_idx + 0] + // R 153 | 0.587f * src[src_idx + 1] + // G 154 | 0.114f * src[src_idx + 2] // B 155 | ); 156 | 157 | dst[dst_idx] = gray; 158 | } 159 | } 160 | } 161 | 162 | // 3. Grayscale conversion - SIMD implementation 163 | void convert_to_grayscale_simd(const uint8_t* src, uint8_t* dst, int width, int height) { 164 | // RGB to Grayscale conversion weights 165 | const float weight_r = 0.299f; 166 | const float weight_g = 0.587f; 167 | const float weight_b = 0.114f; 168 | 169 | const __m128 weight_r_vec = _mm_set1_ps(weight_r); 170 | const __m128 weight_g_vec = _mm_set1_ps(weight_g); 171 | const __m128 weight_b_vec = _mm_set1_ps(weight_b); 172 | const __m128i r_shuffle = _mm_setr_epi8(0, 3, 6, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 173 | const __m128i g_shuffle = _mm_setr_epi8(1, 4, 7, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 174 | const __m128i b_shuffle = _mm_setr_epi8(2, 5, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 175 | const __m128i zero_128 = _mm_setzero_si128(); 176 | alignas(16) uint8_t chunk[16]; 177 | 178 | const int row_stride = width * CHANNELS; 179 | for (int y = 0; y < height; y++) { 180 | const uint8_t* row_ptr = src + y * row_stride; 181 | uint8_t* dst_row = dst + y * width; 182 | int x = 0; 183 | for (; x <= width - 4; x += 4) { 184 | const uint8_t* pixel_ptr = row_ptr + x * CHANNELS; 185 | std::memcpy(chunk, pixel_ptr, 12); 186 | __m128i block = _mm_load_si128(reinterpret_cast(chunk)); 187 | 188 | __m128i r_bytes = _mm_shuffle_epi8(block, r_shuffle); 189 | __m128i g_bytes = _mm_shuffle_epi8(block, g_shuffle); 190 | __m128i b_bytes = _mm_shuffle_epi8(block, b_shuffle); 191 | 192 | __m128 r_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(r_bytes)); 193 | __m128 g_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(g_bytes)); 194 | __m128 b_ps = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(b_bytes)); 195 | 196 | __m128 gray_ps = _mm_mul_ps(r_ps, weight_r_vec); 197 | gray_ps = _mm_add_ps(gray_ps, _mm_mul_ps(g_ps, weight_g_vec)); 198 | gray_ps = _mm_add_ps(gray_ps, _mm_mul_ps(b_ps, weight_b_vec)); 199 | 200 | __m128i gray_epi32 = _mm_cvtps_epi32(gray_ps); 201 | __m128i gray_epi16 = _mm_packus_epi32(gray_epi32, zero_128); 202 | __m128i gray_epi8 = _mm_packus_epi16(gray_epi16, zero_128); 203 | 204 | int packed = _mm_cvtsi128_si32(gray_epi8); 205 | std::memcpy(dst_row + x, &packed, sizeof(packed)); 206 | } 207 | for (; x < width; x++) { 208 | int src_idx = (y * width + x) * CHANNELS; 209 | float r = static_cast(src[src_idx + 0]); 210 | float g = static_cast(src[src_idx + 1]); 211 | float b = static_cast(src[src_idx + 2]); 212 | float gray = r * weight_r + g * weight_g + b * weight_b; 213 | dst_row[x] = static_cast(gray); 214 | } 215 | } 216 | } 217 | 218 | int main() { 219 | set_benchmark_suite("03_Examples/04_image_processing"); 220 | 221 | std::cout << "=== SIMD Image Processing Example ===" << std::endl; 222 | 223 | // Allocate memory for the test image 224 | uint8_t* original_image = new uint8_t[IMAGE_SIZE + 32]; 225 | uint8_t* processed_image = new uint8_t[IMAGE_SIZE + 32]; 226 | uint8_t* grayscale_image = new uint8_t[WIDTH * HEIGHT + 32]; 227 | 228 | // 
Initialize the test image 229 | initialize_test_image(original_image, WIDTH, HEIGHT, CHANNELS); 230 | 231 | // Print a small section of the original image 232 | std::cout << "Original Image:" << std::endl; 233 | print_image_section(original_image, WIDTH, CHANNELS, 0, 0, 3, 3); 234 | 235 | // 1. Brightness Adjustment 236 | std::cout << "1. Brightness Adjustment" << std::endl; 237 | 238 | // Copy original image to processed image 239 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 240 | 241 | // Benchmark brightness adjustment 242 | auto brightness_scalar = [&]() { 243 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 244 | adjust_brightness_scalar(processed_image, IMAGE_SIZE, 50); 245 | }; 246 | 247 | auto brightness_simd = [&]() { 248 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 249 | adjust_brightness_simd(processed_image, IMAGE_SIZE, 50); 250 | }; 251 | 252 | benchmark_comparison("Brightness Adjustment", brightness_scalar, brightness_simd, 10); 253 | 254 | // Print a small section of the brightness-adjusted image 255 | std::cout << "Brightness-adjusted Image:" << std::endl; 256 | print_image_section(processed_image, WIDTH, CHANNELS, 0, 0, 3, 3); 257 | 258 | // 2. Contrast Enhancement 259 | std::cout << "2. Contrast Enhancement" << std::endl; 260 | 261 | // Reset the processed image 262 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 263 | 264 | // Benchmark contrast enhancement 265 | auto contrast_scalar = [&]() { 266 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 267 | enhance_contrast_scalar(processed_image, IMAGE_SIZE, 1.5f); 268 | }; 269 | 270 | auto contrast_simd = [&]() { 271 | std::copy(original_image, original_image + IMAGE_SIZE, processed_image); 272 | enhance_contrast_simd(processed_image, IMAGE_SIZE, 1.5f); 273 | }; 274 | 275 | benchmark_comparison("Contrast Enhancement", contrast_scalar, contrast_simd, 10); 276 | 277 | // Print a small section of the contrast-enhanced image 278 | std::cout << "Contrast-enhanced Image:" << std::endl; 279 | print_image_section(processed_image, WIDTH, CHANNELS, 0, 0, 3, 3); 280 | 281 | // 3. Grayscale Conversion 282 | std::cout << "3. 
Grayscale Conversion" << std::endl; 283 | 284 | // Benchmark grayscale conversion 285 | auto grayscale_scalar = [&]() { 286 | convert_to_grayscale_scalar(original_image, grayscale_image, WIDTH, HEIGHT); 287 | }; 288 | 289 | auto grayscale_simd = [&]() { 290 | convert_to_grayscale_simd(original_image, grayscale_image, WIDTH, HEIGHT); 291 | }; 292 | 293 | benchmark_comparison("Grayscale Conversion", grayscale_scalar, grayscale_simd, 10); 294 | 295 | // Print a small section of the grayscale image 296 | std::cout << "Grayscale Image (showing first few pixels):" << std::endl; 297 | for (int y = 0; y < 3; y++) { 298 | for (int x = 0; x < 3; x++) { 299 | std::cout << static_cast(grayscale_image[y * WIDTH + x]) << " "; 300 | } 301 | std::cout << std::endl; 302 | } 303 | std::cout << std::endl; 304 | 305 | // Clean up 306 | delete[] original_image; 307 | delete[] processed_image; 308 | delete[] grayscale_image; 309 | 310 | return 0; 311 | } 312 | -------------------------------------------------------------------------------- /scripts/plot_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Plot SIMD benchmark summaries, attention breakdown, and tiny GPT breakdown figures.""" 3 | 4 | import argparse 5 | import csv 6 | from pathlib import Path 7 | from typing import Dict, Iterable, List, Tuple 8 | import math 9 | 10 | import matplotlib 11 | 12 | matplotlib.use("Agg") 13 | import matplotlib.pyplot as plt 14 | 15 | Row = Dict[str, str] 16 | DEFAULT_ARTIFACT_DIR = Path(__file__).resolve().parent.parent / "artifacts" 17 | 18 | 19 | def _read_rows(csv_path: Path, required: Iterable[str]) -> List[Row]: 20 | if not csv_path.exists(): 21 | raise FileNotFoundError(f"CSV file not found: {csv_path}") 22 | with csv_path.open(newline="", encoding="utf-8") as fh: 23 | reader = csv.DictReader(fh) 24 | fieldnames = reader.fieldnames or [] 25 | missing = [col for col in required if col not in fieldnames] 26 | if missing: 27 | raise ValueError(f"CSV missing expected columns: {missing}") 28 | rows = list(reader) 29 | if not rows: 30 | raise ValueError(f"CSV appears empty: {csv_path}") 31 | return rows 32 | 33 | 34 | def _parse_float(value: str, *, context: str) -> float: 35 | try: 36 | return float(value) 37 | except ValueError as exc: 38 | raise ValueError(f"Unable to parse '{value}' as float ({context})") from exc 39 | 40 | 41 | # --- Benchmark overview ---------------------------------------------------- 42 | 43 | def plot_benchmarks(csv_path: Path, output_path: Path, dpi: int) -> None: 44 | rows = _read_rows(csv_path, required=("suite", "label", "speedup")) 45 | 46 | skip_suites = { 47 | "src/03_Examples/05_attention_block", 48 | "03_Examples/05_attention_block", 49 | "src/03_Examples/05_mha_block", 50 | "03_Examples/05_mha_block", 51 | "src/03_Examples/06_tiny_gpt", 52 | "03_Examples/06_tiny_gpt", 53 | } 54 | 55 | grouped: Dict[str, List[Tuple[str, float]]] = {} 56 | for row in rows: 57 | suite = row["suite"] 58 | if suite in skip_suites: 59 | continue 60 | grouped.setdefault(suite, []).append( 61 | ( 62 | row["label"], 63 | _parse_float(row["speedup"], context=f"suite={suite}, label={row['label']}") 64 | ) 65 | ) 66 | 67 | if not grouped: 68 | raise ValueError("No benchmark data to plot (all rows filtered?)") 69 | 70 | suites = sorted(grouped.keys()) 71 | values = [sp for suite in suites for _, sp in grouped[suite]] 72 | median = sorted(values)[len(values) // 2] 73 | mean = sum(values) / len(values) 74 | 75 | count = len(suites) 76 | 
cols = math.ceil(math.sqrt(count)) 77 | rows_count = math.ceil(count / cols) 78 | fig, axes = plt.subplots(rows_count, cols, figsize=(4 * cols, 3 * rows_count), squeeze=False) 79 | fig.suptitle(f"SIMD Microbenchmark Speedups (median {median:.2f}×, mean {mean:.2f}×)", fontsize=14) 80 | 81 | for ax in axes.flat[count:]: 82 | ax.axis("off") 83 | 84 | for idx, suite in enumerate(suites): 85 | ax = axes.flat[idx] 86 | labels = [label for label, _ in grouped[suite]] 87 | speedups = [value for _, value in grouped[suite]] 88 | colors = ["#d62728" if sp < 1.0 else "#1f77b4" for sp in speedups] 89 | ypos = list(range(len(labels))) 90 | ax.barh(ypos, speedups, color=colors) 91 | ax.axvline(1.0, color="#555555", linestyle="--", linewidth=1) 92 | ax.set_yticks(ypos) 93 | ax.set_yticklabels(labels, fontsize=8) 94 | ax.set_xlabel("Speedup (scalar / SIMD)") 95 | ax.set_title(suite, fontsize=10) 96 | ax.set_xlim(left=0) 97 | for y, sp in zip(ypos, speedups): 98 | ax.text(sp + 0.05, y, f"{sp:.2f}×", va="center", ha="left", fontsize=8) 99 | 100 | output_path.parent.mkdir(parents=True, exist_ok=True) 101 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 102 | fig.savefig(output_path, dpi=dpi) 103 | plt.close(fig) 104 | 105 | 106 | # --- Attention breakdown --------------------------------------------------- 107 | 108 | def plot_attention(csv_path: Path, output_path: Path, dpi: int) -> None: 109 | rows = _read_rows( 110 | csv_path, 111 | required=( 112 | "component", 113 | "scalar_total_us", 114 | "simd_total_us", 115 | "speedup", 116 | "time_saved_us", 117 | "contribution_pct", 118 | ), 119 | ) 120 | 121 | overall = next((row for row in rows if row["component"] == "overall"), None) 122 | if not overall: 123 | raise ValueError("attention_components.csv must contain an 'overall' row") 124 | 125 | components = [row for row in rows if row["component"] != "overall"] 126 | if not components: 127 | raise ValueError("attention_components.csv has no component rows") 128 | 129 | names = [row["component"] for row in components] 130 | speedups = [_parse_float(row["speedup"], context=row["component"]) for row in components] 131 | contributions = [ 132 | _parse_float(row["contribution_pct"], context=row["component"]) 133 | for row in components 134 | ] 135 | total_scalar = _parse_float(overall["scalar_total_us"], context="overall scalar_total_us") 136 | total_simd = _parse_float(overall["simd_total_us"], context="overall simd_total_us") 137 | 138 | fig, axes = plt.subplots(1, 3, figsize=(15, 4.5)) 139 | fig.suptitle("SIMD Attention Block Breakdown", fontsize=14) 140 | 141 | ax_speed, ax_total, ax_contrib = axes 142 | 143 | ax_speed.bar(names, speedups, color="#1f77b4") 144 | ax_speed.set_ylabel("Speedup (scalar / SIMD)") 145 | ax_speed.set_title("Component Speedups") 146 | ax_speed.tick_params(axis="x", rotation=45) 147 | for label in ax_speed.get_xticklabels(): 148 | label.set_horizontalalignment("right") 149 | for idx, val in enumerate(speedups): 150 | ax_speed.text(idx, val + 0.05, f"{val:.2f}×", ha="center", va="bottom", fontsize=8) 151 | 152 | ax_total.bar(["scalar", "simd"], [total_scalar, total_simd], color=["#d62728", "#2ca02c"]) 153 | ax_total.set_ylabel("Microseconds") 154 | ax_total.set_title("End-to-End Latency") 155 | ax_total.set_ylim(0, max(total_scalar, total_simd) * 1.15) 156 | ax_total.text(0, total_scalar + 10, f"{total_scalar:.0f} μs", ha="center", va="bottom", fontsize=9) 157 | ax_total.text(1, total_simd + 10, f"{total_simd:.0f} μs", ha="center", va="bottom", fontsize=9) 158 | 159 | ax_contrib.barh(names, 
contributions, color="#9467bd") 160 | ax_contrib.set_xlabel("% of Total Speedup") 161 | ax_contrib.set_title("Contribution Share") 162 | for y, val in enumerate(contributions): 163 | ax_contrib.text(val + 0.5, y, f"{val:.1f}%", va="center", fontsize=8) 164 | ax_contrib.set_xlim(0, max(contributions + [10]) * 1.2) 165 | 166 | output_path.parent.mkdir(parents=True, exist_ok=True) 167 | fig.tight_layout(rect=[0, 0, 1, 0.95]) 168 | fig.savefig(output_path, dpi=dpi) 169 | plt.close(fig) 170 | 171 | 172 | # --- Tiny GPT breakdown ---------------------------------------------------- 173 | 174 | def plot_tiny_gpt(csv_path: Path, output_path: Path, dpi: int) -> None: 175 | rows = _read_rows( 176 | csv_path, 177 | required=( 178 | "stage", 179 | "count", 180 | "scalar_total_us", 181 | "simd_total_us", 182 | "speedup", 183 | "time_saved_us", 184 | "contribution_pct", 185 | ), 186 | ) 187 | 188 | overall = next((row for row in rows if row["stage"] == "overall"), None) 189 | if not overall: 190 | raise ValueError("tiny_gpt_components.csv must contain an 'overall' row") 191 | 192 | components = [row for row in rows if row["stage"] != "overall"] 193 | if not components: 194 | raise ValueError("tiny_gpt_components.csv has no component rows") 195 | 196 | names = [row["stage"] for row in components] 197 | counts = [int(_parse_float(row["count"], context=row["stage"])) for row in components] 198 | display_names = [f"{name} (×{count})" if count > 1 else name for name, count in zip(names, counts)] 199 | scalar_vals = [_parse_float(row["scalar_total_us"], context=row["stage"]) for row in components] 200 | simd_vals = [_parse_float(row["simd_total_us"], context=row["stage"]) for row in components] 201 | speedups = [_parse_float(row["speedup"], context=row["stage"]) for row in components] 202 | saved = [_parse_float(row["time_saved_us"], context=row["stage"]) for row in components] 203 | contributions = [_parse_float(row["contribution_pct"], context=row["stage"]) for row in components] 204 | 205 | overall_speedup = _parse_float(overall["speedup"], context="overall speedup") 206 | overall_scalar = _parse_float(overall["scalar_total_us"], context="overall scalar_total_us") 207 | overall_simd = _parse_float(overall["simd_total_us"], context="overall simd_total_us") 208 | overall_count = int(_parse_float(overall["count"], context="overall count")) 209 | 210 | fig, axes = plt.subplots(2, 2, figsize=(14, 9)) 211 | fig.suptitle(f"Tiny GPT Decoder Block Breakdown (overall {overall_speedup:.2f}×)", fontsize=14) 212 | 213 | x_pos = list(range(len(names))) 214 | 215 | ax_speed = axes[0, 0] 216 | ax_speed.bar(x_pos, speedups, color=["#d62728" if sp < 1.0 else "#1f77b4" for sp in speedups]) 217 | ax_speed.set_ylabel("Speedup (scalar / SIMD)") 218 | ax_speed.set_title("Component Speedups") 219 | ax_speed.set_xticks(x_pos) 220 | ax_speed.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 221 | for idx, val in enumerate(speedups): 222 | ax_speed.text(x_pos[idx], val + 0.05, f"{val:.2f}×", ha="center", va="bottom", fontsize=7) 223 | 224 | ax_latency = axes[0, 1] 225 | width = 0.38 226 | ax_latency.bar([x - width / 2 for x in x_pos], scalar_vals, width=width, label="Scalar", color="#d62728") 227 | ax_latency.bar([x + width / 2 for x in x_pos], simd_vals, width=width, label="SIMD", color="#2ca02c") 228 | ax_latency.set_title("Latency by Stage") 229 | ax_latency.set_ylabel("Microseconds") 230 | ax_latency.set_xticks(x_pos) 231 | ax_latency.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 232 | 
ax_latency.legend(fontsize=8) 233 | 234 | ax_saved = axes[1, 0] 235 | ax_saved.bar(x_pos, saved, color="#ff7f0e") 236 | ax_saved.set_ylabel("Time Saved (μs)") 237 | ax_saved.set_title("Absolute Time Saved") 238 | ax_saved.set_xticks(x_pos) 239 | ax_saved.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8) 240 | 241 | ax_contrib = axes[1, 1] 242 | ax_contrib.barh(display_names, contributions, color="#9467bd") 243 | ax_contrib.set_xlabel("% of Total Speedup") 244 | ax_contrib.set_title("Contribution Share") 245 | for y, val in enumerate(contributions): 246 | ax_contrib.text(val + 0.5, y, f"{val:.1f}%", va="center", fontsize=8) 247 | ax_contrib.set_xlim(0, max(contributions + [10]) * 1.2) 248 | 249 | fig.text(0.02, 0.02, f"Overall scalar: {overall_scalar:.1f} μs\nOverall SIMD: {overall_simd:.1f} μs\nDecoder blocks: {overall_count}", fontsize=9) 250 | 251 | output_path.parent.mkdir(parents=True, exist_ok=True) 252 | fig.tight_layout(rect=[0, 0.05, 1, 0.95]) 253 | fig.savefig(output_path, dpi=dpi) 254 | plt.close(fig) 255 | 256 | 257 | 258 | 259 | # --- CLI ------------------------------------------------------------------- 260 | 261 | def parse_args() -> argparse.Namespace: 262 | parser = argparse.ArgumentParser(description=__doc__) 263 | parser.add_argument( 264 | "--benchmarks-csv", 265 | type=Path, 266 | default=DEFAULT_ARTIFACT_DIR / "benchmark_results.csv", 267 | help="Path to benchmark results CSV.", 268 | ) 269 | parser.add_argument( 270 | "--benchmarks-output", 271 | type=Path, 272 | default=DEFAULT_ARTIFACT_DIR / "benchmark_speedups.png", 273 | help="Output path for the benchmark overview plot.", 274 | ) 275 | parser.add_argument( 276 | "--attention-csv", 277 | type=Path, 278 | default=DEFAULT_ARTIFACT_DIR / "attention_components.csv", 279 | help="Path to attention components CSV.", 280 | ) 281 | parser.add_argument( 282 | "--attention-output", 283 | type=Path, 284 | default=DEFAULT_ARTIFACT_DIR / "attention_speedups.png", 285 | help="Output path for the attention breakdown plot.", 286 | ) 287 | parser.add_argument( 288 | "--tiny-gpt-csv", 289 | type=Path, 290 | default=DEFAULT_ARTIFACT_DIR / "tiny_gpt_components.csv", 291 | help="Path to tiny GPT component CSV.", 292 | ) 293 | parser.add_argument( 294 | "--tiny-gpt-output", 295 | type=Path, 296 | default=DEFAULT_ARTIFACT_DIR / "tiny_gpt_speedups.png", 297 | help="Output path for the tiny GPT breakdown plot.", 298 | ) 299 | parser.add_argument("--dpi", type=int, default=180, help="Figure DPI") 300 | parser.add_argument( 301 | "--skip-attention", 302 | action="store_true", 303 | help="Skip plotting the attention breakdown (benchmark overview still generated).", 304 | ) 305 | parser.add_argument( 306 | "--skip-tiny-gpt", 307 | action="store_true", 308 | help="Skip plotting the tiny GPT breakdown.", 309 | ) 310 | return parser.parse_args() 311 | 312 | 313 | def main() -> None: 314 | args = parse_args() 315 | plot_benchmarks(args.benchmarks_csv, args.benchmarks_output, args.dpi) 316 | if not args.skip_attention: 317 | plot_attention(args.attention_csv, args.attention_output, args.dpi) 318 | if not args.skip_tiny_gpt: 319 | plot_tiny_gpt(args.tiny_gpt_csv, args.tiny_gpt_output, args.dpi) 320 | 321 | 322 | if __name__ == "__main__": # pragma: no cover 323 | main() 324 | -------------------------------------------------------------------------------- /src/03_Examples/05_mha_block/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 
| #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace { 13 | 14 | constexpr int SEQ_LEN = 8; 15 | constexpr int EMBED_DIM = 64; 16 | constexpr int NUM_HEADS = 4; 17 | constexpr int HEAD_DIM = EMBED_DIM / NUM_HEADS; // 16 18 | constexpr int FF_DIM = 128; 19 | constexpr float EPS = 1e-5f; 20 | 21 | using MatMulFn = void(*)(const float*, const float*, float*, int, int, int); 22 | using RMSNormFn = void(*)(const float*, const float*, float*, int, float); 23 | using ActivationFn = void(*)(float*, int); 24 | 25 | float horizontal_sum(__m256 v) { 26 | __m128 low = _mm256_castps256_ps128(v); 27 | __m128 high = _mm256_extractf128_ps(v, 1); 28 | __m128 sum = _mm_add_ps(low, high); 29 | sum = _mm_hadd_ps(sum, sum); 30 | sum = _mm_hadd_ps(sum, sum); 31 | return _mm_cvtss_f32(sum); 32 | } 33 | 34 | void transpose_matrix(const float* src, float* dst, int rows, int cols) { 35 | for (int r = 0; r < rows; ++r) { 36 | for (int c = 0; c < cols; ++c) { 37 | dst[c * rows + r] = src[r * cols + c]; 38 | } 39 | } 40 | } 41 | 42 | void split_heads(const float* src, float* dst) { 43 | for (int s = 0; s < SEQ_LEN; ++s) { 44 | for (int h = 0; h < NUM_HEADS; ++h) { 45 | const float* from = src + s * EMBED_DIM + h * HEAD_DIM; 46 | float* to = dst + (h * SEQ_LEN + s) * HEAD_DIM; 47 | std::copy(from, from + HEAD_DIM, to); 48 | } 49 | } 50 | } 51 | 52 | void combine_heads(const float* src, float* dst) { 53 | for (int s = 0; s < SEQ_LEN; ++s) { 54 | for (int h = 0; h < NUM_HEADS; ++h) { 55 | const float* from = src + (h * SEQ_LEN + s) * HEAD_DIM; 56 | float* to = dst + s * EMBED_DIM + h * HEAD_DIM; 57 | std::copy(from, from + HEAD_DIM, to); 58 | } 59 | } 60 | } 61 | 62 | void softmax_inplace(float* row, int len) { 63 | float max_val = row[0]; 64 | for (int i = 1; i < len; ++i) max_val = std::max(max_val, row[i]); 65 | float sum = 0.0f; 66 | for (int i = 0; i < len; ++i) { 67 | row[i] = std::exp(row[i] - max_val); 68 | sum += row[i]; 69 | } 70 | float inv = 1.0f / sum; 71 | for (int i = 0; i < len; ++i) row[i] *= inv; 72 | } 73 | 74 | void rmsnorm_scalar(const float* input, const float* gamma, float* output, int length, float eps) { 75 | float sum_sq = 0.0f; 76 | for (int i = 0; i < length; ++i) { 77 | sum_sq += input[i] * input[i]; 78 | } 79 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 80 | for (int i = 0; i < length; ++i) { 81 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 82 | } 83 | } 84 | 85 | void rmsnorm_simd(const float* input, const float* gamma, float* output, int length, float eps) { 86 | __m256 acc = _mm256_setzero_ps(); 87 | int i = 0; 88 | for (; i <= length - 8; i += 8) { 89 | __m256 v = _mm256_loadu_ps(input + i); 90 | acc = _mm256_fmadd_ps(v, v, acc); 91 | } 92 | float sum_sq = horizontal_sum(acc); 93 | for (; i < length; ++i) sum_sq += input[i] * input[i]; 94 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 95 | __m256 scale_vec = _mm256_set1_ps(scale); 96 | for (i = 0; i <= length - 8; i += 8) { 97 | __m256 v = _mm256_loadu_ps(input + i); 98 | __m256 g = _mm256_loadu_ps(gamma + (i % EMBED_DIM)); 99 | __m256 result = _mm256_mul_ps(_mm256_mul_ps(v, g), scale_vec); 100 | _mm256_storeu_ps(output + i, result); 101 | } 102 | for (; i < length; ++i) { 103 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 104 | } 105 | } 106 | 107 | void relu_scalar(float* data, int length) { 108 | for (int i = 0; i < length; ++i) data[i] = std::max(0.0f, data[i]); 109 | } 110 | 111 | void relu_simd(float* data, int length) { 112 
| __m256 zero = _mm256_setzero_ps(); 113 | int i = 0; 114 | for (; i <= length - 8; i += 8) { 115 | __m256 v = _mm256_loadu_ps(data + i); 116 | _mm256_storeu_ps(data + i, _mm256_max_ps(zero, v)); 117 | } 118 | for (; i < length; ++i) data[i] = std::max(0.0f, data[i]); 119 | } 120 | 121 | void matmul_scalar(const float* A, const float* B_T, float* C, int M, int K, int N) { 122 | for (int i = 0; i < M; ++i) { 123 | for (int j = 0; j < N; ++j) { 124 | float sum = 0.0f; 125 | const float* a_ptr = A + i * K; 126 | const float* b_ptr = B_T + j * K; 127 | for (int k = 0; k < K; ++k) { 128 | sum += a_ptr[k] * b_ptr[k]; 129 | } 130 | C[i * N + j] = sum; 131 | } 132 | } 133 | } 134 | 135 | void matmul_simd(const float* A, const float* B_T, float* C, int M, int K, int N) { 136 | for (int i = 0; i < M; ++i) { 137 | for (int j = 0; j < N; ++j) { 138 | const float* a_ptr = A + i * K; 139 | const float* b_ptr = B_T + j * K; 140 | __m256 acc = _mm256_setzero_ps(); 141 | int k = 0; 142 | for (; k <= K - 8; k += 8) { 143 | __m256 a = _mm256_loadu_ps(a_ptr + k); 144 | __m256 b = _mm256_loadu_ps(b_ptr + k); 145 | acc = _mm256_fmadd_ps(a, b, acc); 146 | } 147 | float sum = horizontal_sum(acc); 148 | for (; k < K; ++k) sum += a_ptr[k] * b_ptr[k]; 149 | C[i * N + j] = sum; 150 | } 151 | } 152 | } 153 | 154 | struct ModelWeights { 155 | std::vector Wq_T, Wk_T, Wv_T, Wo_T, Wff1_T, Wff2_T; 156 | std::vector gamma1, gamma2; 157 | }; 158 | 159 | struct ModelInputs { 160 | std::vector tokens; 161 | }; 162 | 163 | double attention_scale() { 164 | return 1.0 / std::sqrt(static_cast(HEAD_DIM)); 165 | } 166 | 167 | void initialize(ModelWeights& weights, ModelInputs& inputs) { 168 | std::mt19937 rng(42); 169 | std::uniform_real_distribution dist(-0.5f, 0.5f); 170 | 171 | auto fill_and_transpose = [&](int rows, int cols, std::vector& storage) { 172 | std::vector tmp(rows * cols); 173 | for (float& v : tmp) v = dist(rng); 174 | storage.resize(cols * rows); 175 | transpose_matrix(tmp.data(), storage.data(), rows, cols); 176 | }; 177 | 178 | weights.Wq_T.reserve(EMBED_DIM * EMBED_DIM); 179 | weights.Wk_T.reserve(EMBED_DIM * EMBED_DIM); 180 | weights.Wv_T.reserve(EMBED_DIM * EMBED_DIM); 181 | weights.Wo_T.reserve(EMBED_DIM * EMBED_DIM); 182 | weights.Wff1_T.reserve(FF_DIM * EMBED_DIM); 183 | weights.Wff2_T.reserve(EMBED_DIM * FF_DIM); 184 | 185 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wq_T); 186 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wk_T); 187 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wv_T); 188 | fill_and_transpose(EMBED_DIM, EMBED_DIM, weights.Wo_T); 189 | fill_and_transpose(EMBED_DIM, FF_DIM, weights.Wff1_T); 190 | fill_and_transpose(FF_DIM, EMBED_DIM, weights.Wff2_T); 191 | 192 | weights.gamma1.assign(EMBED_DIM, 1.0f); 193 | weights.gamma2.assign(EMBED_DIM, 1.0f); 194 | 195 | inputs.tokens.resize(SEQ_LEN * EMBED_DIM); 196 | for (float& v : inputs.tokens) v = dist(rng); 197 | } 198 | 199 | struct StageTimes { 200 | double rms1 = 0.0; 201 | double qkv = 0.0; 202 | double attn_scores = 0.0; 203 | double attn_context = 0.0; 204 | double attn_proj = 0.0; 205 | double rms2 = 0.0; 206 | double ff1 = 0.0; 207 | double activation = 0.0; 208 | double ff2 = 0.0; 209 | double total = 0.0; 210 | }; 211 | 212 | using Clock = std::chrono::high_resolution_clock; 213 | using Microseconds = std::chrono::microseconds; 214 | 215 | void run_block(const ModelWeights& weights, 216 | const ModelInputs& inputs, 217 | std::vector& output, 218 | MatMulFn matmul_fn, 219 | RMSNormFn rms_fn, 220 | ActivationFn 
activation_fn, 221 | StageTimes* times = nullptr) { 222 | const int token_dim = SEQ_LEN * EMBED_DIM; 223 | output.assign(token_dim, 0.0f); 224 | 225 | auto add_duration = [](StageTimes* st, double& field, 226 | const Clock::time_point& start, 227 | const Clock::time_point& end) { 228 | if (st) { 229 | field += std::chrono::duration_cast(end - start).count(); 230 | } 231 | }; 232 | 233 | Clock::time_point block_start; 234 | if (times) { 235 | block_start = Clock::now(); 236 | } 237 | 238 | std::vector norm1(token_dim); 239 | Clock::time_point t0 = Clock::now(); 240 | rms_fn(inputs.tokens.data(), weights.gamma1.data(), norm1.data(), token_dim, EPS); 241 | add_duration(times, times->rms1, t0, Clock::now()); 242 | 243 | std::vector Q(token_dim), K(token_dim), V(token_dim); 244 | t0 = Clock::now(); 245 | matmul_fn(norm1.data(), weights.Wq_T.data(), Q.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 246 | matmul_fn(norm1.data(), weights.Wk_T.data(), K.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 247 | matmul_fn(norm1.data(), weights.Wv_T.data(), V.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 248 | add_duration(times, times->qkv, t0, Clock::now()); 249 | 250 | std::vector Q_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 251 | std::vector K_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 252 | std::vector V_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM); 253 | split_heads(Q.data(), Q_heads.data()); 254 | split_heads(K.data(), K_heads.data()); 255 | split_heads(V.data(), V_heads.data()); 256 | 257 | std::vector context_heads(NUM_HEADS * SEQ_LEN * HEAD_DIM, 0.0f); 258 | std::vector K_heads_T(NUM_HEADS * HEAD_DIM * SEQ_LEN); 259 | std::vector V_heads_T(NUM_HEADS * HEAD_DIM * SEQ_LEN); 260 | std::vector scores(SEQ_LEN * SEQ_LEN); 261 | const float scale = static_cast(attention_scale()); 262 | 263 | for (int h = 0; h < NUM_HEADS; ++h) { 264 | const float* q_head = Q_heads.data() + h * SEQ_LEN * HEAD_DIM; 265 | const float* k_head = K_heads.data() + h * SEQ_LEN * HEAD_DIM; 266 | const float* v_head = V_heads.data() + h * SEQ_LEN * HEAD_DIM; 267 | float* k_t = K_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 268 | float* v_t = V_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 269 | transpose_matrix(k_head, k_t, SEQ_LEN, HEAD_DIM); 270 | transpose_matrix(v_head, v_t, SEQ_LEN, HEAD_DIM); 271 | 272 | t0 = Clock::now(); 273 | matmul_fn(q_head, k_t, scores.data(), SEQ_LEN, HEAD_DIM, SEQ_LEN); 274 | add_duration(times, times->attn_scores, t0, Clock::now()); 275 | for (float& s : scores) s *= scale; 276 | for (int row = 0; row < SEQ_LEN; ++row) { 277 | softmax_inplace(scores.data() + row * SEQ_LEN, SEQ_LEN); 278 | } 279 | float* ctx = context_heads.data() + h * SEQ_LEN * HEAD_DIM; 280 | t0 = Clock::now(); 281 | matmul_fn(scores.data(), v_t, ctx, SEQ_LEN, SEQ_LEN, HEAD_DIM); 282 | add_duration(times, times->attn_context, t0, Clock::now()); 283 | } 284 | 285 | std::vector context(token_dim); 286 | combine_heads(context_heads.data(), context.data()); 287 | 288 | std::vector attn_proj(token_dim); 289 | t0 = Clock::now(); 290 | matmul_fn(context.data(), weights.Wo_T.data(), attn_proj.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 291 | add_duration(times, times->attn_proj, t0, Clock::now()); 292 | 293 | std::vector residual1(token_dim); 294 | for (int i = 0; i < token_dim; ++i) { 295 | residual1[i] = inputs.tokens[i] + attn_proj[i]; 296 | } 297 | 298 | std::vector norm2(token_dim); 299 | t0 = Clock::now(); 300 | rms_fn(residual1.data(), weights.gamma2.data(), norm2.data(), token_dim, EPS); 301 | add_duration(times, times->rms2, t0, Clock::now()); 302 | 303 | std::vector 
ff1(SEQ_LEN * FF_DIM); 304 | t0 = Clock::now(); 305 | matmul_fn(norm2.data(), weights.Wff1_T.data(), ff1.data(), SEQ_LEN, EMBED_DIM, FF_DIM); 306 | add_duration(times, times->ff1, t0, Clock::now()); 307 | 308 | t0 = Clock::now(); 309 | activation_fn(ff1.data(), static_cast(ff1.size())); 310 | add_duration(times, times->activation, t0, Clock::now()); 311 | 312 | std::vector ff2(token_dim); 313 | t0 = Clock::now(); 314 | matmul_fn(ff1.data(), weights.Wff2_T.data(), ff2.data(), SEQ_LEN, FF_DIM, EMBED_DIM); 315 | add_duration(times, times->ff2, t0, Clock::now()); 316 | 317 | for (int i = 0; i < token_dim; ++i) { 318 | output[i] = residual1[i] + ff2[i]; 319 | } 320 | 321 | if (times) { 322 | times->total += std::chrono::duration_cast(Clock::now() - block_start).count(); 323 | } 324 | } 325 | 326 | float max_abs_diff(const std::vector& a, const std::vector& b) { 327 | float diff = 0.0f; 328 | for (size_t i = 0; i < a.size(); ++i) { 329 | diff = std::max(diff, std::abs(a[i] - b[i])); 330 | } 331 | return diff; 332 | } 333 | 334 | } // namespace 335 | 336 | int main() { 337 | ModelWeights weights; 338 | ModelInputs inputs; 339 | initialize(weights, inputs); 340 | 341 | std::vector scalar_output, simd_output; 342 | 343 | StageTimes scalar_stage{}, simd_stage{}; 344 | auto scalar_block_times = [&]() { 345 | run_block(weights, inputs, scalar_output, matmul_scalar, rmsnorm_scalar, relu_scalar, &scalar_stage); 346 | }; 347 | auto simd_block_times = [&]() { 348 | run_block(weights, inputs, simd_output, matmul_simd, rmsnorm_simd, relu_simd, &simd_stage); 349 | }; 350 | 351 | constexpr int stage_iterations = 10; 352 | for (int i = 0; i < stage_iterations; ++i) { 353 | scalar_block_times(); 354 | } 355 | for (int i = 0; i < stage_iterations; ++i) { 356 | simd_block_times(); 357 | } 358 | 359 | auto normalize = [&](StageTimes& st) { 360 | st.rms1 /= stage_iterations; 361 | st.qkv /= stage_iterations; 362 | st.attn_scores /= stage_iterations; 363 | st.attn_context /= stage_iterations; 364 | st.attn_proj /= stage_iterations; 365 | st.rms2 /= stage_iterations; 366 | st.ff1 /= stage_iterations; 367 | st.activation /= stage_iterations; 368 | st.ff2 /= stage_iterations; 369 | st.total /= stage_iterations; 370 | }; 371 | normalize(scalar_stage); 372 | normalize(simd_stage); 373 | 374 | auto scalar_block = [&]() { 375 | run_block(weights, inputs, scalar_output, matmul_scalar, rmsnorm_scalar, relu_scalar); 376 | }; 377 | auto simd_block = [&]() { 378 | run_block(weights, inputs, simd_output, matmul_simd, rmsnorm_simd, relu_simd); 379 | }; 380 | 381 | scalar_block(); 382 | simd_block(); 383 | 384 | float diff = max_abs_diff(scalar_output, simd_output); 385 | std::cout << "Max |scalar - simd| difference: " << diff << "\n"; 386 | 387 | struct ComponentRow { 388 | std::string name; 389 | int count; 390 | double scalar_us; 391 | double simd_us; 392 | }; 393 | 394 | std::vector components = { 395 | {"rmsnorm", 2, scalar_stage.rms1 + scalar_stage.rms2, simd_stage.rms1 + simd_stage.rms2}, 396 | {"qkv_projections", 1, scalar_stage.qkv, simd_stage.qkv}, 397 | {"attention_scores", 1, scalar_stage.attn_scores, simd_stage.attn_scores}, 398 | {"context_projection", 1, scalar_stage.attn_context, simd_stage.attn_context}, 399 | {"output_projection", 1, scalar_stage.attn_proj, simd_stage.attn_proj}, 400 | {"ffn_expand", 1, scalar_stage.ff1, simd_stage.ff1}, 401 | {"activation", 1, scalar_stage.activation, simd_stage.activation}, 402 | {"ffn_contract", 1, scalar_stage.ff2, simd_stage.ff2} 403 | }; 404 | 405 | double sum_scalar = 0.0; 
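// The accumulation below folds the per-stage averages into attention_components.csv: each
// component's speedup is scalar_us / simd_us, and its contribution_pct is its share of the total
// time saved, (scalar_us - simd_us) / (total_scalar - total_simd) * 100. For example, a stage
// that drops from 120 us (scalar) to 40 us (SIMD) saves 80 us; if the whole block saves 400 us,
// that stage's contribution_pct is 80 / 400 * 100 = 20%. Stages that are not timed individually
// (transposes, head split/merge, softmax, residual adds) fall into the residual "others" bucket
// derived from the measured block total.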
406 | double sum_simd = 0.0; 407 | for (const auto& c : components) { 408 | sum_scalar += c.scalar_us; 409 | sum_simd += c.simd_us; 410 | } 411 | double others_scalar = std::max(0.0, scalar_stage.total - sum_scalar); 412 | double others_simd = std::max(0.0, simd_stage.total - sum_simd); 413 | components.push_back({"others", 1, others_scalar, others_simd}); 414 | 415 | double total_scalar = scalar_stage.total; 416 | double total_simd = simd_stage.total; 417 | double total_saved = total_scalar - total_simd; 418 | 419 | namespace fs = std::filesystem; 420 | fs::path out_path = fs::current_path().parent_path().parent_path().parent_path() / "artifacts" / "attention_components.csv"; 421 | fs::create_directories(out_path.parent_path()); 422 | std::ofstream file(out_path); 423 | if (file) { 424 | file << "component,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct\n"; 425 | for (const auto& c : components) { 426 | double saved = c.scalar_us - c.simd_us; 427 | double speedup = c.simd_us > 0.0 ? c.scalar_us / c.simd_us : 0.0; 428 | double pct = (total_saved > 0.0) ? (saved / total_saved * 100.0) : 0.0; 429 | file << c.name << ',' << c.count << ',' << c.scalar_us << ',' << c.simd_us << ',' 430 | << speedup << ',' << saved << ',' << pct << '\n'; 431 | } 432 | double overall_speedup = total_simd > 0.0 ? total_scalar / total_simd : 0.0; 433 | file << "overall,1," << total_scalar << ',' << total_simd << ',' 434 | << overall_speedup << ',' << total_saved << ',' << 100.0 << '\n'; 435 | } else { 436 | std::cerr << "Failed to write attention_components.csv" << std::endl; 437 | } 438 | 439 | set_benchmark_suite("03_Examples/05_mha_block"); 440 | benchmark_comparison("attention_block", scalar_block, simd_block, 50); 441 | 442 | std::cout << "First token (scalar vs SIMD):\n"; 443 | for (int d = 0; d < EMBED_DIM; ++d) { 444 | if (d && d % 8 == 0) std::cout << "\n"; 445 | std::cout << scalar_output[d] << " / " << simd_output[d] << " "; 446 | } 447 | std::cout << "\n"; 448 | 449 | return 0; 450 | } 451 | -------------------------------------------------------------------------------- /src/03_Examples/06_tiny_gpt/main.cpp: -------------------------------------------------------------------------------- 1 | #include "../../include/simd_utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace { 17 | 18 | constexpr int SEQ_LEN = 8; 19 | constexpr int EMBED_DIM = 64; 20 | constexpr int NUM_HEADS = 4; 21 | constexpr int HEAD_DIM = EMBED_DIM / NUM_HEADS; // 16 22 | constexpr int FF_DIM = 128; 23 | constexpr int VOCAB_SIZE = 64; 24 | constexpr int NUM_BLOCKS = 61; 25 | constexpr float EPS = 1e-5f; 26 | 27 | using Clock = std::chrono::high_resolution_clock; 28 | using Microseconds = std::chrono::microseconds; 29 | 30 | float horizontal_sum(__m256 v) { 31 | __m128 low = _mm256_castps256_ps128(v); 32 | __m128 high = _mm256_extractf128_ps(v, 1); 33 | __m128 sum = _mm_add_ps(low, high); 34 | sum = _mm_hadd_ps(sum, sum); 35 | sum = _mm_hadd_ps(sum, sum); 36 | return _mm_cvtss_f32(sum); 37 | } 38 | 39 | void transpose_matrix(const float* src, float* dst, int rows, int cols) { 40 | for (int r = 0; r < rows; ++r) { 41 | for (int c = 0; c < cols; ++c) { 42 | dst[c * rows + r] = src[r * cols + c]; 43 | } 44 | } 45 | } 46 | 47 | void split_heads(const float* src, float* dst) { 48 | for (int s = 0; s < SEQ_LEN; ++s) { 49 | for (int h = 0; h < NUM_HEADS; ++h) { 50 | 
const float* from = src + s * EMBED_DIM + h * HEAD_DIM; 51 | float* to = dst + (h * SEQ_LEN + s) * HEAD_DIM; 52 | std::copy(from, from + HEAD_DIM, to); 53 | } 54 | } 55 | } 56 | 57 | void combine_heads(const float* src, float* dst) { 58 | for (int s = 0; s < SEQ_LEN; ++s) { 59 | for (int h = 0; h < NUM_HEADS; ++h) { 60 | const float* from = src + (h * SEQ_LEN + s) * HEAD_DIM; 61 | float* to = dst + s * EMBED_DIM + h * HEAD_DIM; 62 | std::copy(from, from + HEAD_DIM, to); 63 | } 64 | } 65 | } 66 | 67 | void softmax_inplace(float* row, int len) { 68 | float max_val = row[0]; 69 | for (int i = 1; i < len; ++i) { 70 | max_val = std::max(max_val, row[i]); 71 | } 72 | float sum = 0.0f; 73 | for (int i = 0; i < len; ++i) { 74 | row[i] = std::exp(row[i] - max_val); 75 | sum += row[i]; 76 | } 77 | float inv = 1.0f / sum; 78 | for (int i = 0; i < len; ++i) { 79 | row[i] *= inv; 80 | } 81 | } 82 | 83 | void rmsnorm_scalar(const float* input, const float* gamma, float* output, int length, float eps) { 84 | float sum_sq = 0.0f; 85 | for (int i = 0; i < length; ++i) { 86 | sum_sq += input[i] * input[i]; 87 | } 88 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 89 | for (int i = 0; i < length; ++i) { 90 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 91 | } 92 | } 93 | 94 | void rmsnorm_simd(const float* input, const float* gamma, float* output, int length, float eps) { 95 | __m256 acc = _mm256_setzero_ps(); 96 | int i = 0; 97 | for (; i <= length - 8; i += 8) { 98 | __m256 v = _mm256_loadu_ps(input + i); 99 | acc = _mm256_fmadd_ps(v, v, acc); 100 | } 101 | float sum_sq = horizontal_sum(acc); 102 | for (; i < length; ++i) { 103 | sum_sq += input[i] * input[i]; 104 | } 105 | float scale = 1.0f / std::sqrt(sum_sq / length + eps); 106 | __m256 scale_vec = _mm256_set1_ps(scale); 107 | i = 0; 108 | for (; i <= length - 8; i += 8) { 109 | __m256 v = _mm256_loadu_ps(input + i); 110 | __m256 g = _mm256_loadu_ps(gamma + (i % EMBED_DIM)); 111 | __m256 result = _mm256_mul_ps(_mm256_mul_ps(v, g), scale_vec); 112 | _mm256_storeu_ps(output + i, result); 113 | } 114 | for (; i < length; ++i) { 115 | output[i] = input[i] * gamma[i % EMBED_DIM] * scale; 116 | } 117 | } 118 | 119 | void relu_scalar(float* data, int length) { 120 | for (int i = 0; i < length; ++i) { 121 | data[i] = std::max(0.0f, data[i]); 122 | } 123 | } 124 | 125 | void relu_simd(float* data, int length) { 126 | __m256 zero = _mm256_setzero_ps(); 127 | int i = 0; 128 | for (; i <= length - 8; i += 8) { 129 | __m256 v = _mm256_loadu_ps(data + i); 130 | _mm256_storeu_ps(data + i, _mm256_max_ps(zero, v)); 131 | } 132 | for (; i < length; ++i) { 133 | data[i] = std::max(0.0f, data[i]); 134 | } 135 | } 136 | 137 | void residual_add_scalar(const float* a, const float* b, float* out, int length) { 138 | for (int i = 0; i < length; ++i) { 139 | out[i] = a[i] + b[i]; 140 | } 141 | } 142 | 143 | void residual_add_simd(const float* a, const float* b, float* out, int length) { 144 | int i = 0; 145 | for (; i <= length - 8; i += 8) { 146 | __m256 va = _mm256_loadu_ps(a + i); 147 | __m256 vb = _mm256_loadu_ps(b + i); 148 | _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb)); 149 | } 150 | for (; i < length; ++i) { 151 | out[i] = a[i] + b[i]; 152 | } 153 | } 154 | 155 | void matmul_scalar(const float* A, const float* B_T, float* C, int M, int K, int N) { 156 | for (int i = 0; i < M; ++i) { 157 | for (int j = 0; j < N; ++j) { 158 | float sum = 0.0f; 159 | const float* a_ptr = A + i * K; 160 | const float* b_ptr = B_T + j * K; 161 | for (int k = 0; k < K; 
++k) { 162 | sum += a_ptr[k] * b_ptr[k]; 163 | } 164 | C[i * N + j] = sum; 165 | } 166 | } 167 | } 168 | 169 | void matmul_simd(const float* A, const float* B_T, float* C, int M, int K, int N) { 170 | for (int i = 0; i < M; ++i) { 171 | for (int j = 0; j < N; ++j) { 172 | const float* a_ptr = A + i * K; 173 | const float* b_ptr = B_T + j * K; 174 | __m256 acc = _mm256_setzero_ps(); 175 | int k = 0; 176 | for (; k <= K - 8; k += 8) { 177 | __m256 a = _mm256_loadu_ps(a_ptr + k); 178 | __m256 b = _mm256_loadu_ps(b_ptr + k); 179 | acc = _mm256_fmadd_ps(a, b, acc); 180 | } 181 | float sum = horizontal_sum(acc); 182 | for (; k < K; ++k) { 183 | sum += a_ptr[k] * b_ptr[k]; 184 | } 185 | C[i * N + j] = sum; 186 | } 187 | } 188 | } 189 | 190 | struct FloatLinear { 191 | int out_dim = 0; 192 | int in_dim = 0; 193 | std::vector weights_T; 194 | }; 195 | 196 | struct QuantizedLinear { 197 | int out_dim = 0; 198 | int in_dim = 0; 199 | std::vector weights_T; 200 | std::vector scales; 201 | }; 202 | 203 | struct LinearPair { 204 | FloatLinear fp32; 205 | QuantizedLinear q8; 206 | }; 207 | 208 | struct BlockWeights { 209 | LinearPair wq; 210 | LinearPair wk; 211 | LinearPair wv; 212 | LinearPair wo; 213 | LinearPair wff1; 214 | LinearPair wff2; 215 | std::vector gamma1; 216 | std::vector gamma2; 217 | }; 218 | 219 | struct ModelWeights { 220 | std::vector embedding; 221 | std::vector blocks; 222 | LinearPair logits; 223 | }; 224 | 225 | float attention_scale() { 226 | return 1.0f / std::sqrt(static_cast(HEAD_DIM)); 227 | } 228 | 229 | void quantize_into(const FloatLinear& src, QuantizedLinear& dst) { 230 | dst.out_dim = src.out_dim; 231 | dst.in_dim = src.in_dim; 232 | dst.weights_T.resize(static_cast(dst.out_dim) * dst.in_dim); 233 | dst.scales.resize(dst.out_dim); 234 | for (int row = 0; row < dst.out_dim; ++row) { 235 | const float* src_row = src.weights_T.data() + row * dst.in_dim; 236 | float max_abs = 0.0f; 237 | for (int col = 0; col < dst.in_dim; ++col) { 238 | max_abs = std::max(max_abs, std::abs(src_row[col])); 239 | } 240 | float scale = max_abs > 0.0f ? (max_abs / 127.0f) : 1.0f; 241 | dst.scales[row] = scale; 242 | float inv_scale = scale > 0.0f ? 
(1.0f / scale) : 0.0f; 243 | int8_t* dst_row = dst.weights_T.data() + row * dst.in_dim; 244 | for (int col = 0; col < dst.in_dim; ++col) { 245 | float scaled = src_row[col] * inv_scale; 246 | int value = static_cast(std::round(scaled)); 247 | value = std::max(-127, std::min(127, value)); 248 | dst_row[col] = static_cast(value); 249 | } 250 | } 251 | } 252 | 253 | float dot_q8_simd(const int8_t* w_row, const float* x, int length, float scale) { 254 | __m256 acc = _mm256_setzero_ps(); 255 | __m256 scale_vec = _mm256_set1_ps(scale); 256 | int i = 0; 257 | for (; i <= length - 16; i += 16) { 258 | __m128i packed = _mm_loadu_si128(reinterpret_cast(w_row + i)); 259 | __m128i lo_bytes = packed; 260 | __m128i hi_bytes = _mm_srli_si128(packed, 8); 261 | __m128i lo_i16 = _mm_cvtepi8_epi16(lo_bytes); 262 | __m128i hi_i16 = _mm_cvtepi8_epi16(hi_bytes); 263 | __m256i lo_i32 = _mm256_cvtepi16_epi32(lo_i16); 264 | __m256i hi_i32 = _mm256_cvtepi16_epi32(hi_i16); 265 | __m256 w_lo = _mm256_mul_ps(_mm256_cvtepi32_ps(lo_i32), scale_vec); 266 | __m256 w_hi = _mm256_mul_ps(_mm256_cvtepi32_ps(hi_i32), scale_vec); 267 | __m256 x_lo = _mm256_loadu_ps(x + i); 268 | __m256 x_hi = _mm256_loadu_ps(x + i + 8); 269 | acc = _mm256_fmadd_ps(w_lo, x_lo, acc); 270 | acc = _mm256_fmadd_ps(w_hi, x_hi, acc); 271 | } 272 | for (; i <= length - 8; i += 8) { 273 | __m128i packed8 = _mm_loadl_epi64(reinterpret_cast(w_row + i)); 274 | __m128i i16 = _mm_cvtepi8_epi16(packed8); 275 | __m256i i32 = _mm256_cvtepi16_epi32(i16); 276 | __m256 w_vec = _mm256_mul_ps(_mm256_cvtepi32_ps(i32), scale_vec); 277 | __m256 x_vec = _mm256_loadu_ps(x + i); 278 | acc = _mm256_fmadd_ps(w_vec, x_vec, acc); 279 | } 280 | float sum = horizontal_sum(acc); 281 | for (; i < length; ++i) { 282 | sum += static_cast(w_row[i]) * scale * x[i]; 283 | } 284 | return sum; 285 | } 286 | 287 | void matmul_q8_simd(const float* A, const QuantizedLinear& W_T, float* C, int M, int K, int N) { 288 | for (int i = 0; i < M; ++i) { 289 | const float* a_ptr = A + i * K; 290 | for (int j = 0; j < N; ++j) { 291 | const int8_t* w_row = W_T.weights_T.data() + j * K; 292 | float scale = W_T.scales[j]; 293 | C[i * N + j] = dot_q8_simd(w_row, a_ptr, K, scale); 294 | } 295 | } 296 | } 297 | 298 | struct StageTimes { 299 | double embed = 0.0; 300 | double rms1 = 0.0; 301 | double qkv = 0.0; 302 | double attn_scores = 0.0; 303 | double attn_softmax = 0.0; 304 | double attn_context = 0.0; 305 | double attn_proj = 0.0; 306 | double residual1 = 0.0; 307 | double rms2 = 0.0; 308 | double ffn_expand = 0.0; 309 | double activation = 0.0; 310 | double ffn_contract = 0.0; 311 | double residual2 = 0.0; 312 | double logits = 0.0; 313 | double sampling = 0.0; 314 | double total = 0.0; 315 | 316 | StageTimes& accumulate(const StageTimes& other) { 317 | embed += other.embed; 318 | rms1 += other.rms1; 319 | qkv += other.qkv; 320 | attn_scores += other.attn_scores; 321 | attn_softmax += other.attn_softmax; 322 | attn_context += other.attn_context; 323 | attn_proj += other.attn_proj; 324 | residual1 += other.residual1; 325 | rms2 += other.rms2; 326 | ffn_expand += other.ffn_expand; 327 | activation += other.activation; 328 | ffn_contract += other.ffn_contract; 329 | residual2 += other.residual2; 330 | logits += other.logits; 331 | sampling += other.sampling; 332 | total += other.total; 333 | return *this; 334 | } 335 | 336 | StageTimes& scale(double factor) { 337 | embed *= factor; 338 | rms1 *= factor; 339 | qkv *= factor; 340 | attn_scores *= factor; 341 | attn_softmax *= factor; 342 | attn_context 
*= factor; 343 | attn_proj *= factor; 344 | residual1 *= factor; 345 | rms2 *= factor; 346 | ffn_expand *= factor; 347 | activation *= factor; 348 | ffn_contract *= factor; 349 | residual2 *= factor; 350 | logits *= factor; 351 | sampling *= factor; 352 | total *= factor; 353 | return *this; 354 | } 355 | }; 356 | 357 | struct ScalarKernels { 358 | static void rmsnorm(const float* input, const std::vector& gamma, float* output, int length, float eps) { 359 | rmsnorm_scalar(input, gamma.data(), output, length, eps); 360 | } 361 | 362 | static void apply_linear(const LinearPair& weight, const float* input, float* output, int M, int K, int N) { 363 | matmul_scalar(input, weight.fp32.weights_T.data(), output, M, K, N); 364 | } 365 | 366 | static void add_residual(const float* a, const float* b, float* out, int length) { 367 | residual_add_scalar(a, b, out, length); 368 | } 369 | 370 | static void activation(float* data, int length) { 371 | relu_scalar(data, length); 372 | } 373 | 374 | static void matmul_float(const float* A, const float* B_T, float* C, int M, int K, int N) { 375 | matmul_scalar(A, B_T, C, M, K, N); 376 | } 377 | }; 378 | 379 | struct SimdKernels { 380 | static void rmsnorm(const float* input, const std::vector& gamma, float* output, int length, float eps) { 381 | rmsnorm_simd(input, gamma.data(), output, length, eps); 382 | } 383 | 384 | static void apply_linear(const LinearPair& weight, const float* input, float* output, int M, int K, int N) { 385 | matmul_q8_simd(input, weight.q8, output, M, K, N); 386 | } 387 | 388 | static void add_residual(const float* a, const float* b, float* out, int length) { 389 | residual_add_simd(a, b, out, length); 390 | } 391 | 392 | static void activation(float* data, int length) { 393 | relu_simd(data, length); 394 | } 395 | 396 | static void matmul_float(const float* A, const float* B_T, float* C, int M, int K, int N) { 397 | matmul_simd(A, B_T, C, M, K, N); 398 | } 399 | }; 400 | 401 | std::vector random_matrix_T(int rows, int cols, std::mt19937& rng, std::uniform_real_distribution& dist) { 402 | std::vector original(static_cast(rows) * cols); 403 | for (float& v : original) { 404 | v = dist(rng); 405 | } 406 | std::vector transposed(static_cast(rows) * cols); 407 | transpose_matrix(original.data(), transposed.data(), rows, cols); 408 | return transposed; 409 | } 410 | 411 | LinearPair make_linear_pair(int out_dim, int in_dim, std::mt19937& rng, std::uniform_real_distribution& dist) { 412 | LinearPair pair; 413 | pair.fp32.out_dim = out_dim; 414 | pair.fp32.in_dim = in_dim; 415 | pair.fp32.weights_T = random_matrix_T(out_dim, in_dim, rng, dist); 416 | pair.q8.out_dim = out_dim; 417 | pair.q8.in_dim = in_dim; 418 | quantize_into(pair.fp32, pair.q8); 419 | return pair; 420 | } 421 | 422 | void initialize(ModelWeights& weights) { 423 | std::mt19937 rng(1337); 424 | std::uniform_real_distribution dist(-0.8f, 0.8f); 425 | 426 | weights.embedding.resize(static_cast(VOCAB_SIZE) * EMBED_DIM); 427 | for (float& v : weights.embedding) { 428 | v = dist(rng); 429 | } 430 | 431 | weights.blocks.clear(); 432 | weights.blocks.reserve(NUM_BLOCKS); 433 | for (int i = 0; i < NUM_BLOCKS; ++i) { 434 | BlockWeights block; 435 | block.wq = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 436 | block.wk = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 437 | block.wv = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 438 | block.wo = make_linear_pair(EMBED_DIM, EMBED_DIM, rng, dist); 439 | block.wff1 = make_linear_pair(FF_DIM, EMBED_DIM, rng, dist); 440 | 
block.wff2 = make_linear_pair(EMBED_DIM, FF_DIM, rng, dist); 441 | block.gamma1.assign(EMBED_DIM, 1.0f); 442 | block.gamma2.assign(EMBED_DIM, 1.0f); 443 | weights.blocks.push_back(std::move(block)); 444 | } 445 | 446 | weights.logits = make_linear_pair(VOCAB_SIZE, EMBED_DIM, rng, dist); 447 | } 448 | 449 | inline double elapsed_us(const Clock::time_point& start, const Clock::time_point& end) { 450 | return static_cast(std::chrono::duration_cast(end - start).count()); 451 | } 452 | 453 | float max_abs_diff(const std::vector& a, const std::vector& b) { 454 | float diff = 0.0f; 455 | for (size_t i = 0; i < a.size(); ++i) { 456 | diff = std::max(diff, std::abs(a[i] - b[i])); 457 | } 458 | return diff; 459 | } 460 | 461 | template 462 | void decode_impl(const ModelWeights& weights, 463 | const std::vector& tokens, 464 | std::vector& block_output, 465 | std::vector& logits_out, 466 | int& next_token, 467 | StageTimes* times) { 468 | StageTimes local; 469 | 470 | Clock::time_point total_start; 471 | if (times) { 472 | total_start = Clock::now(); 473 | } 474 | 475 | constexpr int token_dim = SEQ_LEN * EMBED_DIM; 476 | constexpr int ff_dim = SEQ_LEN * FF_DIM; 477 | 478 | std::array hidden{}; 479 | std::array norm1{}; 480 | std::array Q{}; 481 | std::array K{}; 482 | std::array V{}; 483 | std::array Q_heads{}; 484 | std::array K_heads{}; 485 | std::array V_heads{}; 486 | std::array K_heads_T{}; 487 | std::array V_heads_T{}; 488 | std::array context_heads{}; 489 | std::array context{}; 490 | std::array attn_proj{}; 491 | std::array residual1{}; 492 | std::array norm2{}; 493 | std::array ff1{}; 494 | std::array ff2{}; 495 | std::array residual2{}; 496 | std::array scores{}; 497 | std::array logits{}; 498 | 499 | auto record = [&](double& field, const Clock::time_point& start_tp, const Clock::time_point& end_tp) { 500 | if (times) { 501 | field += elapsed_us(start_tp, end_tp); 502 | } 503 | }; 504 | 505 | auto t0 = Clock::now(); 506 | for (int t = 0; t < SEQ_LEN; ++t) { 507 | const float* src = weights.embedding.data() + tokens[t] * EMBED_DIM; 508 | std::copy(src, src + EMBED_DIM, hidden.data() + t * EMBED_DIM); 509 | } 510 | record(local.embed, t0, Clock::now()); 511 | 512 | const float scale = static_cast(attention_scale()); 513 | for (std::size_t block_idx = 0; block_idx < weights.blocks.size(); ++block_idx) { 514 | const BlockWeights& block = weights.blocks[block_idx]; 515 | 516 | t0 = Clock::now(); 517 | Kernels::rmsnorm(hidden.data(), block.gamma1, norm1.data(), token_dim, EPS); 518 | record(local.rms1, t0, Clock::now()); 519 | 520 | t0 = Clock::now(); 521 | Kernels::apply_linear(block.wq, norm1.data(), Q.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 522 | Kernels::apply_linear(block.wk, norm1.data(), K.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 523 | Kernels::apply_linear(block.wv, norm1.data(), V.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 524 | record(local.qkv, t0, Clock::now()); 525 | 526 | split_heads(Q.data(), Q_heads.data()); 527 | split_heads(K.data(), K_heads.data()); 528 | split_heads(V.data(), V_heads.data()); 529 | 530 | for (int h = 0; h < NUM_HEADS; ++h) { 531 | const float* q_head = Q_heads.data() + h * SEQ_LEN * HEAD_DIM; 532 | const float* k_head = K_heads.data() + h * SEQ_LEN * HEAD_DIM; 533 | const float* v_head = V_heads.data() + h * SEQ_LEN * HEAD_DIM; 534 | float* k_t = K_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 535 | float* v_t = V_heads_T.data() + h * HEAD_DIM * SEQ_LEN; 536 | transpose_matrix(k_head, k_t, SEQ_LEN, HEAD_DIM); 537 | transpose_matrix(v_head, v_t, SEQ_LEN, HEAD_DIM); 538 
| 539 | t0 = Clock::now(); 540 | Kernels::matmul_float(q_head, k_t, scores.data(), SEQ_LEN, HEAD_DIM, SEQ_LEN); 541 | record(local.attn_scores, t0, Clock::now()); 542 | 543 | auto softmax_start = Clock::now(); 544 | for (float& s : scores) { 545 | s *= scale; 546 | } 547 | for (int row = 0; row < SEQ_LEN; ++row) { 548 | softmax_inplace(scores.data() + row * SEQ_LEN, SEQ_LEN); 549 | } 550 | record(local.attn_softmax, softmax_start, Clock::now()); 551 | 552 | float* ctx = context_heads.data() + h * SEQ_LEN * HEAD_DIM; 553 | t0 = Clock::now(); 554 | Kernels::matmul_float(scores.data(), v_t, ctx, SEQ_LEN, SEQ_LEN, HEAD_DIM); 555 | record(local.attn_context, t0, Clock::now()); 556 | } 557 | 558 | combine_heads(context_heads.data(), context.data()); 559 | 560 | t0 = Clock::now(); 561 | Kernels::apply_linear(block.wo, context.data(), attn_proj.data(), SEQ_LEN, EMBED_DIM, EMBED_DIM); 562 | record(local.attn_proj, t0, Clock::now()); 563 | 564 | t0 = Clock::now(); 565 | Kernels::add_residual(hidden.data(), attn_proj.data(), residual1.data(), token_dim); 566 | record(local.residual1, t0, Clock::now()); 567 | 568 | t0 = Clock::now(); 569 | Kernels::rmsnorm(residual1.data(), block.gamma2, norm2.data(), token_dim, EPS); 570 | record(local.rms2, t0, Clock::now()); 571 | 572 | t0 = Clock::now(); 573 | Kernels::apply_linear(block.wff1, norm2.data(), ff1.data(), SEQ_LEN, EMBED_DIM, FF_DIM); 574 | record(local.ffn_expand, t0, Clock::now()); 575 | 576 | t0 = Clock::now(); 577 | Kernels::activation(ff1.data(), ff_dim); 578 | record(local.activation, t0, Clock::now()); 579 | 580 | t0 = Clock::now(); 581 | Kernels::apply_linear(block.wff2, ff1.data(), ff2.data(), SEQ_LEN, FF_DIM, EMBED_DIM); 582 | record(local.ffn_contract, t0, Clock::now()); 583 | 584 | t0 = Clock::now(); 585 | Kernels::add_residual(residual1.data(), ff2.data(), residual2.data(), token_dim); 586 | record(local.residual2, t0, Clock::now()); 587 | 588 | std::copy(residual2.begin(), residual2.end(), hidden.begin()); 589 | } 590 | 591 | block_output.assign(hidden.begin(), hidden.end()); 592 | 593 | const float* last_token = hidden.data() + (SEQ_LEN - 1) * EMBED_DIM; 594 | t0 = Clock::now(); 595 | Kernels::apply_linear(weights.logits, last_token, logits.data(), 1, EMBED_DIM, VOCAB_SIZE); 596 | record(local.logits, t0, Clock::now()); 597 | 598 | logits_out.assign(logits.begin(), logits.end()); 599 | 600 | std::vector probs(logits.begin(), logits.end()); 601 | auto samp_start = Clock::now(); 602 | softmax_inplace(probs.data(), VOCAB_SIZE); 603 | next_token = static_cast(std::distance(probs.begin(), std::max_element(probs.begin(), probs.end()))); 604 | record(local.sampling, samp_start, Clock::now()); 605 | 606 | if (times) { 607 | local.total += elapsed_us(total_start, Clock::now()); 608 | times->accumulate(local); 609 | } 610 | } 611 | 612 | 613 | } // namespace 614 | 615 | int main() { 616 | ModelWeights weights; 617 | initialize(weights); 618 | 619 | std::vector prompt = {3, 17, 12, 8, 5, 9, 2, 0}; 620 | 621 | std::vector scalar_hidden; 622 | std::vector scalar_logits; 623 | std::vector simd_hidden; 624 | std::vector simd_logits; 625 | int scalar_next = -1; 626 | int simd_next = -1; 627 | 628 | constexpr int warmup = 5; 629 | for (int i = 0; i < warmup; ++i) { 630 | decode_impl(weights, prompt, scalar_hidden, scalar_logits, scalar_next, nullptr); 631 | decode_impl(weights, prompt, simd_hidden, simd_logits, simd_next, nullptr); 632 | } 633 | 634 | StageTimes scalar_accum; 635 | StageTimes simd_accum; 636 | constexpr int iterations = 20; 637 | for 
(int i = 0; i < iterations; ++i) {
638 | decode_impl<ScalarKernels>(weights, prompt, scalar_hidden, scalar_logits, scalar_next, &scalar_accum);
639 | }
640 | for (int i = 0; i < iterations; ++i) {
641 | decode_impl<SimdKernels>(weights, prompt, simd_hidden, simd_logits, simd_next, &simd_accum);
642 | }
643 |
644 | scalar_accum.scale(1.0 / iterations);
645 | simd_accum.scale(1.0 / iterations);
646 |
647 | float hidden_diff = max_abs_diff(scalar_hidden, simd_hidden);
648 | float logits_diff = max_abs_diff(scalar_logits, simd_logits);
649 |
650 | std::cout << "Tiny GPT block (scalar vs SIMD quantized)\n";
651 | std::cout << "Decoder blocks: " << weights.blocks.size() << "\n";
652 | std::cout << "Prompt tokens: ";
653 | for (size_t i = 0; i < prompt.size(); ++i) {
654 | if (i) std::cout << ", ";
655 | std::cout << prompt[i];
656 | }
657 | std::cout << "\n";
658 | std::cout << "Scalar next token: " << scalar_next << "\n";
659 | std::cout << "SIMD next token: " << simd_next << "\n";
660 | std::cout << "Max hidden diff: " << hidden_diff << "\n";
661 | std::cout << "Max logits diff: " << logits_diff << "\n";
662 |
663 | struct ComponentRow {
664 | std::string name;
665 | int count;
666 | double StageTimes::*field;
667 | };
668 |
669 | const int block_count = static_cast<int>(weights.blocks.size());
670 | const int per_head = block_count * NUM_HEADS;
671 |
672 | std::vector<ComponentRow> components = {
673 | {"embedding", 1, &StageTimes::embed},
674 | {"rmsnorm_1", block_count, &StageTimes::rms1},
675 | {"qkv_linear", block_count, &StageTimes::qkv},
676 | {"attention_scores", per_head, &StageTimes::attn_scores},
677 | {"attention_softmax", per_head, &StageTimes::attn_softmax},
678 | {"attention_context", per_head, &StageTimes::attn_context},
679 | {"attention_projection", block_count, &StageTimes::attn_proj},
680 | {"residual_1", block_count, &StageTimes::residual1},
681 | {"rmsnorm_2", block_count, &StageTimes::rms2},
682 | {"ffn_expand", block_count, &StageTimes::ffn_expand},
683 | {"activation", block_count, &StageTimes::activation},
684 | {"ffn_contract", block_count, &StageTimes::ffn_contract},
685 | {"residual_2", block_count, &StageTimes::residual2},
686 | {"logits_projection", 1, &StageTimes::logits},
687 | {"sampling", 1, &StageTimes::sampling}
688 | };
689 |
690 | double total_scalar = scalar_accum.total;
691 | double total_simd = simd_accum.total;
692 | double total_saved = total_scalar - total_simd;
693 |
694 | namespace fs = std::filesystem;
695 | fs::path out_dir = fs::current_path().parent_path().parent_path().parent_path() / "artifacts";
696 | fs::create_directories(out_dir);
697 |
698 | fs::path csv_path = out_dir / "tiny_gpt_components.csv";
699 | std::ofstream csv(csv_path);
700 | if (csv) {
701 | csv << "stage,count,scalar_total_us,simd_total_us,speedup,time_saved_us,contribution_pct\n";
702 | for (const auto& comp : components) {
703 | double scalar_val = scalar_accum.*(comp.field);
704 | double simd_val = simd_accum.*(comp.field);
705 | double saved = scalar_val - simd_val;
706 | double speedup = simd_val > 0.0 ? scalar_val / simd_val : 0.0;
707 | double pct = (total_saved != 0.0) ? (saved / total_saved * 100.0) : 0.0;
708 | csv << comp.name << ',' << comp.count << ',' << scalar_val << ',' << simd_val << ','
709 | << speedup << ',' << saved << ',' << pct << '\n';
710 | }
711 | double overall_speedup = total_simd > 0.0 ? total_scalar / total_simd : 0.0;
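// The trailing "overall" row reuses the count column for the decoder block count and pins
// contribution_pct at 100%; scripts/plot_results.py (plot_tiny_gpt) looks this row up by the
// literal stage name "overall" for the figure title and the scalar/SIMD/block-count footer.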
712 | csv << "overall," << block_count << ',' << total_scalar << ',' << total_simd << ','
713 | << overall_speedup << ',' << total_saved << ',' << 100.0 << '\n';
714 | } else {
715 | std::cerr << "Failed to write " << csv_path << "\n";
716 | }
717 |
718 | set_benchmark_suite("03_Examples/06_tiny_gpt");
719 | auto scalar_decode = [&]() {
720 | std::vector<float> hidden;
721 | std::vector<float> logits;
722 | int next = -1;
723 | decode_impl<ScalarKernels>(weights, prompt, hidden, logits, next, nullptr);
724 | };
725 | auto simd_decode = [&]() {
726 | std::vector<float> hidden;
727 | std::vector<float> logits;
728 | int next = -1;
729 | decode_impl<SimdKernels>(weights, prompt, hidden, logits, next, nullptr);
730 | };
731 |
732 | benchmark_comparison("tiny_gpt_decode", scalar_decode, simd_decode, 50);
733 |
734 | std::cout << "Scalar average total: " << total_scalar << " us\n";
735 | std::cout << "SIMD average total: " << total_simd << " us\n";
736 | std::cout << "Overall speedup: " << (total_simd > 0.0 ? total_scalar / total_simd : 0.0) << "x\n";
737 |
738 | return 0;
739 | }
740 | --------------------------------------------------------------------------------
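For reference, the int8 path in 06_tiny_gpt uses a simple per-row quantization scheme: each weight row gets a single scale max|w| / 127, entries are rounded and clamped to [-127, 127], and dot_q8_simd dequantizes on the fly as q[i] * scale * x[i]. The standalone sketch below (a hypothetical check program, quant_check.cpp, not part of this repository; sizes and seeds are illustrative) reproduces that arithmetic in plain scalar code so the SIMD kernel's output can be compared against an easily audited reference.

// quant_check.cpp -- hypothetical standalone sketch (not part of this repo).
// Mirrors the per-row int8 scheme used by quantize_into() / dot_q8_simd():
//   scale = max|w| / 127, q = clamp(round(w / scale), -127, 127),
//   dot  ~= sum_i q[i] * scale * x[i].
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
    constexpr int K = 64;  // illustrative row length, matches EMBED_DIM above
    std::mt19937 rng(7);
    std::uniform_real_distribution<float> dist(-0.8f, 0.8f);

    std::vector<float> w(K), x(K);
    for (float& v : w) v = dist(rng);
    for (float& v : x) v = dist(rng);

    // Per-row quantization: one scale for the whole row.
    float max_abs = 0.0f;
    for (float v : w) max_abs = std::max(max_abs, std::abs(v));
    float scale = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;

    std::vector<int8_t> q(K);
    for (int i = 0; i < K; ++i) {
        int v = static_cast<int>(std::round(w[i] / scale));
        q[i] = static_cast<int8_t>(std::max(-127, std::min(127, v)));
    }

    // Reference fp32 dot product vs. dequantized int8 dot product.
    float dot_fp32 = 0.0f, dot_q8 = 0.0f;
    for (int i = 0; i < K; ++i) {
        dot_fp32 += w[i] * x[i];
        dot_q8 += static_cast<float>(q[i]) * scale * x[i];
    }

    std::cout << "fp32 dot: " << dot_fp32 << "\n"
              << "int8 dot: " << dot_q8 << "\n"
              << "abs err : " << std::abs(dot_fp32 - dot_q8) << "\n";
    return 0;
}

The absolute error printed at the end gives a quick feel for how much accuracy the per-row scheme gives up, which is the same quantity the example surfaces as "Max hidden diff" / "Max logits diff".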