├── .gitignore ├── License.md ├── README.md ├── download_openelm.py ├── graviton_requirements.txt ├── img └── Learn on Arm_banner.png ├── lab1.ipynb ├── lab2.ipynb ├── lab3.ipynb ├── openelm_results ├── f16_grav_results.csv ├── f16_grav_results.txt ├── f16_pi_results.csv ├── f16_pi_results.txt ├── q4_grav_results.csv ├── q4_grav_results.txt ├── q4_pi_results.csv ├── q4_pi_results.txt ├── q8_grav_results.csv ├── q8_grav_results.txt ├── q8_pi_results.csv └── q8_pi_results.txt ├── scripts ├── bench_f32.sh ├── bench_q4.sh ├── benchmark_openelm.sh ├── chat_Q4.sh ├── chat_Q8.sh ├── chat_f32.sh ├── convert_openelm_to_gguf.sh ├── download_openelm_hf.py ├── f16_pi_results.txt ├── parse_results.py ├── q4_pi_results.txt └── q8_pi_results.txt ├── setup_graviton.sh ├── setup_pi5.sh ├── slides ├── chapter1.pptx ├── chapter2.pptx ├── chapter3.pptx └── chapter4.pptx └── src ├── c ├── .ipynb_checkpoints │ ├── benchmark_fp32_neon-checkpoint.c │ ├── benchmark_int8_neon-checkpoint.c │ └── benchmark_naive-checkpoint.c ├── benchmark_fp32_neon.c ├── benchmark_int8_neon.c └── benchmark_naive.c └── cpp ├── blas ├── CMakeLists.txt ├── benchmark_f32.cpp └── f32_inf.cpp ├── common └── sizes.cpp ├── f16_f16_f16p ├── CMakeLists.txt ├── benchmark_f16.cpp ├── benchmark_f16_scaling.cpp └── kai_f16_inf.cpp ├── f32_f32_f32p ├── CMakeLists.txt ├── benchmark_f32.cpp ├── benchmark_f32_scaling.cpp └── kai_f32_inf.cpp ├── f32_i8_i4_dotprod ├── CMakeLists.txt ├── benchmark_i8_dotprod.cpp ├── benchmark_i8_dotprod_scaling.cpp └── kai_i8_dotprod_inf.cpp ├── f32_i8_i4_i8mm ├── CMakeLists.txt ├── benchmark_i8_i8mm.cpp ├── benchmark_i8_i8mm_scaling.cpp └── kai_i8_i8mm_inf.cpp ├── naive ├── CMakeLists.txt ├── benchmark_naive.cpp └── kernel.h └── results ├── blas_f32_scaling_results.csv ├── f16_scaling_results.csv ├── f32_scaling_results.csv ├── i8_dotprod_scaling_results.csv └── i8_i8mm_scaling_results.csv /.gitignore: -------------------------------------------------------------------------------- 1 | pi5_env/* 
2 | models/* 3 | llama.cpp/* 4 | graviton_env/* -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | ARM EDUCATION 2 | END USER LICENSE AGREEMENT FOR TEACHING AND LEARNING CONTENT 3 | These terms are displayed for you to read prior to use of the teaching and learning content and any related documentation and materials (“Materials”) made available to you by Arm Limited (“Arm”). If you choose not to agree with these terms, do not use or access the Materials. 4 | 5 | THESE TERMS AND CONDITIONS CONSTITUTE A LEGALLY BINDING AGREEMENT BETWEEN YOU AND ARM (“AGREEMENT”). The Materials may include third party elements, and/or hardware distributed on behalf of a third party, which may be subject to separate terms of use. You acknowledge that any hardware provided to you is a third-party product and Arm is solely distributing such hardware on behalf of the third party. To the fullest extent permitted by law, Arm shall have no responsibility or liability in respect of any such hardware. 6 | 7 | You acknowledge that the Materials are specifically designed and licensed only for non-commercial, educational purposes. 8 | 9 | INTELLECTUAL PROPERTY. The Materials are protected by copyright laws, international copyright treaties, and trade secret laws, and other intellectual property laws and treaties around the world. The Materials are licensed, not sold, to you, and can only be used in accordance with the terms of this Agreement. Arm and its licensors retain title and ownership of the Materials (other than in respect of any third party elements), including all intellectual property rights in the Materials. Arm reserves all rights not specifically granted under this Agreement. 10 | 11 | LICENCE. 
Subject to your compliance with this Agreement and unless otherwise specified in Clause 13, Arm hereby grants to you a limited, non-exclusive, non-transferable, royalty-free, licence under its copyright subsisting in the Materials to use, copy and modify the Materials (and if and only if you are a current member of an educational institution, make the Materials available to others within your educational institution) upon the terms of this Agreement, solely for non-commercial educational purposes. For the purposes of these terms, “educational purposes” means internal use in taught classes and related projects at educational institutions, including but not limited to schools, colleges and universities. Unless otherwise specified in Clause 13: (a) you may refer to and cite the Materials in publications provided that you give a full citation referencing Arm as the author and owner of the Materials; (b) you may not use the Materials for any commercial purpose, except as expressly approved by Arm in writing (contact education@arm.com to request permission); (c) you may not remove any proprietary notice(s) contained in the Materials; (d) you may not use the Materials to design or to manufacture Arm-based processors, or use the Materials for the purposes of adapting or developing resources or educational materials relating to non-Arm based architectures and platforms; and (e) you may not modify the Materials for use with non-Arm-based architectures. You may not use the Materials for any unlawful purpose, you may not (except as far as permitted by applicable law) reverse engineer or decompile any software tools delivered as part of the Materials, and you may not use the Materials in any way that may damage Arm’s reputation or bring Arm into disrepute. The licence granted by Arm pursuant to this Clause 5 is effective until terminated. 
Without prejudice to any other rights, Arm may terminate this Agreement and your right to use the Materials if you are in material breach of any of these terms or if you do anything that infringes Arm’s intellectual property rights. Termination will be effective immediately on the service of Arm’s notice of termination to you. In the event of termination, you shall destroy all copies of the Materials, including all portions and derivatives of them, and cease all use of the Materials immediately. Clauses impliedly or expressly intended to survive termination shall continue in full force and effect. 12 | 13 | WARRANTIES AND LIMITATIONS. a. THE MATERIALS ARE PROVIDED “AS IS”. ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED OR STATUTORY, ARE HEREBY EXCLUDED, INCLUDING, WITHOUT LIMITATION, THOSE RELATING TO MERCHANTABILITY, SATISFACTORY QUALITY, FREEDOM FROM DEFECTS, RELIABILITY, AVAILABILITY, ACCURACY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. 14 | 15 | b. IN NO EVENT SHALL ARM BE LIABLE UNDER THIS AGREEMENT OR OTHERWISE IN CONNECTION WITH THE MATERIALS FOR ANY INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES WHETHER SUCH DAMAGES ARE ALLEGED AS A RESULT OF TORTIOUS CONDUCT OR BREACH OF CONTRACT OR OTHERWISE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 16 | 17 | c. THE MAXIMUM LIABILITY OF ARM TO YOU IN AGGREGATE FOR ALL CLAIMS MADE AGAINST ARM IN CONTRACT, TORT OR OTHERWISE UNDER OR IN CONNECTION WITH THIS AGREEMENT, OR OTHERWISE IN CONNECTION WITH THE MATERIALS SHALL NOT EXCEED THE GREATER OF ONE HUNDRED U.S. DOLLARS ($100 USD) OR THE TOTAL OF SUMS PAID, IF ANY, BY YOU TO ARM UNDER THIS AGREEMENT. 18 | 19 | d. Nothing in this section shall operate to exclude or limit liability for: (i) death or personal injury resulting from either party's negligence; or (ii) fraud; or (iii) any other matters in respect of which by law liability cannot be excluded or limited. 20 | 21 | EXPORT CONTROL. 
You hereby acknowledge and agree that the Materials are subject to U.S. export control laws, including the U.S. Export Administration Act and its associated regulations, and may be subject to export or import regulations in other countries. You agree to comply fully with all export laws and regulations of the United States and other countries (“Export Laws”) to assure that neither the Materials, nor any direct products of them are; ( i ) exported, directly or indirectly, in violation of Export Laws, either to any countries that are subject to U.S export restrictions or to any end user who has been prohibited from participating in the U.S. export transactions by any federal agency of the U.S. government; or (ii) intended to be used for any purpose prohibited by Export Laws, including, without limitation, nuclear, chemical, or biological weapons proliferation. The Materials consist solely of commercial items. If applicable, your institution shall be responsible for ensuring that any Materials provided to the US Government in accordance with the terms of this Agreement are provided with the rights and subject to restrictions described elsewhere in this Agreement. 22 | 23 | CONTRIBUTIONS, FEEDBACK AND MODIFICATIONS. If you wish to submit any contributions or feedback (“Contributions”) to Arm you may do subject to you granting Arm the license in this Clause 8, You hereby grant to Arm a perpetual, irrevocable, non-exclusive, royalty-free, fully paid-up, worldwide licence under all your intellectual property rights subsisting in the Contributions to (i) use, copy, create derivative works of and modify the Contributions; (ii) sell supply, distribute the Contributions solely as incorporated into or in conjunction with the Materials; and (iii) and sublicense the rights granted in Clause 8(1) to third parties. 
To the extent that any claim in any patent application filed by you would not have been conceived by you but for having had access to the Materials, such claim shall be deemed to be feedback and is hereby licensed to Arm in accordance with the provisions of this Clause 8. You represent that you are legally entitled to grant the licence set out in this Clause 8. If your employer has rights to intellectual property that you submit to Arm, you represent that you have permission to make such submissions on behalf of your employer. You further represent that any contribution, suggestion, comment, or feedback you provide to Arm is your original creation. You represent and agree that you will not submit any third-party intellectual property or materials to Arm without Arm’s express prior written approval. You agree to notify Arm promptly of any circumstances or facts of which you become aware that would make any of these representations inaccurate in any respect. If you have any questions or wish to contact Arm in relation to the contribution process, please contact education@arm.com. 24 | 25 | GOVERNING LAW. This Agreement and any disputes or claims arising out of or in connection with it or its subject matter or formation (including non-contractual disputes or claims) are governed by and construed in accordance with the laws of England and Wales. 26 | 27 | PERSONAL DATA. Arm will process any personal data in accordance with its privacy policy: www.arm.com/company/policies/privacy. 28 | 29 | UPDATES. Arm may update this Agreement from time to time. Where appropriate, updates to the Agreement will be notified to you by email. If you do not agree with an update, you should cease use of the Materials. 30 | 31 | CONTACTING US. If you have any queries in relation to this Agreement or the Materials, please email education@arm.com. 32 | 33 | SPECIAL CONDITIONS. 
The provisions of this Clause 13 shall take precedence in the event of any conflict with the remainder of this Agreement. 34 | 35 | 13.1. Arm Education Core: Use of the pedagogical processor model provided with the Introduction to Computer Architecture Education Kit (the "Arm Education Core") is subject to the following special conditions: 36 | 37 | (a) you may only refer to or publish the Arm Education Core, or any modifications you make to the Arm Education Core or any results you obtain from its use, in whole or in part if you have first obtained Arm's specific prior written consent by emailing us at the address specified in Clause 12 above; 38 | 39 | (b) you may use the Arm Education Core to synthesize and implement an SoC on FPGA or for an ASIC implementation of the SoC for non-commercial educational purposes (as defined in Clause 5 above); 40 | 41 | (c) you may not make any claims about the Arm Education Core's functionality or otherwise other than those published by Arm. The Arm Education Core is not representative of any commercially available Arm processors and you may not make any statements that indicate or imply the contrary. 42 | 43 | 13.2. Knowledge, Skills and Abilities Framework (the “KSA Framework”): Use of the KSA Framework is subject to the following special conditions: 44 | 45 | (a) You may use, copy and modify the KSA Framework for internal use only; 46 | 47 | (b) If you modify the KSA Framework, the modified version shall include attribution to Arm as the source of the original KSA Framework. Attribution shall be in the following format or a format which provides substantially similar information: “Arm Limited (2024). 
Knowledge, Skills, and Abilities Framework, version 1.0”; 48 | 49 | (c) You may not distribute or grant any sublicenses in respect of the KSA Framework, or any modified version of the KSA Framework modified by you under the license granted in Clause 13.2(a), except as expressly approved by Arm in writing (contact education@arm.com to request approval) and subject to the terms and conditions set out in such written approval from Arm; and 50 | 51 | (d) You shall not make any representations or warranties on behalf of Arm in respect of the KSA Framework, or in respect of modified versions of the KSA Framework created by You under the license granted in Clause 13.2(a). Furthermore, you will not represent or imply that Arm endorses any third-party products, materials or services, including without limitation any modified versions of the KSA Framework created by you under the terms of this Agreement, except as expressly permitted by Arm in writing separately. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **Optimizing Generative AI on Arm Processors** 2 | 3 | ![learn_on_arm](./img/Learn%20on%20Arm_banner.png) 4 | 5 | ## Important 6 | ### To download the latest stable version, please click below instead of using the "Download ZIP" button. 7 | ### [Click here to download](https://github.com/arm-university/AI-on-Arm/releases/download/v2.1.0/AI-on-Arm-main.zip) 8 | 9 | Welcome to **Optimizing Generative AI on Arm Processors**, a hands-on course designed to help you optimize generative AI workloads on Arm architectures. Through practical labs and structured lectures, you will learn how to deploy AI models efficiently across different Arm-based environments. 10 | 11 | ## Course Structure 12 | 13 | This course consists of three hands-on labs and four lectures. 
14 | 15 | ### Hands-On Labs 16 | - **Lab 1**: Optimizing generative AI on mobile devices, such as the Raspberry Pi 5. 17 | - **Lab 2**: Deploying AI workloads on Arm-based cloud servers, including AWS Graviton. 18 | - **Lab 3**: Comparing cloud vs. edge inference, analyzing challenges and trade-offs. 19 | 20 | ### Lecture Series 21 | Inside the `slides/` folder, you will find four lectures covering the key concepts and challenges in AI inference on Arm: 22 | 23 | 1. **Challenges Facing Cloud and Edge GenAI Inference** – Understanding the limitations and constraints of AI inference in different environments. 24 | 2. **Generative AI Models** – Exploring model architectures, training methodologies, and deployment considerations. 25 | 3. **ML Frameworks and Optimized Libraries** – A deep dive into AI software stacks, including PyTorch, ONNX Runtime, and Arm-specific optimizations. 26 | 4. **Optimization for CPU Inference** – Techniques such as quantization, pruning, and leveraging SIMD instructions for faster AI performance. 27 | 28 | ## What You'll Learn 29 | 30 | You will learn how to optimize AI inference using Arm-specific techniques such as SIMD (SVE, Neon) and low-bit quantization. The course covers practical strategies for running generative AI efficiently on mobile, edge, and cloud-based Arm platforms. You will also explore the trade-offs between cloud and edge deployment, gaining both theoretical knowledge and hands-on skills. 31 | 32 | By the end of this course, you will have a strong foundation in deploying high-performance AI models on Arm hardware. 33 | 34 | ## Requirements 35 | 36 | This course assumes a foundational understanding of machine learning, including completion of a basic introductory course, such as one at the undergraduate level. 37 | 38 | Additionally, to run the laboratory exercises, we assume you have access to a Raspberry Pi 5 and an Arm-based cloud instance. 
We have validated this on an AWS Graviton instance, but it is expected to work with other cloud service providers. 39 | 40 | 41 | --- 42 | 43 | ## **Getting Started** 44 | 45 | ### **Lab 1: Optimizing Generative AI on Raspberry Pi** 46 | 47 | 1. **Run the setup script** 48 | Open a terminal in the project directory and execute the setup script: 49 | ```bash 50 | ./setup_pi5.sh 51 | ``` 52 | Please note: this step can take around 30 minutes to complete. 53 | 54 | 2. **Log in to a Hugging Face account** 55 | ```bash 56 | huggingface-cli login 57 | ``` 58 | 3. **Open the course material** 59 | The course material is provided as Jupyter notebooks. To access the content: 60 | ```bash 61 | source pi5_env/bin/activate 62 | jupyter lab 63 | ``` 64 | 65 | 4. Follow the instructions provided in `lab1.ipynb` to complete the lab. 66 | 67 | --- 68 | 69 | ### **Lab 2: Optimizing Generative AI on Arm Servers** 70 | 71 | 1. **Launch an AWS EC2 instance** 72 | - Go to Amazon EC2 and create a new instance. 73 | - **Select key pair**: Create a key for SSH connection (e.g., `yourkey.pem`). 74 | - **Choose an AMI**: Use the `Ubuntu 22.04` AMI as the operating system. 75 | - **Instance type**: Select `m7g.xlarge` (Graviton-based instance with Arm Neoverse cores). 76 | - **Storage**: Add 32 GB of root storage. 77 | 78 | 2. **Connect to the instance via SSH** 79 | Use the following command to establish an SSH connection (replace with your instance details): 80 | ```bash 81 | ssh -i "yourkey.pem" -L 8888:localhost:8888 ubuntu@ 82 | ``` 83 | 84 | 3. **Clone the repository** 85 | Once connected to the instance, clone the repository: 86 | ```bash 87 | git clone https://github.com/arm-university/AI-on-Arm.git 88 | ``` 89 | 90 | 4. **Run the setup script** 91 | Change to the repository directory and run the setup script: 92 | ```bash 93 | ./setup_graviton.sh 94 | ``` 95 | 96 | 5. 
**Activate the virtual environment and log in to Hugging Face** 97 | After the setup completes, activate the virtual environment: 98 | ```bash 99 | source graviton_env/bin/activate 100 | huggingface-cli login 101 | ``` 102 | (You will need to log in to Hugging Face to download the required large language model.) 103 | 104 | 6. **Launch the lab** 105 | Start Jupyter Lab by running: 106 | ```bash 107 | jupyter lab 108 | ``` 109 | Copy the link provided in the terminal output, open it in your local browser, and follow the instructions in the notebooks. 110 | 111 | --- 112 | 113 | ### **Lab 3: Comparative Inference Benchmarking on Arm Server and Edge Devices** 114 | 115 | 1. Follow the setup steps for `lab1` on your local Raspberry Pi. 116 | 2. Follow the setup steps for `lab2` from your Raspberry Pi to create and connect to a cloud instance. 117 | 3. Open `lab3.ipynb` to find the instructions for completing the lab. 118 | 119 | --- 120 | 121 | ## **Additional Notes** 122 | - To complete this course you are required to have access to a Raspberry Pi 5; for the cloud sections, AWS can be utilised. 123 | - For Labs 2 and 3, make sure to terminate the EC2 instance when you're done to avoid unnecessary charges. 124 | 125 | **Happy learning!** 126 | 127 | **Note:** The primary content writer for this course is an AI researcher, [Oliver Grainge](https://github.com/OliverGrainge). 
128 | -------------------------------------------------------------------------------- /download_openelm.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import snapshot_download 2 | 3 | # Specify the target directory for downloading the model 4 | target_directory = "models/hf_models/OpenELM-3B-Instruct" 5 | 6 | # Download a snapshot of the model repository 7 | snapshot_download( 8 | repo_id="apple/OpenELM-3B-Instruct", 9 | local_dir=target_directory, 10 | revision="main", # Optional: specify a branch, tag, or commit hash 11 | local_dir_use_symlinks=False # Set to True if you want symlinks instead of file copies 12 | ) 13 | # Download a snapshot of the tokenizer 14 | snapshot_download( 15 | repo_id="meta-llama/Llama-2-7b-hf", 16 | local_dir="models/tokenizer/llama2", 17 | allow_patterns=["tokenizer.model", "tokenizer_config.json", "special_tokens_map.json"], 18 | local_dir_use_symlinks=False 19 | ) -------------------------------------------------------------------------------- /graviton_requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | numpy==2.3.3 3 | matplotlib==3.10.6 4 | pandas==2.3.3 5 | transformers==4.39.3 6 | jupyterlab==4.4.9 7 | ipykernel==6.30.1 8 | ipywidgets==8.1.7 9 | seaborn==0.13.2 10 | torch==2.8.0 11 | expecttest==0.3.0 12 | mistral-common==1.8.5 13 | sentencepiece==0.2.1 14 | -------------------------------------------------------------------------------- /img/Learn on Arm_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/img/Learn on Arm_banner.png -------------------------------------------------------------------------------- /openelm_results/f16_grav_results.csv: -------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params 
(B),Threads,Test,Speed (t/s),Error (t/s) 2 | openelm 3B F16,5.66,3.04,1,pp12,6.25,0.02 3 | openelm 3B F16,5.66,3.04,1,tg6,4.37,0.02 4 | openelm 3B F16,5.66,3.04,2,pp12,11.85,0.01 5 | openelm 3B F16,5.66,3.04,2,tg6,8.07,0.01 6 | openelm 3B F16,5.66,3.04,3,pp12,17.35,0.01 7 | openelm 3B F16,5.66,3.04,3,tg6,11.15,0.03 8 | openelm 3B F16,5.66,3.04,4,pp12,22.61,0.01 9 | openelm 3B F16,5.66,3.04,4,tg6,14.17,0.06 10 | -------------------------------------------------------------------------------- /openelm_results/f16_grav_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 1 | pp12 | 6.25 ± 0.02 | 4 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 1 | tg6 | 4.37 ± 0.02 | 5 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 2 | pp12 | 11.85 ± 0.01 | 6 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 2 | tg6 | 8.07 ± 0.01 | 7 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 3 | pp12 | 17.35 ± 0.01 | 8 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 3 | tg6 | 11.15 ± 0.03 | 9 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 4 | pp12 | 22.61 ± 0.01 | 10 | | openelm 3B F16 | 5.66 GiB | 3.04 B | CPU | 4 | tg6 | 14.17 ± 0.06 | 11 | 12 | build: f11cfdfd (4489) 13 | -------------------------------------------------------------------------------- /openelm_results/f16_pi_results.csv: -------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params (B),Threads,Test,Speed (t/s),Error (t/s) 2 | -------------------------------------------------------------------------------- /openelm_results/f16_pi_results.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/openelm_results/f16_pi_results.txt -------------------------------------------------------------------------------- /openelm_results/q4_grav_results.csv: -------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params (B),Threads,Test,Speed (t/s),Error (t/s) 2 | openelm 3B Q4_0,1.62,3.04,1,pp12,27.33,0.06 3 | openelm 3B Q4_0,1.62,3.04,1,tg6,10.23,0.01 4 | openelm 3B Q4_0,1.62,3.04,2,pp12,47.95,0.23 5 | openelm 3B Q4_0,1.62,3.04,2,tg6,18.24,0.01 6 | openelm 3B Q4_0,1.62,3.04,3,pp12,68.97,0.03 7 | openelm 3B Q4_0,1.62,3.04,3,tg6,25.99,0.01 8 | openelm 3B Q4_0,1.62,3.04,4,pp12,88.43,0.41 9 | openelm 3B Q4_0,1.62,3.04,4,tg6,33.18,0.02 10 | -------------------------------------------------------------------------------- /openelm_results/q4_grav_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 1 | pp12 | 27.33 ± 0.06 | 4 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 1 | tg6 | 10.23 ± 0.01 | 5 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 2 | pp12 | 47.95 ± 0.23 | 6 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 2 | tg6 | 18.24 ± 0.01 | 7 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 3 | pp12 | 68.97 ± 0.03 | 8 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 3 | tg6 | 25.99 ± 0.01 | 9 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 4 | pp12 | 88.43 ± 0.41 | 10 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 4 | tg6 | 33.18 ± 0.02 | 11 | 12 | build: f11cfdfd (4489) 13 | -------------------------------------------------------------------------------- /openelm_results/q4_pi_results.csv: 
-------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params (B),Threads,Test,Speed (t/s),Error (t/s) 2 | openelm 3B Q4_0,1.62,3.04,1,pp12,11.2,0.2 3 | openelm 3B Q4_0,1.62,3.04,1,tg6,5.41,0.21 4 | openelm 3B Q4_0,1.62,3.04,2,pp12,19.65,2.12 5 | openelm 3B Q4_0,1.62,3.04,2,tg6,5.57,0.27 6 | openelm 3B Q4_0,1.62,3.04,3,pp12,21.43,1.14 7 | openelm 3B Q4_0,1.62,3.04,3,tg6,4.91,0.24 8 | openelm 3B Q4_0,1.62,3.04,4,pp12,22.37,1.38 9 | openelm 3B Q4_0,1.62,3.04,4,tg6,5.12,0.51 10 | -------------------------------------------------------------------------------- /openelm_results/q4_pi_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 1 | pp12 | 11.20 ± 0.20 | 4 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 1 | tg6 | 5.41 ± 0.21 | 5 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 2 | pp12 | 19.65 ± 2.12 | 6 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 2 | tg6 | 5.57 ± 0.27 | 7 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 3 | pp12 | 21.43 ± 1.14 | 8 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 3 | tg6 | 4.91 ± 0.24 | 9 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 4 | pp12 | 22.37 ± 1.38 | 10 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | CPU | 4 | tg6 | 5.12 ± 0.51 | 11 | 12 | build: 1d850433 (4488) 13 | -------------------------------------------------------------------------------- /openelm_results/q8_grav_results.csv: -------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params (B),Threads,Test,Speed (t/s),Error (t/s) 2 | openelm 3B Q8_0,3.01,3.04,1,pp12,10.15,0.03 3 | openelm 3B Q8_0,3.01,3.04,1,tg6,5.29,0.03 4 | openelm 3B Q8_0,3.01,3.04,2,pp12,19.05,0.01 5 | openelm 3B 
Q8_0,3.01,3.04,2,tg6,8.89,0.01 6 | openelm 3B Q8_0,3.01,3.04,3,pp12,27.91,0.02 7 | openelm 3B Q8_0,3.01,3.04,3,tg6,12.72,0.02 8 | openelm 3B Q8_0,3.01,3.04,4,pp12,36.56,0.01 9 | openelm 3B Q8_0,3.01,3.04,4,tg6,16.63,0.02 10 | -------------------------------------------------------------------------------- /openelm_results/q8_grav_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 1 | pp12 | 10.15 ± 0.03 | 4 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 1 | tg6 | 5.29 ± 0.03 | 5 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 2 | pp12 | 19.05 ± 0.01 | 6 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 2 | tg6 | 8.89 ± 0.01 | 7 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 3 | pp12 | 27.91 ± 0.02 | 8 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 3 | tg6 | 12.72 ± 0.02 | 9 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 4 | pp12 | 36.56 ± 0.01 | 10 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 4 | tg6 | 16.63 ± 0.02 | 11 | 12 | build: f11cfdfd (4489) 13 | -------------------------------------------------------------------------------- /openelm_results/q8_pi_results.csv: -------------------------------------------------------------------------------- 1 | Model,Size (GiB),Params (B),Threads,Test,Speed (t/s),Error (t/s) 2 | openelm 3B Q8_0,3.01,3.04,1,pp12,4.81,0.07 3 | openelm 3B Q8_0,3.01,3.04,1,tg6,3.12,0.14 4 | openelm 3B Q8_0,3.01,3.04,2,pp12,9.35,0.06 5 | openelm 3B Q8_0,3.01,3.04,2,tg6,2.99,0.28 6 | openelm 3B Q8_0,3.01,3.04,3,pp12,13.27,0.32 7 | openelm 3B Q8_0,3.01,3.04,3,tg6,2.67,0.15 8 | openelm 3B Q8_0,3.01,3.04,4,pp12,14.03,1.28 9 | openelm 3B Q8_0,3.01,3.04,4,tg6,2.29,0.27 10 | -------------------------------------------------------------------------------- 
/openelm_results/q8_pi_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 1 | pp12 | 4.81 ± 0.07 | 4 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 1 | tg6 | 3.12 ± 0.14 | 5 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 2 | pp12 | 9.35 ± 0.06 | 6 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 2 | tg6 | 2.99 ± 0.28 | 7 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 3 | pp12 | 13.27 ± 0.32 | 8 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 3 | tg6 | 2.67 ± 0.15 | 9 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 4 | pp12 | 14.03 ± 1.28 | 10 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | CPU | 4 | tg6 | 2.29 ± 0.27 | 11 | 12 | build: 1d850433 (4488) 13 | -------------------------------------------------------------------------------- /scripts/bench_f32.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | llama-bench -m models/gguf_models/OpenELM-3B-Instruct-f32.gguf -p 128 -n 1 --threads 1 -------------------------------------------------------------------------------- /scripts/bench_q4.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | llama-bench -m models/gguf_models/OpenELM-3B-Instruct-q4_0.gguf -------------------------------------------------------------------------------- /scripts/benchmark_openelm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Benchmark the Q4 and Q8 quantized models and write the results into openelm_results/ 4 | mkdir -p openelm_results 5 | llama.cpp/build/bin/llama-bench -m models/gguf_models/OpenELM-3B-Instruct-q4_0.gguf -p 12 -n 6 --threads 1,2,3,4 | tee openelm_results/q4_pi_results.txt 6 | llama.cpp/build/bin/llama-bench -m models/gguf_models/OpenELM-3B-Instruct-q8_0.gguf -p 12 -n 6 --threads 1,2,3,4 | tee openelm_results/q8_pi_results.txt 7 | 8 | # NOTE: we are not benchmarking f16 on the Raspberry Pi 5 as the benchmark takes too long 9 | # llama.cpp/build/bin/llama-bench -m models/gguf_models/OpenELM-3B-Instruct-f16.gguf -p 12 -n 6 --threads 1,2,3,4 | tee openelm_results/f16_pi_results.txt 10 | 11 | 12 | -------------------------------------------------------------------------------- /scripts/chat_Q4.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | llama-cli -m models/gguf_models/OpenELM-3B-Instruct-q4_0.gguf -p "Could you write a very simple program in C++ to print hello world in less than 10 lines of code?" -------------------------------------------------------------------------------- /scripts/chat_Q8.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | 4 | llama.cpp/build/bin/llama-cli -m models/gguf_models/OpenELM-3B-Instruct-q8_0.gguf -p "Could you write a very simple program in C++ to print hello world in less than 10 lines of code?" -------------------------------------------------------------------------------- /scripts/chat_f32.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | llama-cli -m models/gguf_models/OpenELM-3B-Instruct-f32.gguf -p "Could you write a very simple program in C++ to print hello world in less than 10 lines of code?" -------------------------------------------------------------------------------- /scripts/convert_openelm_to_gguf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python llama.cpp/convert_hf_to_gguf.py models/hf_models/OpenELM-3B-Instruct/ --outfile models/gguf_models/OpenELM-3B-Instruct-{ftype}.gguf --outtype f32 4 | 5 | llama-quantize models/gguf_models/OpenELM-3B-Instruct-f32.gguf models/gguf_models/OpenELM-3B-Instruct-f16.gguf F16 6 | llama-quantize models/gguf_models/OpenELM-3B-Instruct-f32.gguf models/gguf_models/OpenELM-3B-Instruct-q8_0.gguf Q8_0 7 | llama-quantize models/gguf_models/OpenELM-3B-Instruct-f32.gguf models/gguf_models/OpenELM-3B-Instruct-q4_0.gguf Q4_0 8 | 9 | -------------------------------------------------------------------------------- /scripts/download_openelm_hf.py: -------------------------------------------------------------------------------- 1 | from huggingface_hub import snapshot_download 2 | 3 | # Specify the target directory for downloading the model 4 | target_directory = "models/hf_models/OpenELM-3B-Instruct" 5 | 6 | # Download a snapshot of the model repository 7 | snapshot_download( 8 | repo_id="apple/OpenELM-3B-Instruct", 9 | local_dir=target_directory, 10 | revision="main", # Optional: specify a branch, tag, or commit hash 11 | local_dir_use_symlinks=False # Set to True if you want symlinks instead of file copies 12 | ) -------------------------------------------------------------------------------- /scripts/f16_pi_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: |
-------------------: | 3 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 1 | pp12 | 197.89 ± 1.12 | 4 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 1 | tg6 | 25.89 ± 0.08 | 5 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 2 | pp12 | 198.04 ± 0.44 | 6 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 2 | tg6 | 25.93 ± 0.03 | 7 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 3 | pp12 | 198.50 ± 0.40 | 8 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 3 | tg6 | 25.93 ± 0.07 | 9 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 4 | pp12 | 198.10 ± 0.18 | 10 | | openelm 3B F16 | 5.66 GiB | 3.04 B | Metal,BLAS | 4 | tg6 | 25.91 ± 0.04 | 11 | 12 | build: 0ccd7f3 (1) 13 | -------------------------------------------------------------------------------- /scripts/parse_results.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import sys 5 | 6 | # Check if a file argument is provided 7 | if len(sys.argv) != 2: 8 | print("Usage: python script.py ") 9 | sys.exit(1) 10 | 11 | results_file = sys.argv[1] 12 | 13 | # Load the results from the specified file 14 | with open(results_file, "r") as f: 15 | lines = f.readlines() 16 | 17 | # Extract relevant data using regex 18 | data = [] 19 | pattern = re.compile(r"\|\s+([\w\s]+)\s+\|\s+([\d.]+)\s+GiB\s+\|\s+([\d.]+)\s+B\s+\|\s+CPU\s+\|\s+(\d+)\s+\|\s+(\w+)\s+\|\s+([\d.]+)\s+±\s+([\d.]+)\s+\|") 20 | for line in lines: 21 | match = pattern.search(line) 22 | if match: 23 | model, size, params, threads, test, speed, error = match.groups() 24 | data.append([model.strip(), float(size), float(params), int(threads), test, float(speed), float(error)]) 25 | 26 | # Convert to DataFrame 27 | df = pd.DataFrame(data, columns=["Model", "Size (GiB)", "Params (B)", "Threads", "Test", "Speed (t/s)", "Error (t/s)"]) 28 | 29 | 30 | 31 | output_file = results_file.replace('.txt', '.csv') 32 | 
df.to_csv(output_file, index=False) 33 | -------------------------------------------------------------------------------- /scripts/q4_pi_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 1 | pp12 | 190.94 ± 1.04 | 4 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 1 | tg6 | 65.64 ± 0.17 | 5 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 2 | pp12 | 188.98 ± 0.60 | 6 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 2 | tg6 | 65.49 ± 0.54 | 7 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 3 | pp12 | 188.56 ± 0.93 | 8 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 3 | tg6 | 64.91 ± 0.50 | 9 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 4 | pp12 | 188.42 ± 0.63 | 10 | | openelm 3B Q4_0 | 1.62 GiB | 3.04 B | Metal,BLAS | 4 | tg6 | 65.53 ± 0.24 | 11 | 12 | build: 0ccd7f3 (1) 13 | -------------------------------------------------------------------------------- /scripts/q8_pi_results.txt: -------------------------------------------------------------------------------- 1 | | model | size | params | backend | threads | test | t/s | 2 | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: | 3 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 1 | pp12 | 184.20 ± 0.61 | 4 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 1 | tg6 | 43.02 ± 0.22 | 5 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 2 | pp12 | 182.18 ± 0.64 | 6 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 2 | tg6 | 42.96 ± 0.33 | 7 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 3 | pp12 | 184.52 ± 0.65 | 8 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 3 | tg6 | 43.23 ± 0.11 | 9 
| | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 4 | pp12 | 184.23 ± 0.52 | 10 | | openelm 3B Q8_0 | 3.01 GiB | 3.04 B | Metal,BLAS | 4 | tg6 | 43.01 ± 0.14 | 11 | 12 | build: 0ccd7f3 (1) 13 | -------------------------------------------------------------------------------- /setup_graviton.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "=======================================================================" 6 | echo " 0. Update package lists" 7 | echo "=======================================================================" 8 | sudo apt update 9 | 10 | 11 | echo "=======================================================================" 12 | echo " 1. Add essential development packages" 13 | echo "=======================================================================" 14 | sudo apt install -y \ 15 | wget build-essential libssl-dev libbz2-dev libreadline-dev libsqlite3-dev \ 16 | zlib1g-dev libncurses-dev libffi-dev libgdbm-dev liblzma-dev uuid-dev \ 17 | tk-dev python3-pip libblas-dev \ 18 | linux-tools-common linux-tools-$(uname -r) \ 19 | libelf-dev cmake clang llvm llvm-dev 20 | 21 | echo "=======================================================================" 22 | echo " 2. Verify perf installation" 23 | echo "=======================================================================" 24 | if command -v perf >/dev/null 2>&1; then 25 | echo "perf installed successfully." 26 | else 27 | echo "Error: perf installation failed." 28 | exit 1 29 | fi 30 | 31 | echo "=======================================================================" 32 | echo " 3. Add deadsnakes PPA for Python 3.12" 33 | echo "=======================================================================" 34 | sudo add-apt-repository ppa:deadsnakes/ppa -y 35 | sudo apt-get update 36 | 37 | echo "=======================================================================" 38 | echo " 4. 
Install Python 3.12 and related tools" 39 | echo "=======================================================================" 40 | sudo apt install -y gcc g++ build-essential google-perftools \ 41 | python3.12 python3.12-venv python3.12-dev 42 | 43 | echo "=======================================================================" 44 | echo " 5. Create (or recreate) Python 3.12 virtual environment 'graviton_env'" 45 | echo "=======================================================================" 46 | if [ -d graviton_env ]; then 47 | echo "Removing existing virtual environment 'graviton_env'..." 48 | rm -rf graviton_env 49 | fi 50 | 51 | python3.12 -m venv graviton_env 52 | 53 | echo "=======================================================================" 54 | echo " 6. Activate the virtual environment" 55 | echo "=======================================================================" 56 | # shellcheck disable=SC1091 57 | source graviton_env/bin/activate 58 | 59 | echo "=======================================================================" 60 | echo " 7. Upgrade pip" 61 | echo "=======================================================================" 62 | python3.12 -m pip install --upgrade pip 63 | 64 | echo "=======================================================================" 65 | echo " 8. Install Python packages from graviton_requirements.txt" 66 | echo "=======================================================================" 67 | if [ -f "graviton_requirements.txt" ]; then 68 | python3.12 -m pip install -r graviton_requirements.txt 69 | else 70 | echo "Error: graviton_requirements.txt not found!" 71 | exit 1 72 | fi 73 | 74 | 75 | 76 | #################### 77 | # STEP 15: Clone and build processwatch (if not already cloned) 78 | ############################################################################# 79 | echo "=======================================================================" 80 | echo " 15. 
Clone and build 'processwatch'" 81 | echo "=======================================================================" 82 | 83 | # Just in case, re-install the dev packages, though they should already be present: 84 | sudo apt-get update 85 | sudo apt-get install -y libelf-dev cmake clang llvm llvm-dev 86 | sudo apt-get update && sudo apt-get upgrade -y 87 | 88 | if [ ! -d "processwatch" ]; then 89 | #git clone --recursive https://github.com/intel/processwatch.git 90 | git clone --recursive https://github.com/grahamwoodward/processwatch.git 91 | else 92 | echo "processwatch folder already exists. Skipping clone." 93 | fi 94 | sudo apt-get install -y linux-tools-generic 95 | cd processwatch 96 | ./build.sh 97 | cd .. 98 | echo "ubuntu ALL=(ALL) NOPASSWD: /home/ubuntu/processwatch/processwatch" | sudo tee /etc/sudoers.d/99-processwatch 99 | sudo chmod 0440 /etc/sudoers.d/99-processwatch 100 | ############################################################################# 101 | 102 | echo "=======================================================================" 103 | echo "Setup script completed successfully!" 104 | echo "Activate your environment using: source graviton_env/bin/activate" 105 | echo "=======================================================================" 106 | -------------------------------------------------------------------------------- /setup_pi5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | ############################################################################# 6 | # STEP 1: Update and (optionally) upgrade the system 7 | ############################################################################# 8 | echo "=======================================================================" 9 | echo " 1.
Update and upgrade the system" 10 | echo "=======================================================================" 11 | sudo apt update 12 | # sudo apt upgrade -y 13 | 14 | ############################################################################# 15 | # STEP 2: Install essential development packages 16 | ############################################################################# 17 | echo "=======================================================================" 18 | echo " 2. Install essential development packages" 19 | echo "=======================================================================" 20 | sudo apt install -y \ 21 | wget build-essential libssl-dev libbz2-dev libreadline-dev libsqlite3-dev \ 22 | zlib1g-dev libncurses5-dev libncursesw5-dev libffi-dev libgdbm-dev \ 23 | liblzma-dev uuid-dev tk-dev linux-perf cmake 24 | 25 | ############################################################################# 26 | # STEP 3: Verify perf installation 27 | ############################################################################# 28 | echo "=======================================================================" 29 | echo " 3. Verify perf installation" 30 | echo "=======================================================================" 31 | if command -v perf >/dev/null 2>&1; then 32 | echo "perf installed successfully." 33 | else 34 | echo "Error: perf installation failed." 35 | exit 1 36 | fi 37 | 38 | ############################################################################# 39 | # STEP 4: Check and install Python 3.12 if necessary 40 | ############################################################################# 41 | echo "=======================================================================" 42 | echo " 4. Check and install Python 3.12 if necessary" 43 | echo "=======================================================================" 44 | if ! python3.12 --version >/dev/null 2>&1; then 45 | echo "Downloading and building Python 3.12..." 
46 | cd /tmp 47 | if [ ! -f /tmp/Python-3.12.0.tgz ]; then 48 | wget https://www.python.org/ftp/python/3.12.0/Python-3.12.0.tgz 49 | else 50 | echo "Python 3.12 source archive already exists. Skipping download." 51 | fi 52 | 53 | tar -xf Python-3.12.0.tgz 54 | cd Python-3.12.0 55 | ./configure --enable-optimizations 56 | sudo make altinstall 57 | else 58 | echo "Python 3.12 is already installed. Skipping build." 59 | fi 60 | 61 | ############################################################################# 62 | # STEP 5: Create Python virtual environment 63 | ############################################################################# 64 | echo "=======================================================================" 65 | echo " 5. Create Python virtual environment" 66 | echo "=======================================================================" 67 | if [ ! -d pi5_env ]; then 68 | echo "Creating Python virtual environment..." 69 | python3.12 -m venv pi5_env 70 | else 71 | echo "Virtual environment already exists. Skipping creation." 72 | fi 73 | 74 | ############################################################################# 75 | # STEP 6: Activate the virtual environment and install packages 76 | ############################################################################# 77 | echo "=======================================================================" 78 | echo " 6. Activate the virtual environment and install packages" 79 | echo "=======================================================================" 80 | # shellcheck disable=SC1091 81 | source pi5_env/bin/activate 82 | 83 | echo "=======================================================================" 84 | echo " 7. Upgrade pip" 85 | echo "=======================================================================" 86 | python3.12 -m pip install --upgrade pip 87 | 88 | echo "=======================================================================" 89 | echo " 8. 
Install required Python packages" 90 | echo "=======================================================================" 91 | python3.12 -m pip install --upgrade --trusted-host archive1.piwheels.org \ 92 | numpy \ 93 | matplotlib \ 94 | pandas \ 95 | torch \ 96 | transformers==4.53.3 \ 97 | jupyterlab \ 98 | ipykernel \ 99 | ipywidgets \ 100 | seaborn \ 101 | sentencepiece \ 102 | mistral-common 103 | 104 | ############################################################################# 105 | echo "=======================================================================" 106 | echo "Setup script completed successfully!" 107 | echo "Activate your environment using: source pi5_env/bin/activate" 108 | echo "=======================================================================" 109 | -------------------------------------------------------------------------------- /slides/chapter1.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/slides/chapter1.pptx -------------------------------------------------------------------------------- /slides/chapter2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/slides/chapter2.pptx -------------------------------------------------------------------------------- /slides/chapter3.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/slides/chapter3.pptx -------------------------------------------------------------------------------- /slides/chapter4.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arm-university/AI-on-Arm/8f0b6ebd5520283ad6d9883a3baa0756283ce867/slides/chapter4.pptx -------------------------------------------------------------------------------- /src/c/benchmark_fp32_neon.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <arm_neon.h> 5 | #include <sys/time.h> 6 | #include "kernels/fp32_neon.c" 7 | #include "sizes.c" 8 | 9 | // Function to get time in seconds with high resolution 10 | double get_time() { 11 | struct timeval tv; 12 |
gettimeofday(&tv, NULL); 13 | return tv.tv_sec + tv.tv_usec / 1e6; 14 | } 15 | 16 | int main() { 17 | int num_sizes = sizeof(sizes) / sizeof(sizes[0]); 18 | 19 | FILE* fp; 20 | 21 | fp = fopen("results/fp32_neon_latency_results.csv", "w"); 22 | if (fp == NULL) { 23 | printf("Error opening file for writing\n"); 24 | return 1; 25 | } 26 | fprintf(fp, "Matrix Size,Latency (seconds)\n"); 27 | 28 | for (int s = 0; s < num_sizes; s++) { 29 | int N = sizes[s]; 30 | float* A; 31 | float* B; 32 | float* C; 33 | 34 | posix_memalign((void**)&A, 16, N * N * sizeof(float)); 35 | posix_memalign((void**)&B, 16, N * N * sizeof(float)); 36 | posix_memalign((void**)&C, 16, N * N * sizeof(float)); 37 | 38 | // Initialize matrices with some values 39 | for (int i = 0; i < N * N; i++) { 40 | A[i] = 1.0f; 41 | B[i] = 1.0f; 42 | } 43 | 44 | // Warm-up iterations 45 | for (int warmup = 0; warmup < 3; warmup++) { 46 | matmul_fp32_neon(A, B, C, N); 47 | } 48 | 49 | double start = get_time(); 50 | matmul_fp32_neon(A, B, C, N); 51 | double end = get_time(); 52 | 53 | double time_taken = end - start; 54 | printf("FP32 NEON Matrix Multiplication (Size %d): %f seconds\n", N, time_taken); 55 | fprintf(fp, "%d,%f\n", N, time_taken); 56 | 57 | free(A); 58 | free(B); 59 | free(C); 60 | } 61 | fclose(fp); 62 | return 0; 63 | } -------------------------------------------------------------------------------- /src/c/benchmark_int8_neon.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdint.h> 4 | #include <arm_neon.h> 5 | #include <sys/time.h> 6 | #include "kernels/int8_neon.c" 7 | #include "sizes.c" 8 | 9 | // Function to get time in seconds with high resolution 10 | double get_time() { 11 | struct timeval tv; 12 | gettimeofday(&tv, NULL); 13 | return tv.tv_sec + tv.tv_usec / 1e6; 14 | } 15 | 16 | int main() { 17 | int num_sizes = sizeof(sizes) / sizeof(sizes[0]); 18 | 19 | FILE* fp; 20 | 21 | fp = fopen("results/int8_neon_latency_results.csv", "w"); 22 | if (fp
== NULL) { 23 | printf("Error opening file for writing\n"); 24 | return 1; 25 | } 26 | fprintf(fp, "Matrix Size,Latency (seconds)\n"); 27 | 28 | for (int s = 0; s < num_sizes; s++) { 29 | int N = sizes[s]; 30 | int8_t* A; 31 | int8_t* B; 32 | int32_t* C; 33 | 34 | posix_memalign((void**)&A, 16, N * N * sizeof(int8_t)); 35 | posix_memalign((void**)&B, 16, N * N * sizeof(int8_t)); 36 | posix_memalign((void**)&C, 16, N * N * sizeof(int32_t)); 37 | 38 | // Initialize matrices with some values 39 | for (int i = 0; i < N * N; i++) { 40 | A[i] = 1; 41 | B[i] = 1; 42 | } 43 | 44 | // Warm-up iterations 45 | for (int warmup = 0; warmup < 3; warmup++) { 46 | matmul_int8_neon(A, B, C, N); 47 | } 48 | 49 | double start = get_time(); 50 | matmul_int8_neon(A, B, C, N); 51 | double end = get_time(); 52 | 53 | double time_taken = end - start; 54 | printf("Int8 Neon Matrix Multiplication (Size %d): %f seconds\n", N, time_taken); 55 | fprintf(fp, "%d,%f\n", N, time_taken); 56 | 57 | free(A); 58 | free(B); 59 | free(C); 60 | } 61 | fclose(fp); 62 | return 0; 63 | } -------------------------------------------------------------------------------- /src/c/benchmark_naive.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <time.h> 5 | #include <sys/time.h> 6 | #include "kernels/naive.c" 7 | #include "sizes.c" 8 | 9 | // Function to get time in seconds with high resolution 10 | double get_time() { 11 | struct timeval tv; 12 | gettimeofday(&tv, NULL); 13 | return tv.tv_sec + tv.tv_usec / 1e6; 14 | } 15 | 16 | int main() { 17 | int num_sizes = sizeof(sizes) / sizeof(sizes[0]); 18 | 19 | FILE* fp; 20 | 21 | // ============================= BENCHMARKING NAIVE =============================== 22 | fp = fopen("results/naive_latency_results.csv", "w"); 23 | if (fp == NULL) { 24 | printf("Error opening file for writing\n"); 25 | return 1; 26 | } 27 | fprintf(fp, "Matrix Size,Latency (seconds)\n"); 28 | 29 | for (int s = 0; s <
num_sizes; s++) { 30 | int N = sizes[s]; 31 | float* A; 32 | float* B; 33 | float* C; 34 | 35 | posix_memalign((void**)&A, 16, N * N * sizeof(float)); 36 | posix_memalign((void**)&B, 16, N * N * sizeof(float)); 37 | posix_memalign((void**)&C, 16, N * N * sizeof(float)); 38 | 39 | // Initialize matrices with some values 40 | for (int i = 0; i < N * N; i++) { 41 | A[i] = 1.0f; 42 | B[i] = 1.0f; 43 | } 44 | 45 | // Warm-up iterations 46 | for (int warmup = 0; warmup < 3; warmup++) { 47 | matrix_multiply_naive(A, B, C, N); 48 | } 49 | 50 | double start = get_time(); 51 | matrix_multiply_naive(A, B, C, N); 52 | double end = get_time(); 53 | 54 | double time_taken = end - start; 55 | printf("Naive Matrix Multiplication (Size %d): %f seconds\n", N, time_taken); 56 | fprintf(fp, "%d,%f\n", N, time_taken); 57 | 58 | free(A); 59 | free(B); 60 | free(C); 61 | } 62 | fclose(fp); 63 | return 0; 64 | } -------------------------------------------------------------------------------- /src/cpp/blas/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Minimum CMake version required 2 | cmake_minimum_required(VERSION 3.10) 3 | 4 | # Project name and version 5 | project(BlasScalingBenchmark VERSION 1.0 LANGUAGES CXX) 6 | 7 | # Set C++ standard 8 | set(CMAKE_CXX_STANDARD 17) 9 | set(CMAKE_CXX_STANDARD_REQUIRED True) 10 | 11 | # Specify the executables 12 | add_executable(blas_scaling_benchmark benchmark_f32.cpp) 13 | add_executable(blas_f32_inf f32_inf.cpp) 14 | 15 | # Find and link BLAS library 16 | find_package(BLAS REQUIRED) 17 | 18 | if (BLAS_FOUND) 19 | message(STATUS "BLAS library found: ${BLAS_LIBRARIES}") 20 | target_link_libraries(blas_scaling_benchmark PUBLIC ${BLAS_LIBRARIES}) 21 | target_link_libraries(blas_f32_inf PUBLIC ${BLAS_LIBRARIES}) 22 | else() 23 | message(FATAL_ERROR "BLAS library not found!") 24 | endif() 25 | 26 | # Include BLAS directories if required 27 | if (BLAS_INCLUDE_DIRS) 28 | message(STATUS "Using BLAS 
include directories: ${BLAS_INCLUDE_DIRS}") 29 | target_include_directories(blas_scaling_benchmark PUBLIC ${BLAS_INCLUDE_DIRS}) 30 | target_include_directories(blas_f32_inf PUBLIC ${BLAS_INCLUDE_DIRS}) 31 | endif() 32 | 33 | # Add architecture-specific compiler flags (NEON or SVE) 34 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Check if ARM architecture 35 | message(STATUS "ARM architecture detected; adding NEON/SVE flags") 36 | target_compile_options(blas_scaling_benchmark PRIVATE -march=armv8.4-a+sve -mtune=neoverse-v1) 37 | target_compile_options(blas_f32_inf PRIVATE -march=armv8.4-a+sve -mtune=neoverse-v1) 38 | else() 39 | message(WARNING "Non-ARM architecture detected; skipping NEON/SVE flags") 40 | endif() 41 | 42 | # Optional: Specify installation rules (if needed) 43 | install(TARGETS blas_scaling_benchmark blas_f32_inf DESTINATION bin) 44 | -------------------------------------------------------------------------------- /src/cpp/blas/benchmark_f32.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <vector> 4 | #include <algorithm> 5 | #include <chrono> 6 | #include <cstdlib> 7 | #include <cblas.h> 8 | #include "../common/sizes.cpp" 9 | 10 | int main() { 11 | // Open CSV file for writing 12 | std::ofstream csv_file("../../results/blas_f32_scaling_results.csv"); 13 | // Write header 14 | csv_file << "Size,Latency(us)\n"; 15 | 16 | 17 | for (int size : sizes) { 18 | // Allocate memory for matrices X, W, and Y 19 | std::vector<float> X(size * size); 20 | std::vector<float> W(size * size); 21 | std::vector<float> Y(size * size, 0.0f); // Initialize with zeros 22 | 23 | // Populate X and W with random values 24 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 25 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 26 | 27 | const int M = size; 28 | const int N = size; 29 | const int K = size; 30 | 31 | const float alpha = 1.0f; // Scaling factor for A*B 32 | const float beta = 0.0f; //
Scaling factor for C 33 | 34 | // Warmup: Run a single matmul to ensure everything is loaded into memory 35 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 36 | M, N, K, alpha, 37 | X.data(), K, 38 | W.data(), N, 39 | beta, 40 | Y.data(), N); 41 | 42 | // Measure time for matrix multiplication 43 | float* X_ptr = X.data(); 44 | float* W_ptr = W.data(); 45 | float* Y_ptr = Y.data(); 46 | auto start = std::chrono::high_resolution_clock::now(); 47 | 48 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 49 | M, N, K, alpha, 50 | X_ptr, K, 51 | W_ptr, N, 52 | beta, 53 | Y_ptr, N); 54 | 55 | auto end = std::chrono::high_resolution_clock::now(); 56 | double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 57 | 58 | // Print and log the results 59 | //std::cout << "Size: " << size << ", Time taken: " << duration << " microseconds" << std::endl; 60 | csv_file << size << "," << duration << "\n"; 61 | } 62 | 63 | csv_file.close(); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /src/cpp/blas/f32_inf.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <vector> 4 | #include <algorithm> 5 | #include <chrono> 6 | #include <cstdlib> 7 | #include <cblas.h> 8 | #include "../common/sizes.cpp" 9 | 10 | int main() { 11 | // Open CSV file for writing 12 | std::ofstream csv_file("../../results/blas_f32_scaling_results.csv"); 13 | // Write header 14 | csv_file << "Size,Latency(us)\n"; 15 | 16 | 17 | for (int size : sizes) { 18 | // Allocate memory for matrices X, W, and Y 19 | std::vector<float> X(size * size); 20 | std::vector<float> W(size * size); 21 | std::vector<float> Y(size * size, 0.0f); // Initialize with zeros 22 | 23 | // Populate X and W with random values 24 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 25 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 26 | 27 | const int M = size; 28 | const int N = size; 29 |
const int K = size; 30 | 31 | const float alpha = 1.0f; // Scaling factor for A*B 32 | const float beta = 0.0f; // Scaling factor for C 33 | 34 | // Warmup: Run a single matmul to ensure everything is loaded into memory 35 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 36 | M, N, K, alpha, 37 | X.data(), K, 38 | W.data(), N, 39 | beta, 40 | Y.data(), N); 41 | 42 | // Measure time for matrix multiplication 43 | float* X_ptr = X.data(); 44 | float* W_ptr = W.data(); 45 | float* Y_ptr = Y.data(); 46 | auto start = std::chrono::high_resolution_clock::now(); 47 | 48 | while (true) { 49 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 50 | M, N, K, alpha, 51 | X_ptr, K, 52 | W_ptr, N, 53 | beta, 54 | Y_ptr, N); 55 | } 56 | 57 | auto end = std::chrono::high_resolution_clock::now(); 58 | double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 59 | 60 | // Print and log the results 61 | std::cout << "Size: " << size << ", Time taken: " << duration << " microseconds" << std::endl; 62 | csv_file << size << "," << duration << "\n"; 63 | } 64 | 65 | csv_file.close(); 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /src/cpp/common/sizes.cpp: -------------------------------------------------------------------------------- 1 | 2 | int sizes[] = {32, 64, 128, 256, 512, 1024, 2048}; 3 | -------------------------------------------------------------------------------- /src/cpp/f16_f16_f16p/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(benchmark_f16) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(KLEIDIAI_PATH ../../../kleidiai/) 7 | set(MATMUL_PACK_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/) 8 | set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f16_f16_f16p/) 9 | 10 | # KleidiAI include directories 11 | include_directories( 12 |
${KLEIDIAI_PATH} 13 | ${MATMUL_PACK_PATH} 14 | ${MATMUL_PATH} 15 | ) 16 | 17 | # Files required to build the executable 18 | add_executable(benchmark_f16 19 | benchmark_f16.cpp 20 | kernel.cpp 21 | ${MATMUL_PATH}/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c 22 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c 23 | ) 24 | 25 | # Files required to build the executable 26 | add_executable(benchmark_f16_scaling 27 | benchmark_f16_scaling.cpp 28 | kernel.cpp 29 | ${MATMUL_PATH}/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c 30 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c 31 | ) 32 | 33 | add_executable(kai_f16_inf 34 | kai_f16_inf.cpp 35 | kernel.cpp 36 | ${MATMUL_PATH}/kai_matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla.c 37 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.c 38 | ) 39 | target_compile_options(benchmark_f16 40 | PRIVATE -march=armv8.4-a+fp16+sve -mtune=neoverse-v1 41 | ) 42 | 43 | target_compile_options(benchmark_f16_scaling 44 | PRIVATE -march=armv8.4-a+fp16+sve -mtune=neoverse-v1 45 | ) 46 | 47 | target_compile_options(kai_f16_inf 48 | PRIVATE -march=armv8.4-a+fp16+sve -mtune=neoverse-v1 49 | ) 50 | 51 | target_compile_definitions(benchmark_f16 52 | PRIVATE $<$<CONFIG:Debug>:KAI_DEBUG> 53 | ) 54 | -------------------------------------------------------------------------------- /src/cpp/f16_f16_f16p/benchmark_f16.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cfloat> 3 | #include "kernel.cpp" 4 | #include "kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" 5 | 6 | #include <cstdio> 7 | #include <cstdlib> 8 | #include <cstring> 9 | #include <vector> 10 | #include <chrono> 11 | #include <algorithm> 12 | #include <arm_fp16.h> 13 | 14 | 15 | void loadMatrix(const char* filename, float16_t* matrix, size_t rows, size_t cols) { 16 | FILE* file = fopen(filename, "rb"); 17 | if (file == NULL) { 18 | fprintf(stderr, "Error: Could not open file %s\n", filename); 19 | return; 20 | } 21 | 22 | // 
Temporary buffer for float32 values 23 | std::vector<float> temp_buffer(rows * cols); 24 | 25 | size_t elements_read = fread(temp_buffer.data(), sizeof(float), rows * cols, file); 26 | if (elements_read != rows * cols) { 27 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 28 | } 29 | fclose(file); 30 | 31 | // Convert float32 to float16 32 | for (size_t i = 0; i < rows * cols; i++) { 33 | matrix[i] = float16_t(temp_buffer[i]); 34 | } 35 | } 36 | 37 | 38 | 39 | int main() { 40 | // Declare matrix dimensions 41 | const size_t activation_rows = 6, activation_cols = 1280; 42 | const size_t weight_rows = 1280, weight_cols = 32000; 43 | 44 | std::vector<float16_t> X(activation_rows * activation_cols); 45 | std::vector<float16_t> W(weight_rows * weight_cols); 46 | std::vector<float16_t> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 47 | 48 | size_t M = activation_rows; 49 | size_t N = weight_cols; 50 | size_t K = activation_cols; 51 | 52 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 53 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 54 | 55 | float16_t* lhs = X.data(); 56 | float16_t* rhs = W.data(); 57 | 58 | const size_t nr = ukernel.get_nr(); 59 | const size_t kr = ukernel.get_kr(); 60 | const size_t sr = ukernel.get_sr(); 61 | 62 | // In a single row, we pack nr bias values followed by K rows of nr RHS values 63 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(N, K); 64 | const size_t rhs_packed_cols = nr + K * nr; 65 | const size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float16_t)); 66 | 67 | float16_t* rhs_packed = new float16_t[rhs_packed_size]; 68 | 69 | const size_t lhs_stride = K * sizeof(float16_t); 70 | const size_t rhs_stride = N * sizeof(float16_t); 71 | const size_t dst_stride_row = N * sizeof(float16_t); 72 | const size_t dst_stride_col = sizeof(float16_t); 73 | //float* bias = new float[N]; 74 | float16_t*
bias = new float16_t[N]; 75 | std::fill_n(bias, N, 0.0f); 76 | float16_t* dst = Y.data(); 77 | 78 | 79 | kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon( 80 | 1, N, K, nr, kr, sr, // Packing arguments 81 | rhs_stride, // RHS stride 82 | rhs, // RHS 83 | bias, // Bias 84 | NULL, // Scale 85 | rhs_packed, // RHS packed 86 | 0, NULL); 87 | 88 | auto start = std::chrono::high_resolution_clock::now(); 89 | ukernel.run_matmul( 90 | M, N, K, // Dimensions 91 | lhs, // LHS 92 | lhs_stride, // LHS stride 93 | rhs_packed, // RHS packed 94 | dst, // DST 95 | dst_stride_row, // DST stride (row) 96 | dst_stride_col, // DST stride (col) 97 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 98 | ); 99 | 100 | auto end = std::chrono::high_resolution_clock::now(); 101 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 102 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 103 | 104 | return 0; 105 | } -------------------------------------------------------------------------------- /src/cpp/f16_f16_f16p/benchmark_f16_scaling.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <cfloat> 4 | #include "../common/sizes.cpp" 5 | #include "kernel.cpp" 6 | #include "kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" 7 | 8 | #include <cstdio> 9 | #include <cstdlib> 10 | #include <cstring> 11 | #include <vector> 12 | #include <chrono> 13 | #include <algorithm> 14 | #include <arm_fp16.h> 15 | 16 | 17 | 18 | 19 | int main() { 20 | // Open CSV file for writing 21 | std::ofstream csv_file("../../results/f16_scaling_results.csv"); 22 | // Write header 23 | csv_file << "Size,Latency(us)\n"; 24 | 25 | for (int size: sizes) { 26 | std::vector<float16_t> X(size * size); 27 | std::vector<float16_t> W(size * size); 28 | std::vector<float16_t> Y(size * size, 0.0f); // Initialize with zeros 29 | 30 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 31 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 32 | 33 | size_t M = size; 
34 | size_t N = size; 35 | size_t K = size; 36 | 37 | float16_t* lhs = X.data(); 38 | float16_t* rhs = W.data(); 39 | 40 | const size_t nr = ukernel.get_nr(); 41 | const size_t kr = ukernel.get_kr(); 42 | const size_t sr = ukernel.get_sr(); 43 | 44 | // In a single row, we pack nr bias values followed by K rows of nr RHS values 45 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(N, K); 46 | const size_t rhs_packed_cols = nr + K * nr; 47 | const size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float16_t)); 48 | 49 | float16_t* rhs_packed = new float16_t[rhs_packed_size]; 50 | 51 | const size_t lhs_stride = K * sizeof(float16_t); 52 | const size_t rhs_stride = N * sizeof(float16_t); 53 | const size_t dst_stride_row = N * sizeof(float16_t); 54 | const size_t dst_stride_col = sizeof(float16_t); 55 | //float* bias = new float[N]; 56 | float16_t* bias = new float16_t[N]; 57 | std::fill_n(bias, N, 0.0f); 58 | kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon( 59 | 1, N, K, nr, kr, sr, // Packing arguments 60 | rhs_stride, // RHS stride 61 | rhs, // RHS 62 | bias, // Bias 63 | NULL, // Scale 64 | rhs_packed, // RHS packed 65 | 0, NULL); 66 | 67 | float16_t* dst = Y.data(); 68 | auto start = std::chrono::high_resolution_clock::now(); 69 | 70 | ukernel.run_matmul( 71 | M, N, K, // Dimensions 72 | lhs, // LHS 73 | lhs_stride, // LHS stride 74 | rhs_packed, // RHS packed 75 | dst, // DST 76 | dst_stride_row, // DST stride (row) 77 | dst_stride_col, // DST stride (col) 78 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 79 | ); 80 | 81 | 82 | auto end = std::chrono::high_resolution_clock::now(); 83 | double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 84 | //std::cout << "Time taken: " << duration << " microseconds" << std::endl; 85 | csv_file << size << "," << duration << "\n"; 86 | 87 | // Free per-iteration allocations before the next size 88 | delete[] rhs_packed; 89 | delete[] bias; 90 | } 91 | 92 | csv_file.close(); 93 | return 0; 94 | } 
-------------------------------------------------------------------------------- /src/cpp/f16_f16_f16p/kai_f16_inf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "kernel.cpp" 4 | #include "kai_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | void loadMatrix(const char* filename, float16_t* matrix, size_t rows, size_t cols) { 16 | FILE* file = fopen(filename, "rb"); 17 | if (file == NULL) { 18 | fprintf(stderr, "Error: Could not open file %s\n", filename); 19 | return; 20 | } 21 | 22 | // Temporary buffer for float32 values 23 | std::vector temp_buffer(rows * cols); 24 | 25 | size_t elements_read = fread(temp_buffer.data(), sizeof(float), rows * cols, file); 26 | if (elements_read != rows * cols) { 27 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 28 | } 29 | fclose(file); 30 | 31 | // Convert float32 to float16 32 | for (size_t i = 0; i < rows * cols; i++) { 33 | matrix[i] = float16_t(temp_buffer[i]); 34 | } 35 | } 36 | 37 | 38 | 39 | int main() { 40 | // Declare matrix dimensions 41 | const size_t activation_rows = 6, activation_cols = 1280; 42 | const size_t weight_rows = 1280, weight_cols = 32000; 43 | 44 | std::vector X(activation_rows * activation_cols); 45 | std::vector W(weight_rows * weight_cols); 46 | std::vector Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 47 | 48 | size_t M = activation_rows; 49 | size_t N = weight_cols; 50 | size_t K = activation_cols; 51 | 52 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 53 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 54 | 55 | float16_t* lhs = X.data(); 56 | float16_t* rhs = W.data(); 57 | 58 | const size_t nr = ukernel.get_nr(); 59 | const size_t kr = ukernel.get_kr(); 60 | const size_t sr = ukernel.get_sr(); 61 
| 62 | // In a single row, we pack nr bias values followed by K rows of nr RHS values 63 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon(N, K); 64 | const size_t rhs_packed_cols = nr + K * nr; 65 | const size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float16_t)); 66 | 67 | float16_t* rhs_packed = new float16_t[rhs_packed_size]; 68 | 69 | const size_t lhs_stride = K * sizeof(float16_t); 70 | const size_t rhs_stride = N * sizeof(float16_t); 71 | const size_t dst_stride_row = N * sizeof(float16_t); 72 | const size_t dst_stride_col = sizeof(float16_t); 73 | //float* bias = new float[N]; 74 | float16_t* bias = new float16_t[N]; 75 | std::fill_n(bias, N, 0.0f); 76 | float16_t* dst = Y.data(); 77 | auto start = std::chrono::high_resolution_clock::now(); 78 | 79 | kai_run_rhs_pack_kxn_f16p16x1biasf16_f16_f16_neon( 80 | 1, N, K, nr, kr, sr, // Packing arguments 81 | rhs_stride, // RHS stride 82 | rhs, // RHS 83 | bias, // Bias 84 | NULL, // Scale 85 | rhs_packed, // RHS packed 86 | 0, NULL); 87 | 88 | while (true) { 89 | ukernel.run_matmul( 90 | M, N, K, // Dimensions 91 | lhs, // LHS 92 | lhs_stride, // LHS stride 93 | rhs_packed, // RHS packed 94 | dst, // DST 95 | dst_stride_row, // DST stride (row) 96 | dst_stride_col, // DST stride (col) 97 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 98 | ); 99 | } 100 | auto end = std::chrono::high_resolution_clock::now(); 101 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 102 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 103 | 104 | return 0; 105 | } -------------------------------------------------------------------------------- /src/cpp/f32_f32_f32p/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(benchmark_f32) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | 7 | 
set(KLEIDIAI_PATH ../../../kleidiai/) 8 | set(MATMUL_PACK_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/) 9 | set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/) 10 | 11 | # Enable Assembly language 12 | enable_language(ASM) 13 | 14 | # KleidiAI include directories 15 | include_directories( 16 | ${KLEIDIAI_PATH} 17 | ${MATMUL_PACK_PATH} 18 | ${MATMUL_PATH} 19 | ) 20 | 21 | # Assembly file path 22 | set(ASM_FILE ${MATMUL_PATH}/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla_asm.S) 23 | 24 | # Files required to build the executables 25 | add_executable(benchmark_f32 26 | benchmark_f32.cpp 27 | kernel.cpp 28 | ${MATMUL_PATH}/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c 29 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c 30 | ${ASM_FILE} # Add Assembly file 31 | ) 32 | 33 | add_executable(benchmark_f32_scaling 34 | benchmark_f32_scaling.cpp 35 | kernel.cpp 36 | ${MATMUL_PATH}/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c 37 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c 38 | ${ASM_FILE} # Add Assembly file 39 | ) 40 | 41 | add_executable(kai_f32_inf 42 | kai_f32_inf.cpp 43 | kernel.cpp 44 | ${MATMUL_PATH}/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.c 45 | ${MATMUL_PACK_PATH}/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.c 46 | ${ASM_FILE} # Add Assembly file 47 | ) 48 | 49 | # Compiler flags for ARMv8.4-A and Neoverse V1 50 | set(ARM_FLAGS "-march=armv8.4-a" "-mtune=neoverse-v1") 51 | 52 | target_compile_options(benchmark_f32 PRIVATE ${ARM_FLAGS}) 53 | target_compile_options(benchmark_f32_scaling PRIVATE ${ARM_FLAGS}) 54 | target_compile_options(kai_f32_inf PRIVATE ${ARM_FLAGS}) 55 | 56 | # Set correct properties for the assembly file 57 | set_source_files_properties(${ASM_FILE} PROPERTIES LANGUAGE ASM) 58 | 59 | target_compile_definitions(benchmark_f32 60 | PRIVATE $<$<CONFIG:Debug>:KAI_DEBUG> 61 | ) 62 | 
-------------------------------------------------------------------------------- /src/cpp/f32_f32_f32p/benchmark_f32.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "kernel.cpp" 4 | #include "kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 16 | FILE* file = fopen(filename, "rb"); 17 | if (file == NULL) { 18 | fprintf(stderr, "Error: Could not open file %s\n", filename); 19 | return; 20 | } 21 | 22 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 23 | if (elements_read != rows * cols) { 24 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 25 | } 26 | fclose(file); 27 | } 28 | 29 | 30 | 31 | int main() { 32 | // Declare matrix dimensions 33 | const size_t activation_rows = 6, activation_cols = 1280; 34 | const size_t weight_rows = 1280, weight_cols = 32000; 35 | 36 | std::vector X(activation_rows * activation_cols); 37 | std::vector W(weight_rows * weight_cols); 38 | std::vector Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 39 | 40 | size_t M = activation_rows; 41 | size_t N = weight_cols; 42 | size_t K = activation_cols; 43 | 44 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 45 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 46 | 47 | float* lhs = X.data(); 48 | float* rhs = W.data(); 49 | 50 | const size_t nr = ukernel.get_nr(); 51 | const size_t kr = ukernel.get_kr(); 52 | const size_t sr = ukernel.get_sr(); 53 | 54 | // In a single row, we pack nr bias values followed by K rows of nr RHS values 55 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K); 56 | const size_t rhs_packed_cols = nr + K * nr; 57 | const 
size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float)); 58 | 59 | float* rhs_packed = new float[rhs_packed_size]; 60 | 61 | const size_t lhs_stride = K * sizeof(float); 62 | const size_t rhs_stride = N * sizeof(float); 63 | const size_t dst_stride_row = N * sizeof(float); 64 | const size_t dst_stride_col = sizeof(float); 65 | //float* bias = new float[N]; 66 | float* bias = new float[N]; 67 | std::fill_n(bias, N, 0.0f); 68 | kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon( 69 | 1, N, K, nr, kr, sr, // Packing arguments 70 | rhs_stride, // RHS stride 71 | rhs, // RHS 72 | bias, // Bias 73 | NULL, // Scale 74 | rhs_packed, // RHS packed 75 | 0, NULL); 76 | 77 | float* dst = Y.data(); 78 | auto start = std::chrono::high_resolution_clock::now(); 79 | 80 | ukernel.run_matmul( 81 | M, N, K, // Dimensions 82 | lhs, // LHS 83 | lhs_stride, // LHS stride 84 | rhs_packed, // RHS packed 85 | dst, // DST 86 | dst_stride_row, // DST stride (row) 87 | dst_stride_col, // DST stride (col) 88 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 89 | ); 90 | 91 | 92 | auto end = std::chrono::high_resolution_clock::now(); 93 | double duration = std::chrono::duration_cast(end - start).count(); 94 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 95 | 96 | return 0; 97 | } -------------------------------------------------------------------------------- /src/cpp/f32_f32_f32p/benchmark_f32_scaling.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "kernel.cpp" 10 | #include "kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h" 11 | #include "../common/sizes.cpp" 12 | 13 | 14 | int main() { 15 | 16 | // Open CSV file for writing 17 | std::ofstream csv_file("../../results/f32_scaling_results.csv"); 18 | // Write header 19 | csv_file << "Size,Latency(us)\n"; 20 | 21 | for (int size: sizes) { 
22 | std::vector<float> X(size * size); 23 | std::vector<float> W(size * size); 24 | std::vector<float> Y(size * size, 0.0f); // Initialize with zeros 25 | 26 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 27 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 28 | 29 | size_t M = size; 30 | size_t N = size; 31 | size_t K = size; 32 | 33 | float* lhs = X.data(); 34 | float* rhs = W.data(); 35 | 36 | const size_t nr = ukernel.get_nr(); 37 | const size_t kr = ukernel.get_kr(); 38 | const size_t sr = ukernel.get_sr(); 39 | 40 | // In a single row, we pack nr bias values followed by K rows of nr RHS values 41 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K); 42 | const size_t rhs_packed_cols = nr + K * nr; 43 | const size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float)); 44 | 45 | float* rhs_packed = new float[rhs_packed_size]; 46 | 47 | const size_t lhs_stride = K * sizeof(float); 48 | const size_t rhs_stride = N * sizeof(float); 49 | const size_t dst_stride_row = N * sizeof(float); 50 | const size_t dst_stride_col = sizeof(float); 51 | //float* bias = new float[N]; 52 | float* bias = new float[N]; 53 | std::fill_n(bias, N, 0.0f); 54 | float* dst = Y.data(); 55 | kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon( 56 | 1, N, K, nr, kr, sr, // Packing arguments 57 | rhs_stride, // RHS stride 58 | rhs, // RHS 59 | bias, // Bias 60 | NULL, // Scale 61 | rhs_packed, // RHS packed 62 | 0, NULL); 63 | 64 | // Start timing after packing so only the matmul is measured, 65 | // matching the f16 scaling benchmark 66 | auto start = std::chrono::high_resolution_clock::now(); 67 | ukernel.run_matmul( 68 | M, N, K, // Dimensions 69 | lhs, // LHS 70 | lhs_stride, // LHS stride 71 | rhs_packed, // RHS packed 72 | dst, // DST 73 | dst_stride_row, // DST stride (row) 74 | dst_stride_col, // DST stride (col) 75 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 76 | ); 77 | auto end = std::chrono::high_resolution_clock::now(); 78 | double duration = 
std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 79 | //std::cout << "Time taken: " << duration << " microseconds" << std::endl; 80 | csv_file << size << "," << duration << "\n"; 81 | delete[] rhs_packed; 82 | delete[] bias; 83 | } 84 | csv_file.close(); 85 | return 0; 86 | } -------------------------------------------------------------------------------- /src/cpp/f32_f32_f32p/kai_f32_inf.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cfloat> 3 | #include "kernel.cpp" 4 | #include "kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h" 5 | 6 | #include <cstdio> 7 | #include <cstdlib> 8 | #include <cstring> 9 | #include <cmath> 10 | #include <vector> 11 | #include <chrono> 12 | #include <algorithm> 13 | 14 | 15 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 16 | FILE* file = fopen(filename, "rb"); 17 | if (file == NULL) { 18 | fprintf(stderr, "Error: Could not open file %s\n", filename); 19 | return; 20 | } 21 | 22 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 23 | if (elements_read != rows * cols) { 24 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 25 | } 26 | fclose(file); 27 | } 28 | 29 | 30 | 31 | int main() { 32 | // Declare matrix dimensions 33 | const size_t activation_rows = 6, activation_cols = 1280; 34 | const size_t weight_rows = 1280, weight_cols = 32000; 35 | 36 | std::vector<float> X(activation_rows * activation_cols); 37 | std::vector<float> W(weight_rows * weight_cols); 38 | std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 39 | 40 | size_t M = activation_rows; 41 | size_t N = weight_cols; 42 | size_t K = activation_cols; 43 | 44 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 45 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 46 | 47 | float* lhs = X.data(); 48 | float* rhs = W.data(); 49 | 50 | const size_t nr = ukernel.get_nr(); 51 | const size_t kr = ukernel.get_kr(); 52 | const size_t sr = ukernel.get_sr(); 53 | 54 | // In a single row, we pack nr bias
values followed by K rows of nr RHS values 55 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K); 56 | const size_t rhs_packed_cols = nr + K * nr; 57 | const size_t rhs_packed_rows = rhs_packed_size / (rhs_packed_cols * sizeof(float)); 58 | 59 | float* rhs_packed = new float[rhs_packed_size]; 60 | 61 | const size_t lhs_stride = K * sizeof(float); 62 | const size_t rhs_stride = N * sizeof(float); 63 | const size_t dst_stride_row = N * sizeof(float); 64 | const size_t dst_stride_col = sizeof(float); 65 | //float* bias = new float[N]; 66 | float* bias = new float[N]; 67 | std::fill_n(bias, N, 0.0f); 68 | kai_run_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon( 69 | 1, N, K, nr, kr, sr, // Packing arguments 70 | rhs_stride, // RHS stride 71 | rhs, // RHS 72 | bias, // Bias 73 | NULL, // Scale 74 | rhs_packed, // RHS packed 75 | 0, NULL); 76 | 77 | float* dst = Y.data(); 78 | auto start = std::chrono::high_resolution_clock::now(); 79 | while (true) { 80 | ukernel.run_matmul( 81 | M, N, K, // Dimensions 82 | lhs, // LHS 83 | lhs_stride, // LHS stride 84 | rhs_packed, // RHS packed 85 | dst, // DST 86 | dst_stride_row, // DST stride (row) 87 | dst_stride_col, // DST stride (col) 88 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 89 | ); 90 | } 91 | 92 | 93 | auto end = std::chrono::high_resolution_clock::now(); 94 | double duration = std::chrono::duration_cast(end - start).count(); 95 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 96 | 97 | return 0; 98 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_dotprod/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(benchmark_dotprod) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(KLEIDIAI_PATH ../../../kleidiai/) 7 | set(MATMUL_PACK_PATH 
${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/) 8 | set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/) 9 | 10 | # KleidiAI include directories 11 | include_directories( 12 | ${KLEIDIAI_PATH} 13 | ${MATMUL_PACK_PATH} 14 | ${MATMUL_PATH} 15 | ) 16 | 17 | 18 | 19 | add_executable(benchmark_dotprod 20 | benchmark_i8_dotprod.cpp 21 | kernel.cpp 22 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c 23 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 24 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 25 | ) 26 | 27 | add_executable(benchmark_dotprod_scaling 28 | benchmark_i8_dotprod_scaling.cpp 29 | kernel.cpp 30 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c 31 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 32 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 33 | ) 34 | 35 | add_executable(kai_i8_dotprod_inf 36 | kai_i8_dotprod_inf.cpp 37 | kernel.cpp 38 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c 39 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 40 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 41 | ) 42 | 43 | 44 | target_compile_options(benchmark_dotprod 45 | PRIVATE -march=armv8.4-a+sve+dotprod -mtune=neoverse-v1 46 | ) 47 | 48 | target_compile_options(benchmark_dotprod_scaling 49 | PRIVATE -march=armv8.4-a+sve+dotprod -mtune=neoverse-v1 50 | ) 51 | 52 | target_compile_options(kai_i8_dotprod_inf 53 | PRIVATE -march=armv8.4-a+sve+dotprod -mtune=neoverse-v1 54 | ) 55 | 56 | target_compile_definitions(benchmark_dotprod 57 | PRIVATE $<$:KAI_DEBUG> 58 | ) 59 | -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_dotprod/benchmark_i8_dotprod.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "kernel.cpp" 4 | #include 
"kai_lhs_quant_pack_qsi8d32p_f32.h" 5 | #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 17 | FILE* file = fopen(filename, "rb"); 18 | if (file == NULL) { 19 | fprintf(stderr, "Error: Could not open file %s\n", filename); 20 | return; 21 | } 22 | 23 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 24 | if (elements_read != rows * cols) { 25 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 26 | } 27 | fclose(file); 28 | } 29 | 30 | 31 | 32 | static inline size_t num_blocks_per_row(size_t k, size_t bl) { 33 | return k / bl; 34 | } 35 | 36 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 37 | return bl + sizeof(int16_t); 38 | } 39 | 40 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 41 | return (bl / 2) + sizeof(int16_t); 42 | } 43 | 44 | 45 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 46 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 47 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 48 | const size_t dst_stride = num_blocks_row * num_bytes_block; 49 | 50 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 51 | const float* src_ptr = rhs_f32 + row_idx * k; 52 | 53 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 54 | 55 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 56 | float amax = 0.0f; 57 | float max = 0.0f; 58 | 59 | for (size_t b = 0; b < bl; ++b) { 60 | const float src0_0 = src_ptr[block_idx * bl + b]; 61 | const float asrc0_0 = fabsf(src0_0); 62 | 63 | if (amax < asrc0_0) { 64 | amax = asrc0_0; 65 | max = src0_0; 66 | } 67 | } 68 | 69 | const float scale = max / -8.0; 70 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 71 | 72 | // Store the scale at the beginning of the block 73 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 74 | dst_ptr += sizeof(uint16_t); 75 | 76 | const size_t block_size = 32; 77 | const size_t num_subblocks = bl / 32; 78 | 79 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 80 | for (size_t i = 0; i < block_size / 2; ++i) { 81 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 82 | float v0_f32 = src_ptr[src_base_addr]; 83 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 84 | 85 | v0_f32 *= recip_scale; 86 | v1_f32 *= recip_scale; 87 | 88 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 89 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 90 | 91 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 92 | 93 | dst_ptr[0] = rhs_v0; 94 | dst_ptr += sizeof(uint8_t); 95 | } 96 | } 97 | } 98 | } 99 | }; 100 | 101 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 102 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 103 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 104 | const size_t dst_stride = num_blocks_row * num_bytes_block; 105 | 106 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 107 | const float* src_ptr = rhs_f32 + row_idx * k; 108 | 109 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 110 | 111 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 112 | float amax = 0.0f; 113 | 114 | for (size_t b = 0; b < bl; ++b) { 115 | const float src0_0 = src_ptr[block_idx * bl + b]; 116 | const float asrc0_0 = fabsf(src0_0); 117 | 118 | if (amax < asrc0_0) { 119 | amax = asrc0_0; 120 | } 121 | } 122 | 123 | const float scale = amax / ((1 << 7) - 1); 124 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 125 | 126 | // Store the scale at the beginning of the block 127 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 128 | dst_ptr += sizeof(uint16_t); 129 | 130 | const size_t block_size = 32; 131 | const size_t num_subblocks = bl / 32; 132 | 133 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 134 | for (size_t i = 0; i < block_size; ++i) { 135 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 136 | float v0_f32 = src_ptr[src_base_addr]; 137 | 138 | v0_f32 *= recip_scale; 139 | 140 | dst_ptr[0] = roundf(v0_f32); 141 | dst_ptr += sizeof(int8_t); 142 | } 143 | } 144 | } 145 | } 146 | }; 147 | 148 | 149 | static void ref_matmul_f32_qs8d32_qs4c32( 150 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 151 | float scalar_min, float scalar_max) { 152 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 153 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 154 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 155 | 156 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 157 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 158 | 159 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 160 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 161 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 162 | // Main f32 accumulator 163 | float main_acc = 0.0f; 164 | 165 | const size_t block_size = 32; 166 | const size_t num_subblocks = bl / 32; 167 | 168 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 169 | const int8_t* lhs_ptr = lhs_ptr_start; 170 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 171 | 172 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 173 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 174 | 175 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 176 | 
int32_t temp_acc = 0; 177 | 178 | // Get the LHS/RHS quantization scale stored at the 179 | // beginning of each block 180 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 181 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 182 | 183 | lhs_ptr += sizeof(uint16_t); 184 | rhs_ptr += sizeof(uint16_t); 185 | 186 | for (size_t i = 0; i < block_size / 2; ++i) { 187 | // Get the LHS values 188 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 189 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 190 | 191 | // Get the RHS values 192 | const uint8_t rhs_byte = rhs_ptr[0]; 193 | 194 | // Unpack the RHS values 195 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 196 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 197 | 198 | temp_acc += lhs_v0 * rhs_v0; 199 | temp_acc += lhs_v1 * rhs_v1; 200 | 201 | lhs_ptr += 1; 202 | rhs_ptr += 1; 203 | } 204 | 205 | main_acc += temp_acc * lhs_scale * rhs_scale; 206 | } 207 | } 208 | 209 | main_acc = std::max(main_acc, scalar_min); 210 | main_acc = std::min(main_acc, scalar_max); 211 | 212 | dst_f32[0] = main_acc; 213 | dst_f32 += 1; 214 | } 215 | } 216 | }; 217 | 218 | 219 | 220 | 221 | int main() { 222 | // Declare matrix dimensions 223 | const size_t activation_rows = 6, activation_cols = 1280; 224 | const size_t weight_rows = 1280, weight_cols = 32000; 225 | 226 | std::vector<float> X(activation_rows * activation_cols); 227 | std::vector<float> W(weight_rows * weight_cols); 228 | std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 229 | 230 | size_t M = activation_rows; 231 | size_t N = weight_cols; 232 | size_t K = activation_cols; 233 | 234 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 235 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 236 | 237 | float* lhs = X.data(); 238 | float* rhs = W.data(); 239 | 240 | const size_t mr = ukernel.get_mr(); 241 | const size_t nr = 
ukernel.get_nr(); 242 | const size_t kr = ukernel.get_kr(); 243 | const size_t sr = ukernel.get_sr(); 244 | 245 | const size_t bl = 32; // Block length. It must be 32 246 | const size_t m = activation_rows; 247 | const size_t n = weight_cols; 248 | const size_t k = activation_cols; 249 | const size_t seed_lhs = 4568; 250 | const size_t seed_rhs = seed_lhs + 4; 251 | 252 | const size_t num_blocks = k / bl; 253 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 254 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 255 | 256 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 257 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 258 | 259 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 260 | 261 | 262 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 263 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 264 | 265 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 266 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 267 | 268 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 269 | 270 | 271 | 272 | 273 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 274 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 275 | const size_t dst_size = ukernel.get_dst_size(m, n); 276 | 277 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 278 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 279 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 280 | 281 | struct kai_rhs_pack_qs4cxs1s0_param params; 282 | params.lhs_zero_point = 1; 283 | params.rhs_zero_point = 8; 284 | /* 285 | kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( 286 | 1, n, k, // Dimensions 287 | nr, kr, sr, // Packing arguments 288 | 
bl, // Block length 289 | (const uint8_t*)(rhs_native_mtx_qs4c32), // RHS 290 | NULL, // Bias 291 | rhs_packed_mtx_qs4c32, // RHS packed 292 | 0, &params); 293 | */ 294 | ref_matmul_f32_qs8d32_qs4c32( 295 | m, n, k, bl, (const int8_t*)lhs_ref_mtx_qa8d32, (const uint8_t*)rhs_native_mtx_qs4c32, (float*)dst_ref_mtx_f32, 296 | -FLT_MAX, FLT_MAX); 297 | 298 | // If the RHS matrix contains constant values, the packing can be performed 299 | // only once 300 | 301 | const size_t dst_stride = n * sizeof(float); 302 | const size_t lhs_offset = ukernel.get_lhs_packed_offset(0, k, bl); 303 | const size_t rhs_offset = ukernel.get_rhs_packed_offset(0, k, bl); 304 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 305 | 306 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 307 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 308 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 309 | auto start = std::chrono::high_resolution_clock::now(); 310 | ukernel.run_matmul( 311 | m, n, k, bl, // Dimensions 312 | lhs_ptr, // LHS packed 313 | rhs_ptr, // RHS packed 314 | dst_ptr, // DST 315 | dst_stride, // DST stride (row) 316 | sizeof(float), // DST stride (col) 317 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 318 | ); 319 | auto end = std::chrono::high_resolution_clock::now(); 320 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 321 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 322 | 323 | return 0; 324 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_dotprod/benchmark_i8_dotprod_scaling.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <chrono> 3 | #include "kernel.cpp" 4 | #include "../common/sizes.cpp" 5 | #include "kai_lhs_quant_pack_qsi8d32p_f32.h" 6 | #include 
"kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 18 | FILE* file = fopen(filename, "rb"); 19 | if (file == NULL) { 20 | fprintf(stderr, "Error: Could not open file %s\n", filename); 21 | return; 22 | } 23 | 24 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 25 | if (elements_read != rows * cols) { 26 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 27 | } 28 | fclose(file); 29 | } 30 | 31 | 32 | 33 | static inline size_t num_blocks_per_row(size_t k, size_t bl) { 34 | return k / bl; 35 | } 36 | 37 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 38 | return bl + sizeof(int16_t); 39 | } 40 | 41 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 42 | return (bl / 2) + sizeof(int16_t); 43 | } 44 | 45 | 46 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 47 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 48 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 49 | const size_t dst_stride = num_blocks_row * num_bytes_block; 50 | 51 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 52 | const float* src_ptr = rhs_f32 + row_idx * k; 53 | 54 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 55 | 56 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 57 | float amax = 0.0f; 58 | float max = 0.0f; 59 | 60 | for (size_t b = 0; b < bl; ++b) { 61 | const float src0_0 = src_ptr[block_idx * bl + b]; 62 | const float asrc0_0 = fabsf(src0_0); 63 | 64 | if (amax < asrc0_0) { 65 | amax = asrc0_0; 66 | max = src0_0; 67 | } 68 | } 69 | 70 | const float scale = max / -8.0; 71 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 72 | 73 | // Store the scale at the beginning of the block 74 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 75 | dst_ptr += sizeof(uint16_t); 76 | 77 | const size_t block_size = 32; 78 | const size_t num_subblocks = bl / 32; 79 | 80 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 81 | for (size_t i = 0; i < block_size / 2; ++i) { 82 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 83 | float v0_f32 = src_ptr[src_base_addr]; 84 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 85 | 86 | v0_f32 *= recip_scale; 87 | v1_f32 *= recip_scale; 88 | 89 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 90 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 91 | 92 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 93 | 94 | dst_ptr[0] = rhs_v0; 95 | dst_ptr += sizeof(uint8_t); 96 | } 97 | } 98 | } 99 | } 100 | }; 101 | 102 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 103 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 104 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 105 | const size_t dst_stride = num_blocks_row * num_bytes_block; 106 | 107 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 108 | const float* src_ptr = rhs_f32 + row_idx * k; 109 | 110 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 111 | 112 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 113 | float amax = 0.0f; 114 | 115 | for (size_t b = 0; b < bl; ++b) { 116 | const float src0_0 = src_ptr[block_idx * bl + b]; 117 | const float asrc0_0 = fabsf(src0_0); 118 | 119 | if (amax < asrc0_0) { 120 | amax = asrc0_0; 121 | } 122 | } 123 | 124 | const float scale = amax / ((1 << 7) - 1); 125 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 126 | 127 | // Store the scale at the beginning of the block 128 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 129 | dst_ptr += sizeof(uint16_t); 130 | 131 | const size_t block_size = 32; 132 | const size_t num_subblocks = bl / 32; 133 | 134 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 135 | for (size_t i = 0; i < block_size; ++i) { 136 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 137 | float v0_f32 = src_ptr[src_base_addr]; 138 | 139 | v0_f32 *= recip_scale; 140 | 141 | dst_ptr[0] = roundf(v0_f32); 142 | dst_ptr += sizeof(int8_t); 143 | } 144 | } 145 | } 146 | } 147 | }; 148 | 149 | 150 | static void ref_matmul_f32_qs8d32_qs4c32( 151 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 152 | float scalar_min, float scalar_max) { 153 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 154 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 155 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 156 | 157 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 158 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 159 | 160 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 161 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 162 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 163 | // Main f32 accumulator 164 | float main_acc = 0.0f; 165 | 166 | const size_t block_size = 32; 167 | const size_t num_subblocks = bl / 32; 168 | 169 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 170 | const int8_t* lhs_ptr = lhs_ptr_start; 171 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 172 | 173 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 174 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 175 | 176 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 177 | 
int32_t temp_acc = 0; 178 | 179 | // Get the LHS/RHS quantization scale stored at the 180 | // beginning of each block 181 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 182 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 183 | 184 | lhs_ptr += sizeof(uint16_t); 185 | rhs_ptr += sizeof(uint16_t); 186 | 187 | for (size_t i = 0; i < block_size / 2; ++i) { 188 | // Get the LHS values 189 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 190 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 191 | 192 | // Get the RHS values 193 | const uint8_t rhs_byte = rhs_ptr[0]; 194 | 195 | // Unpack the RHS values 196 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 197 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 198 | 199 | temp_acc += lhs_v0 * rhs_v0; 200 | temp_acc += lhs_v1 * rhs_v1; 201 | 202 | lhs_ptr += 1; 203 | rhs_ptr += 1; 204 | } 205 | 206 | main_acc += temp_acc * lhs_scale * rhs_scale; 207 | } 208 | } 209 | 210 | main_acc = std::max(main_acc, scalar_min); 211 | main_acc = std::min(main_acc, scalar_max); 212 | 213 | dst_f32[0] = main_acc; 214 | dst_f32 += 1; 215 | } 216 | } 217 | }; 218 | 219 | 220 | 221 | 222 | int main() { 223 | // Declare matrix dimensions 224 | // Open CSV file for writing 225 | std::ofstream csv_file("../../results/i8_dotprod_scaling_results.csv"); 226 | // Write header 227 | csv_file << "Size,Latency(us)\n"; 228 | 229 | for (int size: sizes) { 230 | 231 | std::vector<float> X(size * size); 232 | std::vector<float> W(size * size); 233 | std::vector<float> Y(size * size, 0.0f); // Initialize with zeros 234 | 235 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 236 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 237 | 238 | size_t M = size; 239 | size_t N = size; 240 | size_t K = size; 241 | 242 | float* lhs = X.data(); 243 | float* rhs = W.data(); 244 | 245 | const size_t mr = ukernel.get_mr(); 246 | const size_t nr = 
ukernel.get_nr(); 247 | const size_t kr = ukernel.get_kr(); 248 | const size_t sr = ukernel.get_sr(); 249 | 250 | const size_t bl = 32; // Block length. It must be 32 251 | const size_t m = size; 252 | const size_t n = size; 253 | const size_t k = size; 254 | const size_t seed_lhs = 4568; 255 | const size_t seed_rhs = seed_lhs + 4; 256 | 257 | const size_t num_blocks = k / bl; 258 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 259 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 260 | 261 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 262 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 263 | 264 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 265 | 266 | 267 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 268 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 269 | 270 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 271 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 272 | 273 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 274 | 275 | 276 | 277 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 278 | auto start = std::chrono::high_resolution_clock::now(); 279 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 280 | const size_t dst_size = ukernel.get_dst_size(m, n); 281 | 282 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 283 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 284 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 285 | 286 | struct kai_rhs_pack_qs4cxs1s0_param params; 287 | params.lhs_zero_point = 1; 288 | params.rhs_zero_point = 8; 289 | 290 | const size_t dst_stride = n * sizeof(float); 291 | const size_t lhs_offset = 
ukernel.get_lhs_packed_offset(0, k, bl); 292 | const size_t rhs_offset = ukernel.get_rhs_packed_offset(0, k, bl); 293 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 294 | 295 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 296 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 297 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 298 | 299 | ukernel.run_matmul( 300 | m, n, k, bl, // Dimensions 301 | lhs_ptr, // LHS packed 302 | rhs_ptr, // RHS packed 303 | dst_ptr, // DST 304 | dst_stride, // DST stride (row) 305 | sizeof(float), // DST stride (col) 306 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 307 | ); 308 | 309 | auto end = std::chrono::high_resolution_clock::now(); 310 | double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 311 | //std::cout << "Time taken: " << duration << " microseconds" << std::endl; 312 | csv_file << size << "," << duration << "\n"; 313 | 314 | } 315 | 316 | return 0; 317 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_dotprod/kai_i8_dotprod_inf.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <chrono> 3 | #include "kernel.cpp" 4 | #include "kai_lhs_quant_pack_qsi8d32p_f32.h" 5 | #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 6 | 7 | #include <cstdint> 8 | #include <cstdio> 9 | #include <cstdlib> 10 | #include <cmath> 11 | #include <cfloat> 12 | #include <vector> 13 | #include <algorithm> 14 | 15 | 16 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 17 | FILE* file = fopen(filename, "rb"); 18 | if (file == NULL) { 19 | fprintf(stderr, "Error: Could not open file %s\n", filename); 20 | return; 21 | } 22 | 23 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 24 | if (elements_read != rows * cols) { 25 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 26 | } 
27 | fclose(file); 28 | } 29 | 30 | 31 | 32 | static inline size_t num_blocks_per_row(size_t k, size_t bl) { 33 | return k / bl; 34 | } 35 | 36 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 37 | return bl + sizeof(int16_t); 38 | } 39 | 40 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 41 | return (bl / 2) + sizeof(int16_t); 42 | } 43 | 44 | 45 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 46 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 47 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 48 | const size_t dst_stride = num_blocks_row * num_bytes_block; 49 | 50 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 51 | const float* src_ptr = rhs_f32 + row_idx * k; 52 | 53 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 54 | 55 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 56 | float amax = 0.0f; 57 | float max = 0.0f; 58 | 59 | for (size_t b = 0; b < bl; ++b) { 60 | const float src0_0 = src_ptr[block_idx * bl + b]; 61 | const float asrc0_0 = fabsf(src0_0); 62 | 63 | if (amax < asrc0_0) { 64 | amax = asrc0_0; 65 | max = src0_0; 66 | } 67 | } 68 | 69 | const float scale = max / -8.0; 70 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 71 | 72 | // Store the scale at the beginning of the block 73 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 74 | dst_ptr += sizeof(uint16_t); 75 | 76 | const size_t block_size = 32; 77 | const size_t num_subblocks = bl / 32; 78 | 79 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 80 | for (size_t i = 0; i < block_size / 2; ++i) { 81 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 82 | float v0_f32 = src_ptr[src_base_addr]; 83 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 84 | 85 | v0_f32 *= recip_scale; 86 | v1_f32 *= recip_scale; 87 | 88 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 89 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 90 | 91 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 92 | 93 | dst_ptr[0] = rhs_v0; 94 | dst_ptr += sizeof(uint8_t); 95 | } 96 | } 97 | } 98 | } 99 | }; 100 | 101 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 102 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 103 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 104 | const size_t dst_stride = num_blocks_row * num_bytes_block; 105 | 106 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 107 | const float* src_ptr = rhs_f32 + row_idx * k; 108 | 109 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 110 | 111 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 112 | float amax = 0.0f; 113 | 114 | for (size_t b = 0; b < bl; ++b) { 115 | const float src0_0 = src_ptr[block_idx * bl + b]; 116 | const float asrc0_0 = fabsf(src0_0); 117 | 118 | if (amax < asrc0_0) { 119 | amax = asrc0_0; 120 | } 121 | } 122 | 123 | const float scale = amax / ((1 << 7) - 1); 124 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 125 | 126 | // Store the scale at the beginning of the block 127 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 128 | dst_ptr += sizeof(uint16_t); 129 | 130 | const size_t block_size = 32; 131 | const size_t num_subblocks = bl / 32; 132 | 133 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 134 | for (size_t i = 0; i < block_size; ++i) { 135 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 136 | float v0_f32 = src_ptr[src_base_addr]; 137 | 138 | v0_f32 *= recip_scale; 139 | 140 | dst_ptr[0] = roundf(v0_f32); 141 | dst_ptr += sizeof(int8_t); 142 | } 143 | } 144 | } 145 | } 146 | }; 147 | 148 | 149 | static void ref_matmul_f32_qs8d32_qs4c32( 150 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 151 | float scalar_min, float scalar_max) { 152 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 153 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 154 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 155 | 156 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 157 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 158 | 159 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 160 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 161 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 162 | // Main f32 accumulator 163 | float main_acc = 0.0f; 164 | 165 | const size_t block_size = 32; 166 | const size_t num_subblocks = bl / 32; 167 | 168 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 169 | const int8_t* lhs_ptr = lhs_ptr_start; 170 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 171 | 172 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 173 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 174 | 175 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 176 | 
int32_t temp_acc = 0; 177 | 178 | // Get the LHS/RHS quantization scale stored at the 179 | // beginning of each block 180 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 181 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 182 | 183 | lhs_ptr += sizeof(uint16_t); 184 | rhs_ptr += sizeof(uint16_t); 185 | 186 | for (size_t i = 0; i < block_size / 2; ++i) { 187 | // Get the LHS values 188 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 189 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 190 | 191 | // Get the RHS values 192 | const uint8_t rhs_byte = rhs_ptr[0]; 193 | 194 | // Unpack the RHS values 195 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 196 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 197 | 198 | temp_acc += lhs_v0 * rhs_v0; 199 | temp_acc += lhs_v1 * rhs_v1; 200 | 201 | lhs_ptr += 1; 202 | rhs_ptr += 1; 203 | } 204 | 205 | main_acc += temp_acc * lhs_scale * rhs_scale; 206 | } 207 | } 208 | 209 | main_acc = std::max(main_acc, scalar_min); 210 | main_acc = std::min(main_acc, scalar_max); 211 | 212 | dst_f32[0] = main_acc; 213 | dst_f32 += 1; 214 | } 215 | } 216 | }; 217 | 218 | 219 | 220 | 221 | int main() { 222 | // Declare matrix dimensions 223 | const size_t activation_rows = 6, activation_cols = 1280; 224 | const size_t weight_rows = 1280, weight_cols = 32000; 225 | 226 | std::vector<float> X(activation_rows * activation_cols); 227 | std::vector<float> W(weight_rows * weight_cols); 228 | std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 229 | 230 | size_t M = activation_rows; 231 | size_t N = weight_cols; 232 | size_t K = activation_cols; 233 | 234 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 235 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 236 | 237 | float* lhs = X.data(); 238 | float* rhs = W.data(); 239 | 240 | const size_t mr = ukernel.get_mr(); 241 | const size_t nr = 
ukernel.get_nr(); 242 | const size_t kr = ukernel.get_kr(); 243 | const size_t sr = ukernel.get_sr(); 244 | 245 | const size_t bl = 32; // Block length. It must be 32 246 | const size_t m = activation_rows; 247 | const size_t n = weight_cols; 248 | const size_t k = activation_cols; 249 | const size_t seed_lhs = 4568; 250 | const size_t seed_rhs = seed_lhs + 4; 251 | 252 | const size_t num_blocks = k / bl; 253 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 254 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 255 | 256 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 257 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 258 | 259 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 260 | 261 | 262 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 263 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 264 | 265 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 266 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 267 | 268 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 269 | 270 | 271 | 272 | 273 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 274 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 275 | const size_t dst_size = ukernel.get_dst_size(m, n); 276 | 277 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 278 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 279 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 280 | 281 | struct kai_rhs_pack_qs4cxs1s0_param params; 282 | params.lhs_zero_point = 1; 283 | params.rhs_zero_point = 8; 284 | /* 285 | kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( 286 | 1, n, k, // Dimensions 287 | nr, kr, sr, // Packing arguments 288 | 
bl, // Block length 289 | (const uint8_t*)(rhs_native_mtx_qs4c32), // RHS 290 | NULL, // Bias 291 | rhs_packed_mtx_qs4c32, // RHS packed 292 | 0, &params); 293 | */ 294 | ref_matmul_f32_qs8d32_qs4c32( 295 | m, n, k, bl, (const int8_t*)lhs_ref_mtx_qa8d32, (const uint8_t*)rhs_native_mtx_qs4c32, (float*)dst_ref_mtx_f32, 296 | -FLT_MAX, FLT_MAX); 297 | 298 | // If the RHS matrix contains constant values, the packing can be performed 299 | // only once 300 | 301 | const size_t dst_stride = n * sizeof(float); 302 | const size_t lhs_offset = ukernel.get_lhs_packed_offset(0, k, bl); 303 | const size_t rhs_offset = ukernel.get_rhs_packed_offset(0, k, bl); 304 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 305 | 306 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 307 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 308 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 309 | auto start = std::chrono::high_resolution_clock::now(); 310 | while (true) { 311 | ukernel.run_matmul( 312 | m, n, k, bl, // Dimensions 313 | lhs_ptr, // LHS packed 314 | rhs_ptr, // RHS packed 315 | dst_ptr, // DST 316 | dst_stride, // DST stride (row) 317 | sizeof(float), // DST stride (col) 318 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 319 | ); 320 | } 321 | 322 | auto end = std::chrono::high_resolution_clock::now(); 323 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 324 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 325 | 326 | return 0; 327 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_i8mm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(benchmark_i8_i8mm) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | 7 | # Paths 8 | 
set(KLEIDIAI_PATH ../../../kleidiai/) 9 | set(MATMUL_PACK_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/pack/) 10 | set(MATMUL_PATH ${KLEIDIAI_PATH}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/) 11 | 12 | # KleidiAI include directories 13 | include_directories( 14 | ${KLEIDIAI_PATH} 15 | ${MATMUL_PACK_PATH} 16 | ${MATMUL_PATH} 17 | ) 18 | 19 | # Executable 20 | add_executable(benchmark_i8_i8mm 21 | benchmark_i8_i8mm.cpp 22 | kernel.cpp 23 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c 24 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 25 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 26 | ) 27 | 28 | add_executable(benchmark_i8_i8mm_scaling 29 | benchmark_i8_i8mm_scaling.cpp 30 | kernel.cpp 31 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c 32 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 33 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 34 | ) 35 | 36 | add_executable(kai_i8_i8mm_inf 37 | kai_i8_i8mm_inf.cpp 38 | kernel.cpp 39 | ${MATMUL_PATH}/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_8x4x32_neon_i8mm.c 40 | ${MATMUL_PACK_PATH}/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c 41 | ${MATMUL_PACK_PATH}/kai_lhs_quant_pack_qsi8d32p_f32.c 42 | ) 43 | 44 | # Compilation options 45 | target_compile_options(benchmark_i8_i8mm 46 | PRIVATE -march=armv8.4-a+sve+i8mm -mtune=neoverse-v1 47 | ) 48 | 49 | target_compile_options(benchmark_i8_i8mm_scaling 50 | PRIVATE -march=armv8.4-a+sve+i8mm -mtune=neoverse-v1 51 | ) 52 | 53 | target_compile_options(kai_i8_i8mm_inf 54 | PRIVATE -march=armv8.4-a+sve+i8mm -mtune=neoverse-v1 55 | ) 56 | 57 | # Debug definitions 58 | target_compile_definitions(benchmark_i8_i8mm 59 | PRIVATE $<$<CONFIG:Debug>:KAI_DEBUG> 60 | ) 61 | -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_i8mm/benchmark_i8_i8mm.cpp: 
-------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <chrono> 3 | #include "kernel.cpp" 4 | #include "kai_lhs_quant_pack_qsi8d32p_f32.h" 5 | #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 6 | 7 | #include <cstdint> 8 | #include <cstdio> 9 | #include <cstdlib> 10 | #include <cmath> 11 | #include <cfloat> 12 | #include <vector> 13 | #include <algorithm> 14 | 15 | 16 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 17 | FILE* file = fopen(filename, "rb"); 18 | if (file == NULL) { 19 | fprintf(stderr, "Error: Could not open file %s\n", filename); 20 | return; 21 | } 22 | 23 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 24 | if (elements_read != rows * cols) { 25 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 26 | } 27 | fclose(file); 28 | } 29 | 30 | 31 | 32 | static inline size_t num_blocks_per_row(size_t k, size_t bl) { 33 | return k / bl; 34 | } 35 | 36 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 37 | return bl + sizeof(int16_t); 38 | } 39 | 40 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 41 | return (bl / 2) + sizeof(int16_t); 42 | } 43 | 44 | 45 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 46 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 47 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 48 | const size_t dst_stride = num_blocks_row * num_bytes_block; 49 | 50 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 51 | const float* src_ptr = rhs_f32 + row_idx * k; 52 | 53 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 54 | 55 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 56 | float amax = 0.0f; 57 | float max = 0.0f; 58 | 59 | for (size_t b = 0; b < bl; ++b) { 60 | const float src0_0 = src_ptr[block_idx * bl + b]; 61 | const float asrc0_0 = fabsf(src0_0); 62 | 63 | if (amax < asrc0_0) { 64 | amax = asrc0_0; 65 | 
max = src0_0; 66 | } 67 | } 68 | 69 | const float scale = max / -8.0; 70 | const float recip_scale = scale ? 1.0f / scale : 0.0f; 71 | 72 | // Store the scale at the beginning of the block 73 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 74 | dst_ptr += sizeof(uint16_t); 75 | 76 | const size_t block_size = 32; 77 | const size_t num_subblocks = bl / 32; 78 | 79 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 80 | for (size_t i = 0; i < block_size / 2; ++i) { 81 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 82 | float v0_f32 = src_ptr[src_base_addr]; 83 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 84 | 85 | v0_f32 *= recip_scale; 86 | v1_f32 *= recip_scale; 87 | 88 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 89 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 90 | 91 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 92 | 93 | dst_ptr[0] = rhs_v0; 94 | dst_ptr += sizeof(uint8_t); 95 | } 96 | } 97 | } 98 | } 99 | }; 100 | 101 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 102 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 103 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 104 | const size_t dst_stride = num_blocks_row * num_bytes_block; 105 | 106 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 107 | const float* src_ptr = rhs_f32 + row_idx * k; 108 | 109 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 110 | 111 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 112 | float amax = 0.0f; 113 | 114 | for (size_t b = 0; b < bl; ++b) { 115 | const float src0_0 = src_ptr[block_idx * bl + b]; 116 | const float asrc0_0 = fabsf(src0_0); 117 | 118 | if (amax < asrc0_0) { 119 | amax = asrc0_0; 120 | } 121 | } 122 | 123 | const float scale = amax / ((1 << 7) - 1); 124 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 125 | 126 | // Store the scale at the beginning of the block 127 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 128 | dst_ptr += sizeof(uint16_t); 129 | 130 | const size_t block_size = 32; 131 | const size_t num_subblocks = bl / 32; 132 | 133 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 134 | for (size_t i = 0; i < block_size; ++i) { 135 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 136 | float v0_f32 = src_ptr[src_base_addr]; 137 | 138 | v0_f32 *= recip_scale; 139 | 140 | dst_ptr[0] = roundf(v0_f32); 141 | dst_ptr += sizeof(int8_t); 142 | } 143 | } 144 | } 145 | } 146 | }; 147 | 148 | 149 | static void ref_matmul_f32_qs8d32_qs4c32( 150 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 151 | float scalar_min, float scalar_max) { 152 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 153 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 154 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 155 | 156 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 157 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 158 | 159 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 160 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 161 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 162 | // Main f32 accumulator 163 | float main_acc = 0.0f; 164 | 165 | const size_t block_size = 32; 166 | const size_t num_subblocks = bl / 32; 167 | 168 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 169 | const int8_t* lhs_ptr = lhs_ptr_start; 170 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 171 | 172 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 173 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 174 | 175 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 176 | 
int32_t temp_acc = 0; 177 | 178 | // Get the LHS/RHS quantization scale stored at the 179 | // beginning of each block 180 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 181 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 182 | 183 | lhs_ptr += sizeof(uint16_t); 184 | rhs_ptr += sizeof(uint16_t); 185 | 186 | for (size_t i = 0; i < block_size / 2; ++i) { 187 | // Get the LHS values 188 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 189 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 190 | 191 | // Get the RHS values 192 | const uint8_t rhs_byte = rhs_ptr[0]; 193 | 194 | // Unpack the RHS values 195 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 196 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 197 | 198 | temp_acc += lhs_v0 * rhs_v0; 199 | temp_acc += lhs_v1 * rhs_v1; 200 | 201 | lhs_ptr += 1; 202 | rhs_ptr += 1; 203 | } 204 | 205 | main_acc += temp_acc * lhs_scale * rhs_scale; 206 | } 207 | } 208 | 209 | main_acc = std::max(main_acc, scalar_min); 210 | main_acc = std::min(main_acc, scalar_max); 211 | 212 | dst_f32[0] = main_acc; 213 | dst_f32 += 1; 214 | } 215 | } 216 | }; 217 | 218 | 219 | 220 | 221 | int main() { 222 | // Declare matrix dimensions 223 | const size_t activation_rows = 6, activation_cols = 1280; 224 | const size_t weight_rows = 1280, weight_cols = 32000; 225 | 226 | std::vector<float> X(activation_rows * activation_cols); 227 | std::vector<float> W(weight_rows * weight_cols); 228 | std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 229 | 230 | size_t M = activation_rows; 231 | size_t N = weight_cols; 232 | size_t K = activation_cols; 233 | 234 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 235 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 236 | 237 | float* lhs = X.data(); 238 | float* rhs = W.data(); 239 | 240 | const size_t mr = ukernel.get_mr(); 241 | const size_t nr =
ukernel.get_nr(); 242 | const size_t kr = ukernel.get_kr(); 243 | const size_t sr = ukernel.get_sr(); 244 | 245 | const size_t bl = 32; // Block length. It must be 32 246 | const size_t m = activation_rows; 247 | const size_t n = weight_cols; 248 | const size_t k = activation_cols; 249 | const size_t seed_lhs = 4568; 250 | const size_t seed_rhs = seed_lhs + 4; 251 | 252 | const size_t num_blocks = k / bl; 253 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 254 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 255 | 256 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 257 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 258 | 259 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 260 | 261 | 262 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 263 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 264 | 265 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 266 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 267 | 268 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 269 | 270 | 271 | 272 | 273 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 274 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 275 | const size_t dst_size = ukernel.get_dst_size(m, n); 276 | 277 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 278 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 279 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 280 | 281 | struct kai_rhs_pack_qs4cxs1s0_param params; 282 | params.lhs_zero_point = 1; 283 | params.rhs_zero_point = 8; 284 | /* 285 | kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( 286 | 1, n, k, // Dimensions 287 | nr, kr, sr, // Packing arguments 288 | 
bl, // Block length 289 | (const uint8_t*)(rhs_native_mtx_qs4c32), // RHS 290 | NULL, // Bias 291 | rhs_packed_mtx_qs4c32, // RHS packed 292 | 0, &params); 293 | */ 294 | ref_matmul_f32_qs8d32_qs4c32( 295 | m, n, k, bl, (const int8_t*)lhs_ref_mtx_qa8d32, (const uint8_t*)rhs_native_mtx_qs4c32, (float*)dst_ref_mtx_f32, 296 | -FLT_MAX, FLT_MAX); 297 | 298 | // If the RHS matrix contains constant values, the packing can be performed 299 | // only once 300 | 301 | 302 | 303 | 304 | const size_t dst_stride = n * sizeof(float); 305 | const size_t lhs_offset = ukernel.get_lhs_packed_offset(0, k, bl); 306 | const size_t rhs_offset = ukernel.get_rhs_packed_offset(0, k, bl); 307 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 308 | 309 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 310 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 311 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 312 | 313 | auto start = std::chrono::high_resolution_clock::now(); 314 | 315 | ukernel.run_matmul( 316 | m, n, k, bl, // Dimensions 317 | lhs_ptr, // LHS packed 318 | rhs_ptr, // RHS packed 319 | dst_ptr, // DST 320 | dst_stride, // DST stride (row) 321 | sizeof(float), // DST stride (col) 322 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 323 | ); 324 | 325 | auto end = std::chrono::high_resolution_clock::now(); 326 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 327 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 328 | 329 | return 0; 330 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_i8mm/benchmark_i8_i8mm_scaling.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <vector> 3 | #include "../common/sizes.cpp" 4 | #include "kernel.cpp" 5 | #include "kai_lhs_quant_pack_qsi8d32p_f32.h" 6 | #include
"kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 7 | 8 | #include <chrono> 9 | #include <fstream> 10 | #include <cfloat> 11 | #include <cmath> 12 | #include <cstdint> 13 | #include <cstdio> 14 | #include <cstdlib> 15 | #include <algorithm> 16 | 17 | 18 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 19 | FILE* file = fopen(filename, "rb"); 20 | if (file == NULL) { 21 | fprintf(stderr, "Error: Could not open file %s\n", filename); 22 | return; 23 | } 24 | 25 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 26 | if (elements_read != rows * cols) { 27 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 28 | } 29 | fclose(file); 30 | } 31 | 32 | 33 | 34 | static inline size_t num_blocks_per_row(size_t k, size_t bl) { 35 | return k / bl; 36 | } 37 | 38 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 39 | return bl + sizeof(int16_t); 40 | } 41 | 42 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 43 | return (bl / 2) + sizeof(int16_t); 44 | } 45 | 46 | 47 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 48 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 49 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 50 | const size_t dst_stride = num_blocks_row * num_bytes_block; 51 | 52 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 53 | const float* src_ptr = rhs_f32 + row_idx * k; 54 | 55 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 56 | 57 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 58 | float amax = 0.0f; 59 | float max = 0.0f; 60 | 61 | for (size_t b = 0; b < bl; ++b) { 62 | const float src0_0 = src_ptr[block_idx * bl + b]; 63 | const float asrc0_0 = fabsf(src0_0); 64 | 65 | if (amax < asrc0_0) { 66 | amax = asrc0_0; 67 | max = src0_0; 68 | } 69 | } 70 | 71 | const float scale = max / -8.0; 72 | const float recip_scale = scale ?
1.0f / scale : 0.0f; 73 | 74 | // Store the scale at the beginning of the block 75 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 76 | dst_ptr += sizeof(uint16_t); 77 | 78 | const size_t block_size = 32; 79 | const size_t num_subblocks = bl / 32; 80 | 81 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 82 | for (size_t i = 0; i < block_size / 2; ++i) { 83 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 84 | float v0_f32 = src_ptr[src_base_addr]; 85 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 86 | 87 | v0_f32 *= recip_scale; 88 | v1_f32 *= recip_scale; 89 | 90 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 91 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 92 | 93 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 94 | 95 | dst_ptr[0] = rhs_v0; 96 | dst_ptr += sizeof(uint8_t); 97 | } 98 | } 99 | } 100 | } 101 | }; 102 | 103 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 104 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 105 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 106 | const size_t dst_stride = num_blocks_row * num_bytes_block; 107 | 108 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 109 | const float* src_ptr = rhs_f32 + row_idx * k; 110 | 111 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 112 | 113 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 114 | float amax = 0.0f; 115 | 116 | for (size_t b = 0; b < bl; ++b) { 117 | const float src0_0 = src_ptr[block_idx * bl + b]; 118 | const float asrc0_0 = fabsf(src0_0); 119 | 120 | if (amax < asrc0_0) { 121 | amax = asrc0_0; 122 | } 123 | } 124 | 125 | const float scale = amax / ((1 << 7) - 1); 126 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 127 | 128 | // Store the scale at the beginning of the block 129 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 130 | dst_ptr += sizeof(uint16_t); 131 | 132 | const size_t block_size = 32; 133 | const size_t num_subblocks = bl / 32; 134 | 135 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 136 | for (size_t i = 0; i < block_size; ++i) { 137 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 138 | float v0_f32 = src_ptr[src_base_addr]; 139 | 140 | v0_f32 *= recip_scale; 141 | 142 | dst_ptr[0] = roundf(v0_f32); 143 | dst_ptr += sizeof(int8_t); 144 | } 145 | } 146 | } 147 | } 148 | }; 149 | 150 | 151 | static void ref_matmul_f32_qs8d32_qs4c32( 152 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 153 | float scalar_min, float scalar_max) { 154 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 155 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 156 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 157 | 158 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 159 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 160 | 161 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 162 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 163 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 164 | // Main f32 accumulator 165 | float main_acc = 0.0f; 166 | 167 | const size_t block_size = 32; 168 | const size_t num_subblocks = bl / 32; 169 | 170 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 171 | const int8_t* lhs_ptr = lhs_ptr_start; 172 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 173 | 174 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 175 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 176 | 177 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 178 | 
int32_t temp_acc = 0; 179 | 180 | // Get the LHS/RHS quantization scale stored at the 181 | // beginning of each block 182 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 183 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 184 | 185 | lhs_ptr += sizeof(uint16_t); 186 | rhs_ptr += sizeof(uint16_t); 187 | 188 | for (size_t i = 0; i < block_size / 2; ++i) { 189 | // Get the LHS values 190 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 191 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 192 | 193 | // Get the RHS values 194 | const uint8_t rhs_byte = rhs_ptr[0]; 195 | 196 | // Unpack the RHS values 197 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 198 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 199 | 200 | temp_acc += lhs_v0 * rhs_v0; 201 | temp_acc += lhs_v1 * rhs_v1; 202 | 203 | lhs_ptr += 1; 204 | rhs_ptr += 1; 205 | } 206 | 207 | main_acc += temp_acc * lhs_scale * rhs_scale; 208 | } 209 | } 210 | 211 | main_acc = std::max(main_acc, scalar_min); 212 | main_acc = std::min(main_acc, scalar_max); 213 | 214 | dst_f32[0] = main_acc; 215 | dst_f32 += 1; 216 | } 217 | } 218 | }; 219 | 220 | 221 | 222 | 223 | int main() { 224 | std::ofstream csv_file("../../results/i8_i8mm_scaling_results.csv"); 225 | // Write header 226 | csv_file << "Size,Latency(us)\n"; 227 | 228 | for (int size: sizes) { 229 | 230 | std::vector<float> X(size * size); 231 | std::vector<float> W(size * size); 232 | std::vector<float> Y(size * size, 0.0f); // Initialize with zeros 233 | 234 | std::generate(X.begin(), X.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 235 | std::generate(W.begin(), W.end(), []() { return static_cast<float>(rand()) / RAND_MAX; }); 236 | 237 | size_t M = size; 238 | size_t N = size; 239 | size_t K = size; 240 | 241 | float* lhs = X.data(); 242 | float* rhs = W.data(); 243 | 244 | const size_t mr = ukernel.get_mr(); 245 | const size_t nr = ukernel.get_nr(); 246 | const size_t kr = ukernel.get_kr(); 247 | const
size_t sr = ukernel.get_sr(); 248 | 249 | const size_t bl = 32; // Block length. It must be 32 250 | const size_t m = size; 251 | const size_t n = size; 252 | const size_t k = size; 253 | const size_t seed_lhs = 4568; 254 | const size_t seed_rhs = seed_lhs + 4; 255 | 256 | const size_t num_blocks = k / bl; 257 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 258 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 259 | 260 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 261 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 262 | 263 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 264 | 265 | 266 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 267 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 268 | 269 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 270 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 271 | 272 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 273 | 274 | 275 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 276 | auto start = std::chrono::high_resolution_clock::now(); 277 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 278 | 279 | const size_t dst_size = ukernel.get_dst_size(m, n); 280 | 281 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 282 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 283 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 284 | 285 | struct kai_rhs_pack_qs4cxs1s0_param params; 286 | params.lhs_zero_point = 1; 287 | params.rhs_zero_point = 8; 288 | 289 | const size_t dst_stride = n * sizeof(float); 290 | const size_t lhs_offset = ukernel.get_lhs_packed_offset(0, k, bl); 291 | const size_t rhs_offset = 
ukernel.get_rhs_packed_offset(0, k, bl); 292 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 293 | 294 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 295 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 296 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 297 | 298 | ukernel.run_matmul( 299 | m, n, k, bl, // Dimensions 300 | lhs_ptr, // LHS packed 301 | rhs_ptr, // RHS packed 302 | dst_ptr, // DST 303 | dst_stride, // DST stride (row) 304 | sizeof(float), // DST stride (col) 305 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 306 | ); 307 | 308 | auto end = std::chrono::high_resolution_clock::now(); 309 | double duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 310 | //std::cout << "Time taken: " << duration << " microseconds" << std::endl; 311 | csv_file << size << "," << duration << "\n"; 312 | } 313 | return 0; 314 | } -------------------------------------------------------------------------------- /src/cpp/f32_i8_i4_i8mm/kai_i8_i8mm_inf.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <vector> 3 | #include "kernel.cpp" 4 | #include "kai_lhs_quant_pack_qsi8d32p_f32.h" 5 | #include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h" 6 | 7 | #include <chrono> 8 | #include <cfloat> 9 | #include <cmath> 10 | #include <cstdint> 11 | #include <cstdio> 12 | #include <cstring> 13 | #include <algorithm> 14 | 15 | 16 | void loadMatrix(const char* filename, float* matrix, size_t rows, size_t cols) { 17 | FILE* file = fopen(filename, "rb"); 18 | if (file == NULL) { 19 | fprintf(stderr, "Error: Could not open file %s\n", filename); 20 | return; 21 | } 22 | 23 | size_t elements_read = fread(matrix, sizeof(float), rows * cols, file); 24 | if (elements_read != rows * cols) { 25 | fprintf(stderr, "Error: Only %zu elements could be read.\n", elements_read); 26 | } 27 | fclose(file); 28 | } 29 | 30 | 31 | 32 | static inline size_t
num_blocks_per_row(size_t k, size_t bl) { 33 | return k / bl; 34 | } 35 | 36 | static inline size_t num_bytes_per_block_qs8c32(size_t bl) { 37 | return bl + sizeof(int16_t); 38 | } 39 | 40 | static inline size_t num_bytes_per_block_qs4c32(size_t bl) { 41 | return (bl / 2) + sizeof(int16_t); 42 | } 43 | 44 | 45 | static void quant_qs4c32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs4c32) { 46 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 47 | const size_t num_bytes_block = num_bytes_per_block_qs4c32(bl); 48 | const size_t dst_stride = num_blocks_row * num_bytes_block; 49 | 50 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 51 | const float* src_ptr = rhs_f32 + row_idx * k; 52 | 53 | uint8_t* dst_ptr = (uint8_t*)rhs_qs4c32 + row_idx * dst_stride; 54 | 55 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 56 | float amax = 0.0f; 57 | float max = 0.0f; 58 | 59 | for (size_t b = 0; b < bl; ++b) { 60 | const float src0_0 = src_ptr[block_idx * bl + b]; 61 | const float asrc0_0 = fabsf(src0_0); 62 | 63 | if (amax < asrc0_0) { 64 | amax = asrc0_0; 65 | max = src0_0; 66 | } 67 | } 68 | 69 | const float scale = max / -8.0; 70 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 71 | 72 | // Store the scale at the beginning of the block 73 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 74 | dst_ptr += sizeof(uint16_t); 75 | 76 | const size_t block_size = 32; 77 | const size_t num_subblocks = bl / 32; 78 | 79 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 80 | for (size_t i = 0; i < block_size / 2; ++i) { 81 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 82 | float v0_f32 = src_ptr[src_base_addr]; 83 | float v1_f32 = src_ptr[src_base_addr + block_size / 2]; 84 | 85 | v0_f32 *= recip_scale; 86 | v1_f32 *= recip_scale; 87 | 88 | const uint8_t v0_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v0_f32 + 8.5f)); 89 | const uint8_t v1_u8 = (uint8_t)std::min((int8_t)15, (int8_t)(v1_f32 + 8.5f)); 90 | 91 | const uint8_t rhs_v0 = (v1_u8 << 4) | v0_u8; 92 | 93 | dst_ptr[0] = rhs_v0; 94 | dst_ptr += sizeof(uint8_t); 95 | } 96 | } 97 | } 98 | } 99 | }; 100 | 101 | static void ref_quant_qs8d32_f32(size_t n, size_t k, size_t bl, const float* rhs_f32, uint8_t* rhs_qs8c32) { 102 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 103 | const size_t num_bytes_block = num_bytes_per_block_qs8c32(bl); 104 | const size_t dst_stride = num_blocks_row * num_bytes_block; 105 | 106 | for (size_t row_idx = 0; row_idx < n; ++row_idx) { 107 | const float* src_ptr = rhs_f32 + row_idx * k; 108 | 109 | int8_t* dst_ptr = (int8_t*)rhs_qs8c32 + row_idx * dst_stride; 110 | 111 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 112 | float amax = 0.0f; 113 | 114 | for (size_t b = 0; b < bl; ++b) { 115 | const float src0_0 = src_ptr[block_idx * bl + b]; 116 | const float asrc0_0 = fabsf(src0_0); 117 | 118 | if (amax < asrc0_0) { 119 | amax = asrc0_0; 120 | } 121 | } 122 | 123 | const float scale = amax / ((1 << 7) - 1); 124 | const float recip_scale = scale ? 
1.0f / scale : 0.0f; 125 | 126 | // Store the scale at the beginning of the block 127 | *((uint16_t*)dst_ptr) = kai_cast_f16_f32(scale); 128 | dst_ptr += sizeof(uint16_t); 129 | 130 | const size_t block_size = 32; 131 | const size_t num_subblocks = bl / 32; 132 | 133 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 134 | for (size_t i = 0; i < block_size; ++i) { 135 | const size_t src_base_addr = block_idx * bl + i + subblock_idx * block_size; 136 | float v0_f32 = src_ptr[src_base_addr]; 137 | 138 | v0_f32 *= recip_scale; 139 | 140 | dst_ptr[0] = roundf(v0_f32); 141 | dst_ptr += sizeof(int8_t); 142 | } 143 | } 144 | } 145 | } 146 | }; 147 | 148 | 149 | static void ref_matmul_f32_qs8d32_qs4c32( 150 | size_t m, size_t n, size_t k, size_t bl, const int8_t* lhs_qa8d32, const uint8_t* rhs_qs4c32, float* dst_f32, 151 | float scalar_min, float scalar_max) { 152 | const size_t num_blocks_row = num_blocks_per_row(k, bl); 153 | const size_t num_bytes_block_qs4c32 = num_bytes_per_block_qs4c32(bl); 154 | const size_t num_bytes_block_qs8c32 = num_bytes_per_block_qs8c32(bl); 155 | 156 | const size_t lhs_stride = num_blocks_row * num_bytes_block_qs8c32; 157 | const size_t rhs_stride = num_blocks_row * num_bytes_block_qs4c32; 158 | 159 | for (size_t row_idx = 0; row_idx < m; ++row_idx) { 160 | const int8_t* lhs_ptr_start = lhs_qa8d32 + row_idx * lhs_stride; 161 | for (size_t col_idx = 0; col_idx < n; ++col_idx) { 162 | // Main f32 accumulator 163 | float main_acc = 0.0f; 164 | 165 | const size_t block_size = 32; 166 | const size_t num_subblocks = bl / 32; 167 | 168 | for (size_t block_idx = 0; block_idx < num_blocks_row; ++block_idx) { 169 | const int8_t* lhs_ptr = lhs_ptr_start; 170 | const uint8_t* rhs_ptr = rhs_qs4c32 + col_idx * rhs_stride; 171 | 172 | lhs_ptr += block_idx * num_bytes_block_qs8c32; 173 | rhs_ptr += block_idx * num_bytes_block_qs4c32; 174 | 175 | for (size_t subblock_idx = 0; subblock_idx < num_subblocks; ++subblock_idx) { 176 | 
int32_t temp_acc = 0; 177 | 178 | // Get the LHS/RHS quantization scale stored at the 179 | // beginning of each block 180 | const float lhs_scale = kai_cast_f32_f16(*(const uint16_t*)lhs_ptr); 181 | const float rhs_scale = kai_cast_f32_f16(*(const uint16_t*)rhs_ptr); 182 | 183 | lhs_ptr += sizeof(uint16_t); 184 | rhs_ptr += sizeof(uint16_t); 185 | 186 | for (size_t i = 0; i < block_size / 2; ++i) { 187 | // Get the LHS values 188 | const int32_t lhs_v0 = (int32_t)lhs_ptr[0]; 189 | const int32_t lhs_v1 = (int32_t)lhs_ptr[block_size / 2]; 190 | 191 | // Get the RHS values 192 | const uint8_t rhs_byte = rhs_ptr[0]; 193 | 194 | // Unpack the RHS values 195 | const int32_t rhs_v0 = (((int32_t)(rhs_byte & 0x0F)) - 8); 196 | const int32_t rhs_v1 = (((int32_t)(rhs_byte >> 4)) - 8); 197 | 198 | temp_acc += lhs_v0 * rhs_v0; 199 | temp_acc += lhs_v1 * rhs_v1; 200 | 201 | lhs_ptr += 1; 202 | rhs_ptr += 1; 203 | } 204 | 205 | main_acc += temp_acc * lhs_scale * rhs_scale; 206 | } 207 | } 208 | 209 | main_acc = std::max(main_acc, scalar_min); 210 | main_acc = std::min(main_acc, scalar_max); 211 | 212 | dst_f32[0] = main_acc; 213 | dst_f32 += 1; 214 | } 215 | } 216 | }; 217 | 218 | 219 | 220 | 221 | int main() { 222 | // Declare matrix dimensions 223 | const size_t activation_rows = 6, activation_cols = 1280; 224 | const size_t weight_rows = 1280, weight_cols = 32000; 225 | 226 | std::vector<float> X(activation_rows * activation_cols); 227 | std::vector<float> W(weight_rows * weight_cols); 228 | std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros 229 | 230 | size_t M = activation_rows; 231 | size_t N = weight_cols; 232 | size_t K = activation_cols; 233 | 234 | loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols); 235 | loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols); 236 | 237 | float* lhs = X.data(); 238 | float* rhs = W.data(); 239 | 240 | const size_t mr = ukernel.get_mr(); 241 | const size_t nr =
ukernel.get_nr(); 242 | const size_t kr = ukernel.get_kr(); 243 | const size_t sr = ukernel.get_sr(); 244 | 245 | const size_t bl = 32; // Block length. It must be 32 246 | const size_t m = activation_rows; 247 | const size_t n = weight_cols; 248 | const size_t k = activation_cols; 249 | const size_t seed_lhs = 4568; 250 | const size_t seed_rhs = seed_lhs + 4; 251 | 252 | const size_t num_blocks = k / bl; 253 | const size_t num_bytes_per_block_qs4c32 = (bl / 2) + sizeof(int16_t); 254 | const size_t num_bytes_per_block_qs8c32 = bl + sizeof(int16_t); 255 | 256 | const size_t rhs_native_size_qs4c32 = n * num_blocks * num_bytes_per_block_qs4c32; 257 | uint8_t* rhs_native_mtx_qs4c32 = new uint8_t[rhs_native_size_qs4c32]; 258 | 259 | quant_qs4c32_f32(n, k, bl, (const float*)W.data(), (uint8_t*)rhs_native_mtx_qs4c32); 260 | 261 | 262 | const size_t lhs_ref_size_qa8d32 = m * num_blocks * num_bytes_per_block_qs8c32; 263 | const size_t dst_ref_size_f32 = m * n * sizeof(float); 264 | 265 | uint8_t* lhs_ref_mtx_qa8d32 = new uint8_t[lhs_ref_size_qa8d32]; 266 | uint8_t* dst_ref_mtx_f32 = new uint8_t[dst_ref_size_f32]; 267 | 268 | ref_quant_qs8d32_f32(m, k, bl, (const float*)X.data(), (uint8_t*)lhs_ref_mtx_qa8d32); 269 | 270 | 271 | 272 | 273 | const size_t lhs_packed_size = kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32(m, k, bl, mr, kr, sr); 274 | const size_t rhs_packed_size = kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0(n, k, nr, kr, bl); 275 | const size_t dst_size = ukernel.get_dst_size(m, n); 276 | 277 | uint8_t* lhs_packed_mtx_qs8d32 = new uint8_t[lhs_packed_size]; 278 | uint8_t* rhs_packed_mtx_qs4c32 = new uint8_t[rhs_packed_size]; 279 | uint8_t* dst_act_mtx_f32 = new uint8_t[dst_size]; 280 | 281 | struct kai_rhs_pack_qs4cxs1s0_param params; 282 | params.lhs_zero_point = 1; 283 | params.rhs_zero_point = 8; 284 | /* 285 | kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0( 286 | 1, n, k, // Dimensions 287 | nr, kr, sr, // Packing arguments 288 | 
bl, // Block length 289 | (const uint8_t*)(rhs_native_mtx_qs4c32), // RHS 290 | NULL, // Bias 291 | rhs_packed_mtx_qs4c32, // RHS packed 292 | 0, &params); 293 | */ 294 | ref_matmul_f32_qs8d32_qs4c32( 295 | m, n, k, bl, (const int8_t*)lhs_ref_mtx_qa8d32, (const uint8_t*)rhs_native_mtx_qs4c32, (float*)dst_ref_mtx_f32, 296 | -FLT_MAX, FLT_MAX); 297 | 298 | // If the RHS matrix contains constant values, the packing can be performed 299 | // only once 300 | 301 | const size_t dst_stride = n * sizeof(float); 302 | const size_t lhs_offset = ukernel.get_lhs_packed_offset(0, k, bl); 303 | const size_t rhs_offset = ukernel.get_rhs_packed_offset(0, k, bl); 304 | const size_t dst_offset = ukernel.get_dst_offset(0, 0, dst_stride); 305 | 306 | const void* lhs_ptr = (const void*)((const char*)lhs_packed_mtx_qs8d32 + lhs_offset); 307 | const void* rhs_ptr = (const void*)((const char*)rhs_packed_mtx_qs4c32 + rhs_offset); 308 | float* dst_ptr = (float*)((uint8_t*)dst_act_mtx_f32 + dst_offset); 309 | 310 | auto start = std::chrono::high_resolution_clock::now(); 311 | while (true) { 312 | ukernel.run_matmul( 313 | m, n, k, bl, // Dimensions 314 | lhs_ptr, // LHS packed 315 | rhs_ptr, // RHS packed 316 | dst_ptr, // DST 317 | dst_stride, // DST stride (row) 318 | sizeof(float), // DST stride (col) 319 | -FLT_MAX, FLT_MAX // Min and max for the clamp operation 320 | ); 321 | } 322 | 323 | auto end = std::chrono::high_resolution_clock::now(); 324 | double duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count(); 325 | std::cout << "Time taken: " << duration << " milliseconds" << std::endl; 326 | 327 | return 0; 328 | } -------------------------------------------------------------------------------- /src/cpp/naive/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | # Set the project name 4 | project(NaiveBenchmark) 5 | 6 | # Specify C++11 7 | set(CMAKE_CXX_STANDARD 11) 8 |
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Add the executable
add_executable(benchmark_naive benchmark_naive.cpp kernel.cpp)
--------------------------------------------------------------------------------
/src/cpp/naive/benchmark_naive.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <vector>
#include <chrono>
#include "../common/sizes.cpp"
#include "kernel.h"

void loadMatrix(const std::string& filename, float* matrix, size_t rows, size_t cols) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Error: Could not open file " << filename << std::endl;
        return;
    }

    file.read(reinterpret_cast<char*>(matrix), rows * cols * sizeof(float));
    if (file.gcount() != static_cast<std::streamsize>(rows * cols * sizeof(float))) {
        std::cerr << "Error: Only " << file.gcount() << " bytes could be read." << std::endl;
    }
    file.close();
}

int main() {
    // Declare matrix dimensions
    const size_t activation_rows = 23, activation_cols = 3072;
    const size_t weight_rows = 3072, weight_cols = 32000;

    std::vector<float> X(activation_rows * activation_cols);
    std::vector<float> W(weight_rows * weight_cols);
    std::vector<float> Y(activation_rows * weight_cols, 0.0f); // Initialize with zeros

    loadMatrix("../../assets/x_fp32.bin", X.data(), activation_rows, activation_cols);
    loadMatrix("../../assets/w_fp32.bin", W.data(), weight_rows, weight_cols);

    auto start = std::chrono::high_resolution_clock::now();
    matrix_multiply_naive(X.data(), W.data(), Y.data(), activation_rows, activation_cols, weight_cols);
    auto end = std::chrono::high_resolution_clock::now();

    double duration = std::chrono::duration<double>(end - start).count();
    std::cout << "Time taken: " << duration << " seconds" << std::endl;

    return 0;
}
--------------------------------------------------------------------------------
/src/cpp/naive/kernel.h:
--------------------------------------------------------------------------------
#ifndef KERNEL_H
#define KERNEL_H

void matrix_multiply_naive(float* A, float* B, float* C, int M, int K, int N);

#endif
--------------------------------------------------------------------------------
/src/cpp/results/blas_f32_scaling_results.csv:
--------------------------------------------------------------------------------
Size,Latency(us)
32,14
64,111
128,836
256,6579
512,54471
1024,425776
2048,3.44762e+06
--------------------------------------------------------------------------------
/src/cpp/results/f16_scaling_results.csv:
--------------------------------------------------------------------------------
Size,Latency(us)
32,2
64,6
128,42
256,332
512,2651
1024,23228
2048,210452
--------------------------------------------------------------------------------
/src/cpp/results/f32_scaling_results.csv:
--------------------------------------------------------------------------------
Size,Latency(us)
32,4
64,17
128,94
256,729
512,6004
1024,47109
2048,427602
--------------------------------------------------------------------------------
/src/cpp/results/i8_dotprod_scaling_results.csv:
--------------------------------------------------------------------------------
Size,Latency(us)
32,3
64,17
128,103
256,636
512,5439
1024,50734
2048,424113
--------------------------------------------------------------------------------
/src/cpp/results/i8_i8mm_scaling_results.csv:
--------------------------------------------------------------------------------
Size,Latency(us)
32,3
64,11
128,46
256,267
512,2331
1024,21685
2048,167514
--------------------------------------------------------------------------------