├── .clang-format ├── .flake8 ├── .github └── workflows │ └── doxygen.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── Doxyfile ├── LICENSE ├── README.md ├── assets ├── figures │ ├── chat.gif │ ├── chat_demo_gpu.gif │ ├── chat_demo_m1.gif │ ├── coding_demo_gpu.gif │ ├── coding_demo_m1.gif │ ├── overview.png │ ├── smoothquant_intuition.png │ ├── tinychat_logo.png │ ├── vlm_demo │ │ ├── CPR.jpg │ │ ├── Wall_fissure.png │ │ ├── animal_blocking.png │ │ ├── car.png │ │ ├── pedestrian.png │ │ ├── statue.jpg │ │ └── windmill_people.png │ └── vlm_demo_m1.gif └── slides.pdf ├── kernels ├── avx │ ├── matmul_avx_fp32.cc │ ├── matmul_avx_int4.cc │ ├── matmul_avx_int8.cc │ └── matmul_avx_int8_int4.cc ├── cuda │ ├── gemv_cuda.cu │ ├── matmul_int4.cu │ ├── matmul_ref_fp32.cc │ └── matmul_ref_int8.cc ├── matmul.h ├── matmul_imp.cc ├── matmul_int4.cc ├── matmul_int8.cc ├── metal │ ├── Makefile │ ├── download_metal-cpp.sh │ ├── include │ │ ├── MetalMatmulInt4.hpp │ │ └── opParams.h │ ├── kernel │ │ └── op.metal │ ├── matmul_metal_int4.cc │ ├── matmul_metal_int4_imp.cc │ ├── matmul_metal_int4_imp.h │ ├── matmul_ref_fp32.cc │ ├── matmul_ref_int8.cc │ └── src │ │ └── MetalMatmulInt4.cpp ├── neon │ ├── matmul_neon_fp32.cc │ ├── matmul_neon_int4.cc │ ├── matmul_neon_int4_offset.cc │ ├── matmul_neon_int8_int4.cc │ └── matmul_ref_int8.cc ├── pthread_pool.cc ├── pthread_pool.h └── ref │ ├── matmul_ref_fp32.cc │ ├── matmul_ref_int4.cc │ └── matmul_ref_int8.cc ├── llm ├── Makefile ├── application │ ├── README.md │ ├── chat.cc │ └── sts_utils │ │ ├── clean_up.patch │ │ ├── listen │ │ └── speak ├── chat_llama2-13b ├── chat_llama2-7b ├── code ├── half-2.2.0 │ └── include │ │ ├── README.md │ │ └── half.hpp ├── include │ ├── GPTBigCodeTokenizer.h │ ├── Generate.h │ ├── LLaMATokenizer.h │ ├── OPTTokenizer.h │ ├── common.h │ ├── interface.h │ ├── model.h │ ├── nn_modules │ │ ├── Fp32CLIPAttention.h │ │ ├── Fp32CLIPEncoder.h │ │ ├── Fp32CLIPEncoderLayer.h │ │ ├── Fp32CLIPVisionTransformer.h │ │ ├── Fp32GPTBigCodeAttention.h │ │ ├── Fp32GPTBigCodeDecoder.h │ │ ├── Fp32GPTBigCodeDecoderLayer.h │ │ ├── Fp32GPTBigCodeForCausalLM.h │ │ ├── Fp32OPTAttention.h │ │ ├── Fp32OPTDecoder.h │ │ ├── Fp32OPTDecoderLayer.h │ │ ├── Fp32OPTForCausalLM.h │ │ ├── Fp32llamaAttention.h │ │ ├── Fp32llamaDecoder.h │ │ ├── Fp32llamaDecoderLayer.h │ │ ├── Fp32llamaForCausalLM.h │ │ ├── Int4GPTBigCodeAttention.h │ │ ├── Int4GPTBigCodeDecoder.h │ │ ├── Int4GPTBigCodeDecoderLayer.h │ │ ├── Int4GPTBigCodeForCausalLM.h │ │ ├── Int4OPTAttention.h │ │ ├── Int4OPTDecoder.h │ │ ├── Int4OPTDecoderLayer.h │ │ ├── Int4OPTForCausalLM.h │ │ ├── Int4llamaAttention.h │ │ ├── Int4llamaDecoder.h │ │ ├── Int4llamaDecoderLayer.h │ │ ├── Int4llamaForCausalLM.h │ │ ├── Int8OPTAttention.h │ │ ├── Int8OPTDecoder.h │ │ ├── Int8OPTDecoderLayer.h │ │ └── OPTForCausalLM.h │ ├── operators.h │ ├── ops │ │ ├── BMM_F32T.h │ │ ├── BMM_S8T_S8N_F32T.h │ │ ├── BMM_S8T_S8N_S8T.h │ │ ├── Conv2D.h │ │ ├── Embedding.h │ │ ├── Gelu.h │ │ ├── LayerNorm.h │ │ ├── LayerNormQ.h │ │ ├── LlamaRMSNorm.h │ │ ├── RotaryPosEmb.h │ │ ├── W8A8B8O8Linear.h │ │ ├── W8A8B8O8LinearReLU.h │ │ ├── W8A8BFP32OFP32Linear.h │ │ ├── arg_max.h │ │ ├── cuda │ │ │ ├── BMM_F16T.cuh │ │ │ ├── Embedding.cuh │ │ │ ├── LlamaRMSNorm.cuh │ │ │ ├── RotaryPosEmb.cuh │ │ │ └── reduction.cuh │ │ └── linear.h │ ├── profiler.h │ ├── stb_image.h │ └── utils.h ├── mistral ├── models │ ├── llama3_vocab.bin │ ├── llama_vocab.bin │ ├── mistral_vocab.bin │ ├── opt_merges.txt │ ├── opt_vocab.json │ └── 
starcoder_vocab.bin ├── scripts │ ├── chat-13b.sh │ ├── chat.sh │ ├── code.sh │ ├── llava.sh │ ├── vila.sh │ ├── voice_llava.sh │ ├── voice_vila.sh │ └── voicechat.sh ├── src │ ├── GPTBigCodeGenerate.cc │ ├── GPTBigCodeTokenizer.cc │ ├── Generate.cc │ ├── LLaMATokenizer.cc │ ├── OPTGenerate.cc │ ├── OPTTokenizer.cc │ ├── interface.cc │ ├── nn_modules │ │ ├── Fp32CLIPAttention.cc │ │ ├── Fp32CLIPEncoder.cc │ │ ├── Fp32CLIPEncoderLayer.cc │ │ ├── Fp32CLIPVisionTransformer.cc │ │ ├── Fp32GPTBigCodeAttention.cc │ │ ├── Fp32GPTBigCodeDecoder.cc │ │ ├── Fp32GPTBigCodeDecoderLayer.cc │ │ ├── Fp32GPTBigCodeForCausalLM.cc │ │ ├── Fp32OPTAttention.cc │ │ ├── Fp32OPTDecoder.cc │ │ ├── Fp32OPTDecoderLayer.cc │ │ ├── Fp32OPTForCausalLM.cc │ │ ├── Fp32llamaAttention.cc │ │ ├── Fp32llamaDecoder.cc │ │ ├── Fp32llamaDecoderLayer.cc │ │ ├── Fp32llamaForCausalLM.cc │ │ ├── Int4GPTBigCodeAttention.cc │ │ ├── Int4GPTBigCodeDecoder.cc │ │ ├── Int4GPTBigCodeDecoderLayer.cc │ │ ├── Int4GPTBigCodeForCausalLM.cc │ │ ├── Int4OPTAttention.cc │ │ ├── Int4OPTDecoder.cc │ │ ├── Int4OPTDecoderLayer.cc │ │ ├── Int4OPTForCausalLM.cc │ │ ├── Int8OPTAttention.cc │ │ ├── Int8OPTDecoder.cc │ │ ├── Int8OPTDecoderLayer.cc │ │ ├── OPTForCausalLM.cc │ │ ├── cuda │ │ │ ├── Int4llamaAttention.cu │ │ │ ├── Int4llamaDecoder.cu │ │ │ ├── Int4llamaDecoderLayer.cu │ │ │ ├── Int4llamaForCausalLM.cu │ │ │ ├── LLaMA3Generate.cu │ │ │ ├── LLaMAGenerate.cu │ │ │ ├── LLaVAGenerate.cu │ │ │ ├── MistralGenerate.cu │ │ │ └── utils.cu │ │ └── non_cuda │ │ │ ├── Int4llamaAttention.cc │ │ │ ├── Int4llamaDecoder.cc │ │ │ ├── Int4llamaDecoderLayer.cc │ │ │ ├── Int4llamaForCausalLM.cc │ │ │ ├── LLaMA3Generate.cc │ │ │ ├── LLaMAGenerate.cc │ │ │ ├── LLaVAGenerate.cc │ │ │ └── MistralGenerate.cc │ ├── ops │ │ ├── BMM_F32T.cc │ │ ├── BMM_S8T_S8N_F32T.cc │ │ ├── BMM_S8T_S8N_S8T.cc │ │ ├── Conv2D.cc │ │ ├── Gelu.cc │ │ ├── LayerNorm.cc │ │ ├── LayerNormQ.cc │ │ ├── LlamaRMSNorm.cc │ │ ├── RotaryPosEmb.cc │ │ ├── W8A8B8O8Linear.cc │ │ ├── W8A8B8O8LinearReLU.cc │ │ ├── W8A8BFP32OFP32Linear.cc │ │ ├── arg_max.cc │ │ ├── batch_add.cc │ │ ├── cuda │ │ │ ├── BMM_F16T.cu │ │ │ ├── LlamaRMSNorm.cu │ │ │ ├── RotaryPosEmb.cu │ │ │ ├── batch_add.cu │ │ │ ├── embedding.cu │ │ │ ├── linear.cu │ │ │ └── softmax.cu │ │ ├── embedding.cc │ │ ├── linear.cc │ │ └── softmax.cc │ └── utils.cc ├── tests │ ├── cuda │ │ ├── test_Int4llamaAttention.cu │ │ ├── test_Int4llamaDecoder.cu │ │ ├── test_Int4llamaDecoderLayer.cu │ │ ├── test_Int4llamaForCausalLM.cu │ │ └── test_ops.cu │ ├── non_cuda │ │ ├── test_Int4llamaAttention.cc │ │ ├── test_Int4llamaDecoder.cc │ │ ├── test_Int4llamaDecoderLayer.cc │ │ ├── test_Int4llamaForCausalLM.cc │ │ └── test_ops.cc │ ├── test_Fp32OPTAttention.cc │ ├── test_Fp32OPTDecoder.cc │ ├── test_Fp32OPTDecoderLayer.cc │ ├── test_Fp32OPTForCausalLM.cc │ ├── test_Fp32llamaAttention.cc │ ├── test_Fp32llamaDecoder.cc │ ├── test_Fp32llamaDecoderLayer.cc │ ├── test_Fp32llamaForCausalLM.cc │ ├── test_Int8OPTAttention.cc │ ├── test_Int8OPTDecoder.cc │ ├── test_Int8OPTDecoderLayer.cc │ ├── test_LLaMATokenizer.cc │ ├── test_OPTForCausalLM.cc │ ├── test_OPTGenerate.cc │ ├── test_OPTTokenizer.cc │ └── utils_memalloc.h ├── tools │ ├── clip_exporter.py │ ├── copy_rotary_emb.sh │ ├── download_assets.sh │ ├── download_model.py │ ├── download_model_from_dropbox.py │ ├── export_model.sh │ ├── llama3_exporter.py │ ├── llama_exporter.py │ ├── llama_qkv_merger.py │ ├── llava_exporter.py │ ├── mistral_exporter.py │ ├── model_quantizer.py │ ├── opt_smooth_exporter.py │ ├── 
profile.sh │ ├── quantize_and_upload.py │ ├── quantize_constants.py │ ├── quantize_methods.py │ ├── rotary_emb_exporter.py │ ├── starcoder_exporter.py │ ├── test.sh │ ├── upload.py │ ├── upload_to_dropbox.py │ ├── vila_exporter.py │ └── zip_assets.sh ├── vila ├── vila_2.7b ├── voice_mistral ├── voice_vila └── voicechat_setup.sh ├── pyproject.toml └── requirements.txt /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | ContinuationIndentWidth: 4 4 | IndentWidth: 4 5 | TabWidth: 4 6 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /.github/workflows/doxygen.yml: -------------------------------------------------------------------------------- 1 | name: Generate and Deploy Doxygen Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Doxygen Action 16 | uses: mattnotmitt/doxygen-action@v1.1.0 17 | with: 18 | doxyfile-path: "./Doxyfile" # default is ./Doxyfile 19 | working-directory: "." # default is . 20 | 21 | - name: Deploy 22 | uses: peaceiris/actions-gh-pages@v3 23 | with: 24 | github_token: ${{ secrets.GITHUB_TOKEN }} 25 | publish_dir: ./docs 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | *.pyc 4 | *.cuu 5 | *.ccc 6 | .DS_Store 7 | .build/ 8 | .cache/ 9 | .direnv/ 10 | .envrc 11 | .swiftpm 12 | .venv 13 | .vs/ 14 | .vscode/ 15 | 16 | llm/assets/ 17 | models/ 18 | *.bin 19 | !llama_vocab.bin 20 | !starcoder_vocab.bin 21 | !mistral_vocab.bin 22 | !llama3_vocab.bin 23 | *.zip 24 | *.txt 25 | !requirements.txt 26 | *.pt 27 | *.json 28 | test_* 29 | !test_*.cc 30 | !test_*.cu 31 | demo 32 | chat 33 | voicechat 34 | profile_* 35 | !profile_*.cc 36 | libtorch/ 37 | checkpoints/ 38 | 39 | output.wav 40 | tmpfile 41 | TTS/ 42 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "json"] 2 | path = json 3 | url = https://github.com/nlohmann/json 4 | [submodule "transformer/json"] 5 | path = llm/json 6 | url = https://github.com/nlohmann/json 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", 
"pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | - id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MIT HAN Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/figures/chat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat.gif -------------------------------------------------------------------------------- /assets/figures/chat_demo_gpu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat_demo_gpu.gif -------------------------------------------------------------------------------- /assets/figures/chat_demo_m1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/chat_demo_m1.gif -------------------------------------------------------------------------------- /assets/figures/coding_demo_gpu.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/coding_demo_gpu.gif -------------------------------------------------------------------------------- /assets/figures/coding_demo_m1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/coding_demo_m1.gif -------------------------------------------------------------------------------- /assets/figures/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/overview.png -------------------------------------------------------------------------------- /assets/figures/smoothquant_intuition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/smoothquant_intuition.png -------------------------------------------------------------------------------- /assets/figures/tinychat_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/tinychat_logo.png -------------------------------------------------------------------------------- /assets/figures/vlm_demo/CPR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/CPR.jpg -------------------------------------------------------------------------------- /assets/figures/vlm_demo/Wall_fissure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/Wall_fissure.png -------------------------------------------------------------------------------- /assets/figures/vlm_demo/animal_blocking.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/animal_blocking.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/car.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/car.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/pedestrian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/pedestrian.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/statue.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/statue.jpg
--------------------------------------------------------------------------------
/assets/figures/vlm_demo/windmill_people.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo/windmill_people.png
--------------------------------------------------------------------------------
/assets/figures/vlm_demo_m1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/figures/vlm_demo_m1.gif
--------------------------------------------------------------------------------
/assets/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/assets/slides.pdf
--------------------------------------------------------------------------------
/kernels/cuda/matmul_int4.cu:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | 4 | #include "../matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_fp16_int4(const struct matmul_params *params) { 9 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 10 | const int block_size = params->block_size; 11 | // CHECK_MATRICES_int4weight(A, B, C); 12 | 13 | naive_float16_t weight; 14 | for (int i = 0; i < C->row; i++) { 15 | for (int j = 0; j < C->column; j++) { 16 | naive_float16_t acc = (naive_float16_t)0.0; 17 | 18 | for (int k = 0; k < B->row; k++) { 19 | naive_float16_t s = params->fp16_scales[(k / block_size) * C->column + j]; 20 | naive_float16_t z = static_cast<naive_float16_t>(8.0f); // TODO: support dynamic zeropoint 21 | naive_float16_t input = A->fp16_data_ptr[i * A->column + k]; 22 | 23 | // order of weights is 0 2 4 6 1 3 5 7: each int32 packs eight 4-bit weights, with even-indexed weights in the low 16 bits and odd-indexed weights in the high 16 bits 24 | if (j % 8 == 0) 25 | weight = ((naive_float16_t)(B->int32_data_ptr[k * B->column + (j / 8)] & 0x0000000F) - z) * s; 26 | else if (j % 8 == 1) 27 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x000F0000) >> 16) - z) * s; 28 | else if (j % 8 == 2) 29 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x000000F0) >> 4) - z) * s; 30 | else if (j % 8 == 3) 31 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x00F00000) >> 20) - z) * s; 32 | else if (j % 8 == 4) 33 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x00000F00) >> 8) - z) * s; 34 | else if (j % 8 == 5) 35 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x0F000000) >> 24) - z) * s; 36 | else if (j % 8 == 6) 37 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0x0000F000) >> 12) - z) * s; 38 | else if (j % 8 == 7) 39 | weight = ((naive_float16_t)((B->int32_data_ptr[k * B->column + (j / 8)] & 0xF0000000) >> 28) - z) * s; 40 | 41 | acc += input * weight; 42 | // printf("naive_mat_mul_fp16_int4 - s: %f, input: %f, weight: %f, acc: %f\n", static_cast<float>(s), static_cast<float>(input), static_cast<float>(weight), static_cast<float>(acc)); 43 | } 44 | 45 | C->fp16_data_ptr[i * C->column + j] = acc; 46 | } 47 | } 48 | } 49 | 50 | } // namespace matmul 51 |
--------------------------------------------------------------------------------
/kernels/cuda/matmul_ref_fp32.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "../matmul.h" 8 | 9 | namespace matmul { 10 | void fp32_ref_matmul(const struct matmul_params *params) { 11 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 12 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 13 | 14 | assert(A->column == B->row); 15 | assert(C->row == A->row); 16 | assert(C->column == B->column); 17 | int m = A->row, n = B->column, k = A->column; 18 | 19 | for (int i = 0; i < m; i++) { 20 | for (int j = 0; j < n; j++) { 21 | float acc = 0; 22 | for (int kk = 0; kk < k; kk++) { 23 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 24 | } 25 | 26 | data_C[i * n + j] = acc; 27 | } 28 | } 29 | } 30 | 31 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 32 | fp32_ref_matmul(params); 33 | } 34 | 35 | } // namespace matmul 36 |
--------------------------------------------------------------------------------
/kernels/matmul_imp.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "matmul.h" 9 | 10 | namespace matmul { 11 | 12 | void MatmulOperator::CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 13 | assert(A->column == B->row); 14 | assert(C->column == B->column); 15 | assert(C->row == A->row); 16 | } 17 | 18 | void MatmulOperator::CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 19 | assert(B->row * B->column == A->column * C->column / 2); 20 | assert(C->row == A->row); 21 | } 22 | 23 | void MatmulOperator::mat_mul_transposed(const struct matmul_params *params) { 24 | int i, j, k; 25 | 26 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 27 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 28 | 29 | for (i = 0; i < C->row; i++) 30 | for (j = 0; j < C->column; j++) { 31 | float acc = 0; 32 | for (k = 0; k < A->column; k++) acc += data_A[i * A->column + k] * data_B[j * B->column + k]; 33 | data_C[i * C->column + j] = acc; 34 | } 35 | } 36 | 37 | float interval_to_ms(struct timeval *start, struct timeval *end) { 38 | float us_seconds = (end->tv_sec - start->tv_sec) * 1000000 + (end->tv_usec - start->tv_usec); 39 | return
us_seconds / 1000; 40 | } 41 | 42 | } // namespace matmul 43 | -------------------------------------------------------------------------------- /kernels/matmul_int8.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_int8(const struct matmul_params *params) { 9 | int i, j, k; 10 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 11 | int32_t A_zp = A->qparams.zero_point, C_zp = C->qparams.zero_point; 12 | float A_sc = A->qparams.scale, B_sc = B->qparams.scale, C_sc = C->qparams.scale; 13 | float effective_scale = A_sc * B_sc / C_sc; 14 | int8_t *data_A = A->int8_data_ptr, *data_B = B->int8_data_ptr, *data_C = C->int8_data_ptr; 15 | const int8_t q_min = C->qparams.q_min, q_max = C->qparams.q_max; 16 | CHECK_MATRICES(A, B, C); 17 | 18 | for (i = 0; i < C->row; i++) 19 | for (j = 0; j < C->column; j++) { 20 | int acc = 0; 21 | for (k = 0; k < A->column; k++) 22 | acc += ((int32_t)data_A[i * A->column + k] - A_zp) * data_B[k * B->column + j]; 23 | 24 | acc = (int32_t)((float)acc * effective_scale); 25 | acc -= C_zp; 26 | acc = MAX(acc, q_min); 27 | acc = MIN(acc, q_max); 28 | data_C[i * C->column + j] = (int8_t)acc; 29 | } 30 | } 31 | } // namespace matmul 32 | -------------------------------------------------------------------------------- /kernels/metal/Makefile: -------------------------------------------------------------------------------- 1 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 2 | CXXFLAGS = -std=c++17 -stdlib=libc++ -O3 3 | 4 | # Executable and source files 5 | TEST_TARGET = benchmark 6 | TARGET = $(TEST_TARGET) 7 | KERNEL_SRC = $(wildcard ./src/*.cpp) 8 | 9 | SRC = $(KERNEL_SRC) 10 | INCLUDE_DIRS = -I./metal-cpp -I./include 11 | LIB = -framework Metal -framework Foundation -framework MetalKit 12 | 13 | 14 | # Default target 15 | all: $(TARGET) 16 | 17 | # Linking 18 | benchmark: build_metallib 19 | $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o benchmark.x app/main.cpp $(SRC) $(LIB) $(LDFLAGS) 20 | 21 | build_air: 22 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(INCLUDE_DIRS) -c kernel/op.metal -o library.air 23 | 24 | build_metallib: build_air 25 | xcrun -sdk macosx metallib library.air -o default.metallib 26 | 27 | # Clean up 28 | clean: 29 | rm -f benchmark.x library.air library.metallib default.metallib 30 | -------------------------------------------------------------------------------- /kernels/metal/download_metal-cpp.sh: -------------------------------------------------------------------------------- 1 | wget https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13_iOS16.zip 2 | unzip metal-cpp_macOS13_iOS16.zip 3 | -------------------------------------------------------------------------------- /kernels/metal/include/MetalMatmulInt4.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Foundation/Foundation.hpp" 4 | #include "Metal/Metal.hpp" 5 | #include "opParams.h" 6 | 7 | class MetalMatmulInt4 { 8 | public: 9 | MTL::Device *_mDevice; 10 | 11 | // The compute pipeline generated from the compute kernel in the .metal shader file. 12 | MTL::ComputePipelineState *_mMatmulFunctionPSO; 13 | 14 | // The command queue used to pass commands to the device. 15 | MTL::CommandQueue *_mCommandQueue; 16 | 17 | // Buffers to hold data. 
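// A holds the fp32 activations, B the packed int4 weights, Scales the per-group
// dequantization scales, and Result the fp32 output; Params carries the
// MetalMatMulParams (m, n, k, group_size) defined in opParams.h. (Buffer roles
// are inferred from the int4 matmul kernels in this directory.)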
18 | MTL::Buffer *_mBufferA; 19 | MTL::Buffer *_mBufferB; 20 | MTL::Buffer *_mBufferScales; 21 | MTL::Buffer *_mBufferResult; 22 | MTL::Buffer *_mParams; 23 | 24 | // Matmul params 25 | MetalMatMulParams *_mParamsPtr; 26 | 27 | MetalMatmulInt4(MTL::Device *device, MetalMatMulParams param); 28 | ~MetalMatmulInt4(); 29 | 30 | void prepareData(); 31 | void sendComputeCommand(); 32 | void verifyResults(); 33 | 34 | private: 35 | void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 36 | void generateRandomFloatData(MTL::Buffer *buffer, int length); 37 | void generateRandomIn4Data(MTL::Buffer *buffer, int length); 38 | }; 39 | -------------------------------------------------------------------------------- /kernels/metal/include/opParams.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | unsigned int m; 5 | unsigned int n; 6 | unsigned int k; 7 | unsigned int group_size; 8 | } MetalMatMulParams; 9 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4_imp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "Foundation/Foundation.hpp" 6 | #include "Metal/Metal.hpp" 7 | #include "include/opParams.h" 8 | 9 | typedef struct { 10 | float *A, *C, *scales, *offset; 11 | unsigned char *B; 12 | } MetalMatmulBuffers; 13 | 14 | class MetalMatmulInt4IMP { 15 | public: 16 | static MTL::Device *_mDevice; 17 | 18 | // The compute pipeline generated from the compute kernel in the .metal shader file. 19 | static MTL::ComputePipelineState *_mMatmulFunctionPSO; 20 | 21 | // The command queue used to pass commands to the device. 22 | static MTL::CommandQueue *_mCommandQueue; 23 | 24 | // Buffers to hold data. 
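// Static members: a single Metal device/pipeline/command queue is reused across
// calls, and host pointers are mapped to their MTL::Buffer objects through
// _mumap (see getBufferfromPtr() and allocateSharedMem() below).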
25 | static MTL::Buffer *_mBufferA; 26 | static MTL::Buffer *_mBufferB; 27 | static MTL::Buffer *_mBufferScales; 28 | static MTL::Buffer *_mBufferResult; 29 | static MTL::Buffer *_mParams; 30 | 31 | static std::unordered_map _mumap; 32 | 33 | static bool has_init; 34 | static void init(); 35 | static void run(MetalMatMulParams param, MetalMatmulBuffers *bufferParams); 36 | static void *allocateSharedMem(size_t size); 37 | 38 | static MetalMatMulParams *_mParamsPtr; 39 | static void sendComputeCommand(); 40 | static void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 41 | static MTL::Buffer *getBufferfromPtr(void *ptr); 42 | }; 43 | -------------------------------------------------------------------------------- /kernels/metal/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/pthread_pool.cc: -------------------------------------------------------------------------------- 1 | #include "pthread_pool.h" 2 | #include 3 | #include 4 | #include 5 | 6 | struct pool_queue { 7 | void *arg; 8 | char free; 9 | struct pool_queue *next; 10 | }; 11 | 12 | struct pool { 13 | char cancelled; 14 | void *(*fn)(void *); 15 | unsigned int remaining; 16 | unsigned int nthreads; 17 | struct pool_queue *q; 18 | struct pool_queue *end; 19 | pthread_mutex_t q_mtx; 20 | pthread_cond_t q_cnd; 21 | pthread_t threads[1]; 22 | }; 23 | 24 | static void * thread(void *arg); 25 | 26 | void * pool_start(void * (*thread_func)(void *), unsigned int threads) { 27 | struct pool *p = (struct pool *) malloc(sizeof(struct pool) + (threads-1) * sizeof(pthread_t)); 28 | int i; 29 | 30 | pthread_mutex_init(&p->q_mtx, NULL); 31 | pthread_cond_init(&p->q_cnd, NULL); 32 | p->nthreads = threads; 33 | p->fn = thread_func; 34 | p->cancelled = 0; 35 | p->remaining = 0; 36 | p->end = NULL; 37 | p->q = NULL; 38 | 39 | for (i = 0; i < threads; i++) { 40 | pthread_create(&p->threads[i], NULL, &thread, p); 41 | } 42 | 43 | return p; 44 | } 45 | 46 | void pool_enqueue(void *pool, void *arg, char free) { 47 | struct pool *p = (struct pool *) pool; 48 | struct pool_queue *q = (struct pool_queue *) malloc(sizeof(struct pool_queue)); 49 | q->arg = arg; 50 | q->next = NULL; 51 | q->free = free; 52 | 53 | pthread_mutex_lock(&p->q_mtx); 54 | if (p->end != NULL) p->end->next = q; 55 | if (p->q == NULL) p->q = q; 56 | p->end = q; 57 | p->remaining++; 58 | pthread_cond_signal(&p->q_cnd); 59 | pthread_mutex_unlock(&p->q_mtx); 60 | } 61 | 62 | void pool_wait(void *pool) { 63 | 
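// Block until the queue drains: sleep on q_cnd, which worker threads broadcast
// after finishing a task and decrementing `remaining` (see thread() below).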
struct pool *p = (struct pool *) pool; 64 | 65 | pthread_mutex_lock(&p->q_mtx); 66 | while (!p->cancelled && p->remaining) { 67 | pthread_cond_wait(&p->q_cnd, &p->q_mtx); 68 | } 69 | pthread_mutex_unlock(&p->q_mtx); 70 | } 71 | 72 | void pool_end(void *pool) { 73 | struct pool *p = (struct pool *) pool; 74 | struct pool_queue *q; 75 | int i; 76 | 77 | p->cancelled = 1; 78 | 79 | pthread_mutex_lock(&p->q_mtx); 80 | pthread_cond_broadcast(&p->q_cnd); 81 | pthread_mutex_unlock(&p->q_mtx); 82 | 83 | for (i = 0; i < p->nthreads; i++) { 84 | pthread_join(p->threads[i], NULL); 85 | } 86 | 87 | while (p->q != NULL) { 88 | q = p->q; 89 | p->q = q->next; 90 | 91 | if (q->free) free(q->arg); 92 | free(q); 93 | } 94 | 95 | free(p); 96 | } 97 | 98 | static void * thread(void *arg) { 99 | struct pool_queue *q; 100 | struct pool *p = (struct pool *) arg; 101 | 102 | while (!p->cancelled) { 103 | pthread_mutex_lock(&p->q_mtx); 104 | while (!p->cancelled && p->q == NULL) { 105 | pthread_cond_wait(&p->q_cnd, &p->q_mtx); 106 | } 107 | if (p->cancelled) { 108 | pthread_mutex_unlock(&p->q_mtx); 109 | return NULL; 110 | } 111 | q = p->q; 112 | p->q = q->next; 113 | p->end = (q == p->end ? NULL : p->end); 114 | pthread_mutex_unlock(&p->q_mtx); 115 | 116 | p->fn(q->arg); 117 | 118 | if (q->free) free(q->arg); 119 | free(q); 120 | q = NULL; 121 | 122 | pthread_mutex_lock(&p->q_mtx); 123 | p->remaining--; 124 | pthread_cond_broadcast(&p->q_cnd); 125 | pthread_mutex_unlock(&p->q_mtx); 126 | } 127 | 128 | return NULL; 129 | } 130 |
--------------------------------------------------------------------------------
/kernels/pthread_pool.h:
--------------------------------------------------------------------------------
1 | /** \file 2 | * This file provides prototypes for an implementation of a pthread pool. 3 | */ 4 | 5 | #ifndef __PTHREAD_POOL_H__ #define __PTHREAD_POOL_H__ 6 | /** 7 | * Create a new thread pool. 8 | * 9 | * New tasks should be enqueued with pool_enqueue. thread_func will be called 10 | * once per queued task with its sole argument being the argument given to 11 | * pool_enqueue. 12 | * 13 | * \param thread_func The function executed by each thread for each work item. 14 | * \param threads The number of threads in the pool. 15 | * \return A pointer to the thread pool. 16 | */ 17 | void * pool_start(void * (*thread_func)(void *), unsigned int threads); 18 | 19 | /** 20 | * Enqueue a new task for the thread pool. 21 | * 22 | * \param pool A thread pool returned by pool_start. 23 | * \param arg The argument to pass to the thread worker function. 24 | * \param free If true, the argument will be freed after the task has completed. 25 | */ 26 | void pool_enqueue(void *pool, void *arg, char free); 27 | 28 | /** 29 | * Wait for all queued tasks to be completed. 30 | */ 31 | void pool_wait(void *pool); 32 | 33 | /** 34 | * Stop all threads in the pool. 35 | * 36 | * Note that this function will block until all threads have terminated. 37 | * All queued items will also be freed, along with the pool itself. 38 | * Remaining work item arguments will be freed depending on the free argument to 39 | * pool_enqueue.
40 | */ 41 | void pool_end(void *pool); 42 | 43 | #endif /* __PTHREAD_POOL_H__ */ 44 |
--------------------------------------------------------------------------------
/kernels/ref/matmul_ref_fp32.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 |
--------------------------------------------------------------------------------
/kernels/ref/matmul_ref_int4.cc:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 12 | int i, j, k; 13 | const struct matrix *A = &params->A, *B = &params->B, *C = &params->C; 14 | const int block_size = params->block_size; 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | assert(params->block_size == 32); // only block size 32 is supported for now 18 | 19 | for (i = 0; i < C->row; i++) { 20 | for (j = 0; j < C->column; j++) { 21 | float acc = 0; 22 | for (k = 0; k < B->row; k += block_size) { 23 | float s = scale[j * (B->row / 16) + k / 32]; // B->row / 16: B is packed with two 4-bit weights per byte 24 | float o = offset[j * (B->row / 16) + k / 32]; 25 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 26 | float *x_ptr = &A->data_ptr[i * A->column + k]; 27 | for (int qi = 0; qi < block_size / 2; qi++) { 28 | uint8_t packed_int4 = weight_32_int4[qi]; 29 | float deq_0 = (float)(packed_int4 & 0x0F) * s + o; 30 | float deq_1 = (float)(packed_int4 >> 4) * s + o; 31 | acc += *x_ptr++ * deq_0; 32 | acc += *x_ptr++ * deq_1; 33 | } 34 | } 35 | C->data_ptr[i * C->column + j] = acc; 36 | } 37 | } 38 | }; 39 | 40 | } // namespace matmul 41 |
--------------------------------------------------------------------------------
/llm/application/README.md:
--------------------------------------------------------------------------------
1 | ## Demo video of our speech-to-speech chatbot 2 | 3 | - Please find the speech-to-speech demo video using TinyChatEngine [here](https://youtu.be/Bw5Dm3aWMnA?si=CCvZDmq3HwowEQcC). 4 | 5 | ## Instructions to run a speech-to-speech chatbot demo 6 | 7 | - Follow the [instructions](../../README.md) to download and deploy LLaMA2-7B-chat. 8 | 9 | - Configure whisper.cpp. You may need to update the Makefile and ggml.h files of whisper.cpp to get it running. For related issues, please refer to the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repository.
10 | 11 | ```bash 12 | # Get whisper.cpp for speech recognition 13 | cd llm 14 | git clone https://github.com/ggerganov/whisper.cpp 15 | cd whisper.cpp 16 | git checkout a4bb2df 17 | 18 | # Install SDL2 on Linux 19 | sudo apt-get install libsdl2-dev 20 | # Install SDL2 on macOS 21 | brew install sdl2 22 | 23 | git apply ../application/sts_utils/clean_up.patch 24 | bash ./models/download-ggml-model.sh base.en 25 | # NVIDIA GPU (Note: you may need to change the Makefile of whisper.cpp depending on your environment or device) 26 | WHISPER_CUBLAS=1 make -j stream 27 | # Otherwise 28 | make stream 29 | cd ../ 30 | ``` 31 | 32 | - If you have an edge device and want a better TTS program than espeak, download [piper](https://github.com/rhasspy/piper). 33 | 34 | ```bash 35 | mkdir TTS 36 | cd TTS 37 | wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz 38 | tar -xvzf piper_arm64.tar.gz 39 | ``` 40 | 41 | - Download your preferred voice from the [Hugging Face repo](https://huggingface.co/rhasspy/piper-voices/tree/v1.0.0) and drag both the .onnx and .onnx.json files into the TTS directory. 42 | 43 | - Edit the listen shell script under application/sts_utils so that whisper.cpp uses your preferred parameters. 44 | 45 | ```bash 46 | nano application/sts_utils/listen 47 | ``` 48 | 49 | - Edit the speak shell script under application/sts_utils so that the demo uses your preferred TTS program. 50 | 51 | ```bash 52 | nano application/sts_utils/speak 53 | ``` 54 | 55 | - Test each of the submodules to ensure they are working as intended. 56 | 57 | ```bash 58 | ./application/sts_utils/listen 59 | cat tmpfile 60 | ./application/sts_utils/speak hello 61 | ``` 62 | 63 | - Compile and start the voicechat locally. 64 | 65 | ```bash 66 | make -j chat 67 | ./chat -v # chat.exe -v on Windows 68 | ``` 69 |
--------------------------------------------------------------------------------
/llm/application/sts_utils/listen:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | < tmpfile 29 |
--------------------------------------------------------------------------------
/llm/application/sts_utils/speak:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | # Usage: 4 | # speak.sh <text> 5 | 6 | # espeak 7 | # macOS: brew install espeak 8 | # Linux: apt-get install espeak 9 | # 10 | #espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$1" 11 | 12 | # for Mac 13 | say "$1" 14 | 15 | # for edge devices 16 | # echo "$1" | ./TTS/piper/piper --model ./TTS/en_US-ryan-low.onnx --output_file output.wav && aplay output.wav 17 | 18 | # Eleven Labs 19 | # To use it, install the elevenlabs module from pip (pip install elevenlabs) 20 | # It's possible to use the API for free with a limited number of characters.
To increase this limit, register at https://beta.elevenlabs.io to get an API key and paste it after 'ELEVEN_API_KEY='. 21 | # Keep the line commented to use the free version without an API key. 22 | # 23 | #export ELEVEN_API_KEY=your_api_key 24 | #wd=$(dirname $0) 25 | #script=$wd/eleven-labs.py 26 | #python3 $script $1 "$1" >/dev/null 2>&1 27 | #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 28 |
--------------------------------------------------------------------------------
/llm/chat_llama2-13b:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat LLaMA2_13B_chat INT4 5 3 |
--------------------------------------------------------------------------------
/llm/chat_llama2-7b:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat LLaMA2_7B_chat INT4 5 3 |
--------------------------------------------------------------------------------
/llm/code:
--------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat CodeLLaMA_7B_Instruct INT4 5 3 |
--------------------------------------------------------------------------------
/llm/half-2.2.0/include/README.md:
--------------------------------------------------------------------------------
1 | This is the IEEE 754-based half-precision floating-point library by Christian Rau: https://half.sourceforge.net/index.html. 2 |
--------------------------------------------------------------------------------
/llm/include/GPTBigCodeTokenizer.h:
--------------------------------------------------------------------------------
1 | /* 2 | 3 | Adapted from llama.cpp and starcoder.cpp: 4 | https://github.com/ggerganov/llama.cpp 5 | https://github.com/bigcode-project/starcoder.cpp 6 | 7 | */ 8 | 9 | #ifndef GPTBIGCODE_TOKENIZER_H 10 | #define GPTBIGCODE_TOKENIZER_H 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | // 25 | // Vocab utils 26 | // 27 | 28 | std::string trim(const std::string & s); 29 | 30 | std::string replace( 31 | const std::string & s, 32 | const std::string & from, 33 | const std::string & to); 34 | 35 | struct starcoder_vocab { 36 | std::map<std::string, int> token_to_id; 37 | std::map<int, std::string> id_to_token; 38 | std::vector<std::string> special_tokens; 39 | 40 | void add_special_token(const std::string & token); 41 | }; 42 | 43 | /* 44 | * Tokenizer 45 | */ 46 | starcoder_vocab starcoder_init_vocab(const std::string & vocab_file); 47 | 48 | const char* starcoder_id_to_token(starcoder_vocab& vocab, int id); 49 | 50 | int starcoder_tokenize(const starcoder_vocab &vocab, const std::string &text, std::vector<int> &final_tokens, int n_max_tokens); 51 | 52 | #endif 53 |
--------------------------------------------------------------------------------
/llm/include/LLaMATokenizer.h:
--------------------------------------------------------------------------------
1 | /* 2 | 3 | Adapted from llama.cpp: 4 | https://github.com/ggerganov/llama.cpp 5 | 6 | */ 7 | 8 | #ifndef LLaMA_TOKENIZER_H 9 | #define LLaMA_TOKENIZER_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | static int llama_token_bos() { return 1; } 21 | 22 | static int llama_token_eos() { return 2; } 23 | 24 | static int llama_token_nl() { return 13; } 25 | 26 | struct llama_vocab { 27 | struct token_score { 28 | std::string tok; 29 | float score; 30 | }; 31 |
32 | std::unordered_map<std::string, int> token_to_id; 33 | std::vector<token_score> id_to_token; 34 | }; 35 | 36 | /* 37 | * Tokenizer 38 | */ 39 | static size_t utf8_len(char src) { 40 | const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; 41 | uint8_t highbits = static_cast<uint8_t>(src) >> 4; 42 | 43 | return lookup[highbits]; 44 | } 45 | 46 | struct llama_sp_symbol { 47 | using index = int; 48 | index prev; 49 | index next; 50 | const char* text; 51 | size_t n; 52 | }; 53 | 54 | struct llama_sp_bigram { 55 | struct comparator { 56 | bool operator()(llama_sp_bigram& l, llama_sp_bigram& r) { 57 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); 58 | } 59 | }; 60 | using queue_storage = std::vector<llama_sp_bigram>; 61 | using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>; 62 | llama_sp_symbol::index left; 63 | llama_sp_symbol::index right; 64 | float score; 65 | size_t size; 66 | }; 67 | 68 | llama_vocab llama_init_vocab(const char* vocab_file); 69 | 70 | const char* llama_id_to_token(const llama_vocab& vocab, int id); 71 | 72 | int llama_tokenize(const llama_vocab& vocab, const char* text, int* tokens, int n_max_tokens, bool add_bos); 73 | 74 | #endif 75 |
--------------------------------------------------------------------------------
/llm/include/OPTTokenizer.h:
--------------------------------------------------------------------------------
1 | #ifndef OPT_TOKENIZER_H 2 | #define OPT_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | // #include // Tricky to support this in windows 22 | #include 23 | 24 | // std::vector OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos); 25 | 26 | struct pair_hash { 27 | template <class T1, class T2> 28 | std::size_t operator()(const std::pair<T1, T2> &p) const { 29 | auto h1 = std::hash<T1>{}(p.first); 30 | auto h2 = std::hash<T2>{}(p.second); 31 | return h1 ^ h2; 32 | } 33 | }; 34 | 35 | class Encoder { 36 | public: 37 | Encoder(std::map<std::string, int> encoder, std::vector<std::pair<std::string, std::string>> bpe_merges); 38 | std::unordered_map<int, std::string> bytes_to_unicode(); 39 | std::set<std::pair<std::string, std::string>> get_pairs(std::vector<std::string> word); 40 | std::string bpe(std::string token); 41 | std::vector<int> encode(std::string text); 42 | std::string decode(std::vector<int> tokens); 43 | 44 | private: 45 | std::map<std::string, int> encoder; 46 | std::map<int, std::string> decoder; 47 | std::unordered_map<int, std::string> byte_encoder; 48 | std::unordered_map<std::string, int> byte_decoder; 49 | std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks; 50 | std::unordered_map<std::string, std::string> cache; 51 | }; 52 | 53 | Encoder get_encoder(std::string vocab_file, std::string bpe_file); 54 | 55 | #endif 56 |
--------------------------------------------------------------------------------
/llm/include/interface.h:
--------------------------------------------------------------------------------
1 | #ifndef INTERFACE_H 2 | #define INTERFACE_H 3 | 4 | void set_print_black(); 5 | void set_print_red(); 6 | void set_print_yellow(); 7 | void set_print_bold_yellow(); 8 | void set_print_blue(); 9 | void set_print_white(); 10 | void set_print_reset(); 11 | 12 | #endif 13 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPAttention.h:
--------------------------------------------------------------------------------
1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32CLIPAttention_output { 7 | Matrix3D<float> attn_output; 8 | Matrix3D<float> attn_probs_reshaped; 9 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value; 10 | }; 11 | struct
Fp32CLIPAttention_input { 12 | Matrix3D<float> hidden_states; 13 | Matrix3D<float> attention_mask; 14 | Matrix3D<float> past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32CLIPAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32CLIPAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_, 22 | Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32CLIPAttention { 32 | public: 33 | Fp32CLIPAttention(std::string param_path, const struct model_config config); 34 | Fp32CLIPAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32CLIPAttention_output forward(const struct Fp32CLIPAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen); 40 | void shape(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen); 41 | // void shape_qkv(Matrix3D<float> unshape, Matrix3D<float> shaped_q, Matrix3D<float> shaped_k, 42 | // Matrix3D<float> shaped_v, int sqlen); 43 | int embed_dim, num_heads, head_dim; 44 | Linear_FP k_proj, v_proj, q_proj, out_proj, qkv_proj; 45 | BMM_F32T qk_bmm, pv_bmm; 46 | std::string profile_name = "Fp32CLIPAttention"; 47 | }; 48 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPEncoder.h:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32CLIPEncoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32CLIPEncoder_output { 10 | Matrix3D<float> last_hidden_state; 11 | std::vector<Matrix3D<float>> past_keys, past_values; 12 | }; 13 | struct Fp32CLIPEncoder_input { 14 | Matrix3D<float> hidden_states; 15 | Matrix3D<float> attention_mask; 16 | std::vector<Matrix3D<float>> past_keys, past_values; 17 | bool has_past_keys_values; 18 | 19 | Fp32CLIPEncoder_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_) 20 | : hidden_states(hidden_states_), attention_mask(attention_mask_) { 21 | has_past_keys_values = false; 22 | } 23 | Fp32CLIPEncoder_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, 24 | std::vector<Matrix3D<float>> past_keys_, std::vector<Matrix3D<float>> past_values_) 25 | : hidden_states(hidden_states_), attention_mask(attention_mask_), past_keys(past_keys_), past_values(past_values_) { 26 | has_past_keys_values = true; 27 | } 28 | }; 29 | 30 | class Fp32CLIPEncoder { 31 | public: 32 | Fp32CLIPEncoder(std::string param_path, const struct model_config config); 33 | Fp32CLIPEncoder(){}; 34 | struct Fp32CLIPEncoder_output forward(const struct Fp32CLIPEncoder_input& input); 35 | std::vector<Fp32CLIPEncoderLayer> layers; 36 | std::string profile_name = "Fp32CLIPEncoder"; 37 | }; 38 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPEncoderLayer.h:
--------------------------------------------------------------------------------
1 | #include "Fp32CLIPAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Fp32CLIPEncoderLayer_output { 6 | Matrix3D<float> hidden_states; 7 | Matrix3D<float> attentions; 8 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value; 9 | 10 | Fp32CLIPEncoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_, 11 | std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Fp32CLIPEncoderLayer_input { 18 | Matrix3D<float> hidden_states; 19 | Matrix3D<float> attention_mask; 20 | Matrix3D<float> past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Fp32CLIPEncoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Fp32CLIPEncoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> attention_mask_, 30 | Matrix3D<float> past_key_, Matrix3D<float> past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Fp32CLIPEncoderLayer { 40 | public: 41 | Fp32CLIPEncoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Fp32CLIPEncoderLayer_output forward(const struct Fp32CLIPEncoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LayerNorm layer_norm1, layer_norm2; 46 | Linear_FP mlp_fc1, mlp_fc2; 47 | Fp32CLIPAttention attn; 48 | std::string profile_name = "Fp32CLIPEncoderLayer"; 49 | }; 50 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32CLIPVisionTransformer.h:
--------------------------------------------------------------------------------
1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32CLIPEncoder.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32CLIPVisionTransformer_output { 10 | Matrix3D<float> last_hidden_state; 11 | std::vector<Matrix3D<float>> past_keys, past_values; 12 | }; 13 | struct Fp32CLIPVisionTransformer_input { 14 | Matrix3D<float> input_image; 15 | std::vector<Matrix3D<float>> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32CLIPVisionTransformer_input() {} 19 | Fp32CLIPVisionTransformer_input(Matrix3D<float> input_image_) : input_image(input_image_) { has_past_keys_values = false; } 20 | Fp32CLIPVisionTransformer_input(Matrix3D<float> input_image_, std::vector<Matrix3D<float>> past_keys_, 21 | std::vector<Matrix3D<float>> past_values_) 22 | : input_image(input_image_), past_keys(past_keys_), past_values(past_values_) { 23 | has_past_keys_values = true; 24 | } 25 | }; 26 | 27 | class Fp32CLIPVisionTransformer { 28 | public: 29 | Fp32CLIPVisionTransformer(std::string param_path, const struct model_config config, bool is_vila); 30 | Fp32CLIPVisionTransformer(){}; 31 | struct Fp32CLIPVisionTransformer_output forward(const struct Fp32CLIPVisionTransformer_input& input, bool is_vila); 32 | Embedding embed_positions; 33 | Conv2D embed_patch; 34 | LayerNorm pre_layernorm; 35 | Linear_FP mm_proj_0, mm_proj_2; 36 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, image_size, patch_size, num_patches, num_positions, 37 | projection_dim, mmproj_dim; 38 | std::vector<Fp32CLIPEncoderLayer> layers; 39 | std::string profile_name = "Fp32CLIPVisionTransformer"; 40 | 41 | private: 42 | Fp32CLIPEncoder encoder; 43 | float* patch_embeds_buf; 44 | float* class_embeds_buf; 45 | float* pos_embeds_buf; 46 | float* last_hidden_states_buf; 47 | float* hidden_states_buf; 48 | float* embeddings_buf; 49 | float* mm_proj_0_arr; 50 | float* mm_proj_2_arr; 51 | }; 52 |
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeAttention.h:
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32GPTBigCodeAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Fp32GPTBigCodeAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32GPTBigCodeAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32GPTBigCodeAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32GPTBigCodeAttention { 32 | public: 33 | Fp32GPTBigCodeAttention(std::string param_path, const struct model_config config); 34 | Fp32GPTBigCodeAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32GPTBigCodeAttention_output forward(const struct Fp32GPTBigCodeAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape_qkv(Matrix3D unshape, Matrix3D shaped_q, Matrix3D shaped_k, 41 | Matrix3D shaped_v, int sqlen); 42 | float scaling; 43 | int embed_dim, num_heads, head_dim, kv_heads, kv_dim; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | Linear_FP c_attn, c_proj; 46 | std::string profile_name = "Fp32GPTBigCodeAttention"; 47 | }; 48 | -------------------------------------------------------------------------------- /llm/include/nn_modules/Fp32GPTBigCodeDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32GPTBigCodeDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32GPTBigCodeDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Fp32GPTBigCodeDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32GPTBigCodeDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Fp32GPTBigCodeDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Fp32GPTBigCodeDecoder { 27 | public: 28 | Fp32GPTBigCodeDecoder(std::string param_path, const struct model_config config); 29 | Fp32GPTBigCodeDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | Matrix3D get_position_embed(int sql_length, int past_length); 32 | struct Fp32GPTBigCodeDecoder_output forward(const struct Fp32GPTBigCodeDecoder_input& input); 33 | Embedding wte, wpe; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings; 35 | std::vector layers; 36 | LayerNorm ln_f; 37 | std::string profile_name = "Fp32GPTBigCodeDecoder"; 38 | 39 | private: 40 | float* attention_mask_buf; 41 | float* pos_embeds_buf; 42 | 
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32GPTBigCodeDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32GPTBigCodeDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32GPTBigCodeDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32GPTBigCodeDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32GPTBigCodeDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32GPTBigCodeDecoder {
   public:
    Fp32GPTBigCodeDecoder(std::string param_path, const struct model_config config);
    Fp32GPTBigCodeDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Fp32GPTBigCodeDecoder_output forward(const struct Fp32GPTBigCodeDecoder_input& input);
    Embedding wte, wpe;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings;
    std::vector<Fp32GPTBigCodeDecoderLayer> layers;
    LayerNorm ln_f;
    std::string profile_name = "Fp32GPTBigCodeDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32GPTBigCodeAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32GPTBigCodeDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32GPTBigCodeDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                      std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32GPTBigCodeDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                     Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32GPTBigCodeDecoderLayer {
   public:
    Fp32GPTBigCodeDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32GPTBigCodeDecoderLayer_output forward(const struct Fp32GPTBigCodeDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm ln_1, ln_2;  // from torch_int.nn
    Linear_FP fc1, fc2;
    Fp32GPTBigCodeAttention attn;
    std::string profile_name = "Fp32GPTBigCodeDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32GPTBigCodeForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32GPTBigCodeDecoder.h"

struct Fp32GPTBigCodeForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32GPTBigCodeForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32GPTBigCodeForCausalLM_input() {}
    Fp32GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
    }
    Fp32GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                    std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32GPTBigCodeForCausalLM {
   public:
    Fp32GPTBigCodeForCausalLM(std::string param_path, const struct model_config config);
    struct Fp32GPTBigCodeForCausalLM_output forward(const struct Fp32GPTBigCodeForCausalLM_input& input);

   private:
    Fp32GPTBigCodeDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32GPTBigCodeForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
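
// Usage sketch (editorial annotation): a minimal greedy decode loop over this
// interface. `max_new_tokens`, `eos_token_id`, and the argmax helper are
// illustrative assumptions; the real loop lives in src/GPTBigCodeGenerate.cc.
//
//     Fp32GPTBigCodeForCausalLM model("models/StarCoder", config);
//     struct Fp32GPTBigCodeForCausalLM_input in(input_ids);
//     for (int step = 0; step < max_new_tokens; step++) {
//         auto out = model.forward(in);
//         int next = pick_argmax_of_last_row(out.logits);  // hypothetical helper
//         if (next == eos_token_id) break;
//         Matrix3D<int> next_ids(&next, 1, 1, 1);
//         in = Fp32GPTBigCodeForCausalLM_input(next_ids, out.past_keys, out.past_values);
//     }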
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Fp32OPTAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Fp32OPTAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Fp32OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Fp32OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_,
                           Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Fp32OPTAttention {
   public:
    Fp32OPTAttention(std::string param_path, const struct model_config config);
    Fp32OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Fp32OPTAttention_output forward(const struct Fp32OPTAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shpae(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    float scaling;
    int embed_dim, num_heads, head_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP k_proj, v_proj, q_proj, out_proj;
    std::string profile_name = "Fp32OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                         std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32OPTDecoder {
   public:
    Fp32OPTDecoder(std::string param_path, const struct model_config config);
    Fp32OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Fp32OPTDecoder_output forward(const struct Fp32OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Fp32OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Fp32OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
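
// Note (editorial annotation): prepare_decoder_attention_mask builds the causal
// mask consumed by the attention layers above. A plausible reading of its
// contract, for `length` new tokens on top of `past_length` cached tokens: it
// returns a (1 x length x (past_length + length)) matrix holding 0 where
// attention is allowed and a large negative value where it is blocked, i.e.
//
//     mask[i][j] = 0                    if j <= past_length + i
//     mask[i][j] = large negative       otherwise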
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                               std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32OPTDecoderLayer {
   public:
    Fp32OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32OPTDecoderLayer_output forward(const struct Fp32OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    Linear_FP fc1, fc2;
    Fp32OPTAttention attn;
    std::string profile_name = "Fp32OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32OPTDecoder.h"

struct Fp32OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Fp32OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Fp32OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                             std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Fp32OPTForCausalLM {
   public:
    Fp32OPTForCausalLM(std::string param_path, const struct model_config config);
    struct Fp32OPTForCausalLM_output forward(const struct Fp32OPTForCausalLM_input& input);

   private:
    Fp32OPTDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32OPTForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Fp32llamaAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Fp32llamaAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Fp32llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Fp32llamaAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_,
                             Matrix3D<float> past_key_, Matrix3D<float> past_value_, bool has_past_key_value_,
                             int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Fp32llamaAttention {
   public:
    Fp32llamaAttention(std::string param_path, const struct model_config config);
    Fp32llamaAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Fp32llamaAttention_output forward(const struct Fp32llamaAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shape(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    int embed_dim, num_heads, head_dim;
    Linear_FP k_proj, v_proj, q_proj, o_proj;
    RotaryPosEmb rotary_pos_emb;
    BMM_F32T qk_bmm, pv_bmm;
    std::string profile_name = "Fp32llamaAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Fp32llamaDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Fp32llamaDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32llamaDecoder_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;
    bool is_llava;

    Fp32llamaDecoder_input() {}
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                           std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Fp32llamaDecoder_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Fp32llamaDecoder {
   public:
    Fp32llamaDecoder(std::string param_path, const struct model_config config);
    Fp32llamaDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    struct Fp32llamaDecoder_output forward(const struct Fp32llamaDecoder_input& input);
    Embedding embed_tokens;
    LlamaRMSNorm norm;
    float rms_norm_eps;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Fp32llamaDecoderLayer> layers;
    std::string profile_name = "Fp32llamaDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
    float* inputs_embeds_buf;
    float* first_input_ids_buf;
    float* image_embed_buf;
    float* second_input_ids_buf;
};
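
// Usage sketch (editorial annotation): the LLaVA-style constructors splice an
// image embedding between two halves of the tokenized prompt. The split shown
// here is an assumption about how callers use this interface.
//
//     // Prompt: "<text before image>" + <image> + "<text after image>"
//     Fp32llamaDecoder_input in(ids_before_image,              // Matrix3D<int>
//                               vit_output.last_hidden_state,  // image embedding
//                               ids_after_image);              // Matrix3D<int>
//     // in.is_llava == true; the decoder embeds both id spans and concatenates
//     // [embed(first_ids), image_embed, embed(second_ids)] along the sequence.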
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Fp32llamaAttention.h"
#include "common.h"
#include "operators.h"

struct Fp32llamaDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Fp32llamaDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                 std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Fp32llamaDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Fp32llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Fp32llamaDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Fp32llamaDecoderLayer {
   public:
    Fp32llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Fp32llamaDecoderLayer_output forward(const struct Fp32llamaDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    float rms_norm_eps;
    LlamaRMSNorm input_layernorm, post_attention_layernorm;
    Linear_FP gate_proj, down_proj, up_proj;
    Fp32llamaAttention attn;
    std::string profile_name = "Fp32llamaDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Fp32llamaForCausalLM.h:
--------------------------------------------------------------------------------
#include "Fp32llamaDecoder.h"

struct Fp32LlamaForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Fp32LlamaForCausalLM_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;
    bool is_llava;

    Fp32LlamaForCausalLM_input() {}
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                               std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Fp32LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Fp32LlamaForCausalLM {
   public:
    Fp32LlamaForCausalLM(std::string param_path, const struct model_config config);

    struct Fp32LlamaForCausalLM_output forward(const struct Fp32LlamaForCausalLM_input& input);

   private:
    Fp32llamaDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "Fp32LlamaForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Int4GPTBigCodeAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int4GPTBigCodeAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int4GPTBigCodeAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_,
                                  Matrix3D<float> past_key_, Matrix3D<float> past_value_, bool has_past_key_value_,
                                  int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int4GPTBigCodeAttention {
   public:
    Int4GPTBigCodeAttention(std::string param_path, const struct model_config config);
    Int4GPTBigCodeAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int4GPTBigCodeAttention_output forward(const struct Int4GPTBigCodeAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shape_qkv(Matrix3D<float> unshape, Matrix3D<float> shaped_q, Matrix3D<float> shaped_k,
                   Matrix3D<float> shaped_v, int sqlen);
    int embed_dim, num_heads, head_dim, kv_heads, kv_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP_int4 c_attn, c_proj;
    std::string profile_name = "Int4GPTBigCodeAttention";
};
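
// Note (editorial annotation): the Int4* modules mirror their Fp32 twins but
// swap Linear_FP for Linear_FP_int4, i.e. 4-bit quantized weights with float
// activations. A rough mental model of the weight layout, assuming two 4-bit
// values packed per byte with per-group scales and zero points:
//
//     uint8_t packed = (uint8_t)(w0 & 0xF) | (uint8_t)((w1 & 0xF) << 4);  // two int4 weights
//     float dequant  = (float)(q - zero_point) * scale;                   // per group
//
// The exact packing order and group size live in the kernels/ implementations.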
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int4GPTBigCodeDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4GPTBigCodeDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4GPTBigCodeDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4GPTBigCodeDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4GPTBigCodeDecoder {
   public:
    Int4GPTBigCodeDecoder(std::string param_path, const struct model_config config);
    Int4GPTBigCodeDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int4GPTBigCodeDecoder_output forward(const struct Int4GPTBigCodeDecoder_input& input);
    Embedding wte, wpe;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads, max_position_embeddings;
    std::vector<Int4GPTBigCodeDecoderLayer> layers;
    LayerNorm ln_f;
    std::string profile_name = "Int4GPTBigCodeDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int4GPTBigCodeAttention.h"
#include "common.h"
#include "operators.h"

struct Int4GPTBigCodeDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Int4GPTBigCodeDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                                      std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int4GPTBigCodeDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Int4GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int4GPTBigCodeDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                                     Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int4GPTBigCodeDecoderLayer {
   public:
    Int4GPTBigCodeDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Int4GPTBigCodeDecoderLayer_output forward(const struct Int4GPTBigCodeDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm ln_1, ln_2;  // from torch_int.nn
    Linear_FP_int4 fc1, fc2;
    Int4GPTBigCodeAttention attn;
    std::string profile_name = "Int4GPTBigCodeDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4GPTBigCodeForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4GPTBigCodeDecoder.h"

struct Int4GPTBigCodeForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4GPTBigCodeForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4GPTBigCodeForCausalLM_input() {}
    Int4GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
    }
    Int4GPTBigCodeForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                                    std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4GPTBigCodeForCausalLM {
   public:
    Int4GPTBigCodeForCausalLM(std::string param_path, const struct model_config config);
    struct Int4GPTBigCodeForCausalLM_output forward(std::string param_path,
                                                    const struct Int4GPTBigCodeForCausalLM_input& input);

   private:
    Int4GPTBigCodeDecoder decoder;
    Linear_FP_int4 lm_head;
    std::string profile_name = "Int4GPTBigCodeForCausalLM";
    float* logits_output;
    uint8_t* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int4OPTAttention_output {
    Matrix3D<float> attn_output;
    Matrix3D<float> attn_probs_reshaped;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;
};
struct Int4OPTAttention_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int4OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int4OPTAttention_input(Matrix3D<float> hidden_states_, Matrix3D<float> attention_mask_, Matrix3D<float> past_key_,
                           Matrix3D<float> past_value_, bool has_past_key_value_, int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int4OPTAttention {
   public:
    Int4OPTAttention(std::string param_path, const struct model_config config);
    Int4OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int4OPTAttention_output forward(const struct Int4OPTAttention_input &input);

   private:
    void unshape(Matrix3D<float> shaped, Matrix3D<float> unshape, int sqlen);
    void shpae(Matrix3D<float> unshape, Matrix3D<float> shaped, int sqlen);
    float scaling;
    int embed_dim, num_heads, head_dim;
    BMM_F32T qk_bmm, pv_bmm;
    Linear_FP_int4 k_proj, v_proj, q_proj, out_proj;
    std::string profile_name = "Int4OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int4OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int4OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                         std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4OPTDecoder {
   public:
    Int4OPTDecoder(std::string param_path, const struct model_config config);
    Int4OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int4OPTDecoder_output forward(const struct Int4OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Int4OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Int4OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int4OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Int4OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attentions;
    std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value;

    Int4OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<float> attentions_,
                               std::pair<Matrix3D<float>, Matrix3D<float>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int4OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<float> past_key, past_value;
    bool has_past_key_value = false;

    Int4OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int4OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<float> past_key_, Matrix3D<float> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int4OPTDecoderLayer {
   public:
    Int4OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx);
    struct Int4OPTDecoderLayer_output forward(const struct Int4OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNorm self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    Linear_FP_int4 fc1, fc2;
    Int4OPTAttention attn;
    std::string profile_name = "Int4OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4OPTDecoder.h"

struct Int4OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<float>> past_keys, past_values;
};
struct Int4OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<float>> past_keys, past_values;
    bool has_past_keys_values;

    Int4OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int4OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                             std::vector<Matrix3D<float>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int4OPTForCausalLM {
   public:
    Int4OPTForCausalLM(std::string param_path, const struct model_config config);
    struct Int4OPTForCausalLM_output forward(const struct Int4OPTForCausalLM_input& input);

   private:
    Int4OPTDecoder decoder;
    Linear_FP_int4 lm_head;
    std::string profile_name = "Int4OPTForCausalLM";
    float* logits_output;
    uint8_t* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int4llamaForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int4llamaDecoder.h"

struct Int4LlamaForCausalLM_output {
    Matrix3D<float> logits;
#ifdef QM_CUDA
    std::vector<Matrix3D<float16_t>> past_keys, past_values;
#else
    std::vector<Matrix3D<float>> past_keys, past_values;
#endif
};
struct Int4LlamaForCausalLM_input {
    Matrix3D<int> input_ids;
    Matrix3D<float> image_embed;
    Matrix3D<int> second_input_ids;
    bool has_past_keys_values;
    bool is_llava;
#ifdef QM_CUDA
    std::vector<Matrix3D<float16_t>> past_keys, past_values;
#else
    std::vector<Matrix3D<float>> past_keys, past_values;
#endif

    Int4LlamaForCausalLM_input() {}
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) {
        has_past_keys_values = false;
        is_llava = false;
    }
#ifdef QM_CUDA
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float16_t>> past_keys_,
                               std::vector<Matrix3D<float16_t>> past_values_)
#else
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<float>> past_keys_,
                               std::vector<Matrix3D<float>> past_values_)
#endif
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
        is_llava = false;
    }
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_, Matrix3D<int> second_input_ids_)
        : input_ids(input_ids_), image_embed(image_embed_), second_input_ids(second_input_ids_) {
        has_past_keys_values = false;
        is_llava = true;
    }
    Int4LlamaForCausalLM_input(Matrix3D<int> input_ids_, Matrix3D<float> image_embed_)
        : input_ids(input_ids_), image_embed(image_embed_) {
        has_past_keys_values = false;
        is_llava = true;
    }
};

class Int4LlamaForCausalLM {
   public:
    Int4LlamaForCausalLM(std::string param_path, const struct model_config config);
    Int4LlamaForCausalLM(){};
    struct Int4LlamaForCausalLM_output forward(std::string param_path, const struct Int4LlamaForCausalLM_input& input);
    float* logits_output = nullptr;
#ifdef QM_CUDA
    void free_cuda_memory();
    int* lm_head_weight = nullptr;
    float16_t* logits_output_half = nullptr;
#else
    uint8_t* lm_head_weight;
#endif

   private:
    std::string profile_name = "Int4LlamaForCausalLM";
    Int4llamaDecoder decoder;
#ifdef QM_CUDA
    Linear_half_int4 lm_head;
#else
    Linear_FP_int4 lm_head;
#endif
};
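
// Note (editorial annotation): QM_CUDA selects the GPU build at compile time;
// the KV cache and lm_head switch from float / Linear_FP_int4 to
// float16_t / Linear_half_int4. The flag name is taken from the #ifdefs above;
// exactly how it is set (e.g. a -DQM_CUDA define in the Makefile) is an
// assumption here.
//
//     // CPU build (illustrative):  g++  ... application/chat.cc
//     // GPU build (illustrative):  nvcc ... -DQM_CUDA application/chat.cc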
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTAttention.h:
--------------------------------------------------------------------------------
#include <utility>

#include "common.h"
#include "operators.h"

struct Int8OPTAttention_output {
    Matrix3D<int8_t> attn_output;
    Matrix3D<int8_t> attn_probs_reshaped;
    std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value;
};
struct Int8OPTAttention_input {
    Matrix3D<int8_t> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<int8_t> past_key, past_value;
    bool has_past_key_value = false;
    int layer_idx;

    Int8OPTAttention_input(Matrix3D<int8_t> hidden_states_, Matrix3D<float> attention_mask_, int layer_idx_)
        : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {}

    Int8OPTAttention_input(Matrix3D<int8_t> hidden_states_, Matrix3D<float> attention_mask_,
                           Matrix3D<int8_t> past_key_, Matrix3D<int8_t> past_value_, bool has_past_key_value_,
                           int layer_idx_)
        : hidden_states(hidden_states_),
          attention_mask(attention_mask_),
          past_key(past_key_),
          past_value(past_value_),
          has_past_key_value(has_past_key_value_),
          layer_idx(layer_idx_) {}
};

class Int8OPTAttention {
   public:
    Int8OPTAttention(std::string param_path, const struct model_config config, BMM_S8T_S8N_F32T &qk_bmm,
                     BMM_S8T_S8N_S8T &pv_bmm, W8A8B8O8Linear &k_proj, W8A8B8O8Linear &v_proj, W8A8B8O8Linear &q_proj,
                     W8A8BFP32OFP32Linear &out_proj);
    Int8OPTAttention() {}
    static void initialized_memory(const struct model_config config);
    struct Int8OPTAttention_output forward(const struct Int8OPTAttention_input &input);

   private:
    void unshape(Matrix3D<int8_t> shaped, Matrix3D<int8_t> unshape, int sqlen);
    void shpae(Matrix3D<int8_t> unshape, Matrix3D<int8_t> shaped, int sqlen);
    int embed_dim, num_heads, head_dim;
    BMM_S8T_S8N_F32T qk_bmm;
    BMM_S8T_S8N_S8T pv_bmm;
    W8A8B8O8Linear k_proj, v_proj, q_proj;
    W8A8BFP32OFP32Linear out_proj;
    std::string profile_name = "Int8OPTAttention";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTDecoder.h:
--------------------------------------------------------------------------------
#include <cstdlib>
#include <string>
#include <vector>

#include "Int8OPTDecoderLayer.h"
#include "common.h"
#include "operators.h"

struct Int8OPTDecoder_output {
    Matrix3D<float> last_hidden_state;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
};
struct Int8OPTDecoder_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
    bool has_past_keys_values;

    Int8OPTDecoder_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    Int8OPTDecoder_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<int8_t>> past_keys_,
                         std::vector<Matrix3D<int8_t>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class Int8OPTDecoder {
   public:
    Int8OPTDecoder(std::string param_path, const struct model_config config);
    Int8OPTDecoder(){};
    Matrix3D<float> prepare_decoder_attention_mask(int length, int past_length);
    Matrix3D<float> get_position_embed(int sql_length, int past_length);
    struct Int8OPTDecoder_output forward(const struct Int8OPTDecoder_input& input);
    Embedding embed_tokens, embed_positions;
    int voc_size, embed_dim, padding_idx, hidden_dim, num_heads;
    std::vector<Int8OPTDecoderLayer> layers;
    LayerNorm final_layer_norm;
    std::string profile_name = "Int8OPTDecoder";

   private:
    float* attention_mask_buf;
    float* pos_embeds_buf;
    float* last_hidden_states_buf;
    float* hidden_states_buf;
};
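
// Note (editorial annotation): in the W8A8... op names, W/A/B/O spell out the
// dtypes of Weights, Activations, Bias, and Output, so W8A8B8O8Linear is fully
// int8 while W8A8BFP32OFP32Linear keeps bias and output in fp32. This is the
// SmoothQuant-style pipeline: requantization scales (alpha, beta) are folded
// into each op, which is why the attention module receives its quantized ops
// pre-built through the constructor instead of loading them itself. A plausible
// dataflow, given the member types above:
//
//     int8 x --W8A8B8O8Linear--> int8 q/k/v --BMM_S8T_S8N_F32T--> fp32 scores
//            --softmax + requant--> int8 probs --BMM_S8T_S8N_S8T--> int8 ctx
//            --W8A8BFP32OFP32Linear--> fp32 attn_output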
--------------------------------------------------------------------------------
/llm/include/nn_modules/Int8OPTDecoderLayer.h:
--------------------------------------------------------------------------------
#include "Int8OPTAttention.h"
#include "common.h"
#include "operators.h"

struct Int8OPTDecoderLayer_output {
    Matrix3D<float> hidden_states;
    Matrix3D<int8_t> attentions;
    std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value;

    Int8OPTDecoderLayer_output(Matrix3D<float> hidden_states_, Matrix3D<int8_t> attentions_,
                               std::pair<Matrix3D<int8_t>, Matrix3D<int8_t>> past_key_value_) {
        hidden_states = hidden_states_;
        attentions = attentions_;
        past_key_value = past_key_value_;
    };
};
struct Int8OPTDecoderLayer_input {
    Matrix3D<float> hidden_states;
    Matrix3D<float> attention_mask;
    Matrix3D<int8_t> past_key, past_value;
    bool has_past_key_value = false;

    Int8OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        has_past_key_value = false;
    }

    Int8OPTDecoderLayer_input(Matrix3D<float> &hidden_states_, Matrix3D<float> &attention_mask_,
                              Matrix3D<int8_t> past_key_, Matrix3D<int8_t> past_value_) {
        hidden_states = hidden_states_;
        attention_mask = attention_mask_;
        past_key = past_key_;
        past_value = past_value_;
        has_past_key_value = true;
    }
};

class Int8OPTDecoderLayer {
   public:
    Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx,
                        LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, W8A8B8O8LinearReLU fc1,
                        W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, BMM_S8T_S8N_S8T pv_bmm,
                        W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, W8A8B8O8Linear q_proj,
                        W8A8BFP32OFP32Linear out_proj);
    struct Int8OPTDecoderLayer_output forward(const struct Int8OPTDecoderLayer_input &input);

    int embed_dim, num_attention_heads, hidden_dim, layer_idx;
    LayerNormQ self_attn_layer_norm, final_layer_norm;  // from torch_int.nn
    W8A8B8O8LinearReLU fc1;
    W8A8BFP32OFP32Linear fc2;
    Int8OPTAttention attn;
    std::string profile_name = "Int8OPTDecoderLayer";
};
--------------------------------------------------------------------------------
/llm/include/nn_modules/OPTForCausalLM.h:
--------------------------------------------------------------------------------
#include "Int8OPTDecoder.h"

struct OPTForCausalLM_output {
    Matrix3D<float> logits;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
};
struct OPTForCausalLM_input {
    Matrix3D<int> input_ids;
    std::vector<Matrix3D<int8_t>> past_keys, past_values;
    bool has_past_keys_values;

    OPTForCausalLM_input(Matrix3D<int> input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; }
    OPTForCausalLM_input(Matrix3D<int> input_ids_, std::vector<Matrix3D<int8_t>> past_keys_,
                         std::vector<Matrix3D<int8_t>> past_values_)
        : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) {
        has_past_keys_values = true;
    }
};

class OPTForCausalLM {
   public:
    OPTForCausalLM(std::string param_path, const struct model_config config);
    struct OPTForCausalLM_output forward(const struct OPTForCausalLM_input& input);

   private:
    Int8OPTDecoder decoder;
    Linear_FP lm_head;
    std::string profile_name = "OPTForCausalLM";
    float* logits_output;
    float* lm_head_weight;
};
--------------------------------------------------------------------------------
/llm/include/operators.h:
--------------------------------------------------------------------------------
#ifndef OPERATORS_H
#define OPERATORS_H
#include <cmath>

#include "common.h"
#include "matmul.h"

#define BLK_SIZE 16
// #define NUM_THREAD 8
extern int NUM_THREAD;

// include all ops
#include "ops/BMM_F32T.h"
#include "ops/BMM_S8T_S8N_F32T.h"
#include "ops/BMM_S8T_S8N_S8T.h"
#include "ops/Embedding.h"
#include "ops/LayerNorm.h"
#include "ops/LayerNormQ.h"
#include "ops/LlamaRMSNorm.h"
#include "ops/RotaryPosEmb.h" 21 | #include "ops/W8A8B8O8Linear.h" 22 | #include "ops/W8A8B8O8LinearReLU.h" 23 | #include "ops/W8A8BFP32OFP32Linear.h" 24 | #include "ops/arg_max.h" 25 | #include "ops/linear.h" 26 | #include "ops/Conv2D.h" 27 | #include "ops/Gelu.h" 28 | 29 | void softmax(const Matrix3D &input, Matrix3D &output, int dim); 30 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output); 31 | template 32 | void linear(Matrix3D &a, Matrix3D &b, Matrix3D &c); 33 | 34 | 35 | #ifdef QM_CUDA 36 | #include "ops/cuda/BMM_F16T.cuh" 37 | #include "ops/cuda/Embedding.cuh" 38 | #include "ops/cuda/LlamaRMSNorm.cuh" 39 | #include "ops/cuda/RotaryPosEmb.cuh" 40 | 41 | __global__ void batch_Add_float(const Matrix3D input, const Matrix3D input2, Matrix3D output); 42 | __global__ void batch_Add_cuda(const Matrix3D input, const Matrix3D input2, 43 | Matrix3D output); 44 | __global__ void batch_Add_cuda_half2(Matrix3D input, Matrix3D input2, Matrix3D output); 45 | __global__ void softmax_float(Matrix3D input, Matrix3D output); 46 | __global__ void softmax_cuda(Matrix3D input, Matrix3D output); 47 | #endif 48 | 49 | #endif // OPERATORS_H 50 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class BMM_F32T { 4 | public: 5 | BMM_F32T(float _alpha); 6 | BMM_F32T(){}; 7 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 8 | void forward_weight_untransposed(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 9 | float alpha; 10 | 11 | private: 12 | std::string profile_name = "BMM_F32T"; 13 | }; 14 | 15 | void load_BMM_F32T(BMM_F32T &op, std::string prefix); 16 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_S8T_S8N_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_F32T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_F32T { 8 | public: 9 | BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T_params ¶ms_); 10 | BMM_S8T_S8N_F32T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | 15 | private: 16 | std::string profile_name = "BMM_S8T_S8N_F32T"; 17 | }; 18 | 19 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix); 20 | -------------------------------------------------------------------------------- /llm/include/ops/BMM_S8T_S8N_S8T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_S8T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_S8T { 8 | public: 9 | BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T_params ¶ms_); 10 | BMM_S8T_S8N_S8T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | 15 | private: 16 | std::string profile_name = "BMM_S8T_S8N_S8T"; 17 | }; 18 | 19 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix); 20 | -------------------------------------------------------------------------------- /llm/include/ops/Conv2D.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | 4 | struct Conv2D_params { 5 | Matrix4D weight; 6 | Matrix3D bias; 7 | int stride_width = 1; 8 | int 
--------------------------------------------------------------------------------
/llm/include/ops/Conv2D.h:
--------------------------------------------------------------------------------
#include "common.h"
#include <limits>

struct Conv2D_params {
    Matrix4D<float> weight;
    Matrix3D<float> bias;
    int stride_width = 1;
    int stride_height = 1;
    int dilation_width_factor = 1;
    int dilation_height_factor = 1;
    int padding_width = 0;
    int padding_height = 0;
    float float_activation_min = -std::numeric_limits<float>::max();
    float float_activation_max = std::numeric_limits<float>::max();
};

class Conv2D {
   public:
    Conv2D(Conv2D_params params_) : params(params_){};
    Conv2D(){};
    void forward(const Matrix3D<float> &input, Matrix3D<float> &output);
    struct Conv2D_params params;
    bool has_bias = false;

   private:
    std::string profile_name = "Conv2D";
};

void load_Conv2D(Conv2D &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/Embedding.h:
--------------------------------------------------------------------------------
#include <cassert>

#include "common.h"

class Embedding {
   public:
    Embedding(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D<float> lookup_)
        : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) {
        assert(lookup_.m_dim_y == voc_size_);
        assert(lookup_.m_dim_z == embed_dim_);
    }
    Embedding(){};
    void forward(Matrix3D<int> input_id, Matrix3D<float> output);
    int embed_dim, voc_size, padding_idx;
    Matrix3D<float> lookup;

   private:
    std::string profile_name = "Embedding";
};

void load_Embedding_params(Embedding &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/Gelu.h:
--------------------------------------------------------------------------------
#include "common.h"

float Gelu_imp(float x);
void Gelu(Matrix3D<float> a);
float Gelu_quick_imp(float x);
void Gelu_quick(Matrix3D<float> a);
--------------------------------------------------------------------------------
/llm/include/ops/LayerNorm.h:
--------------------------------------------------------------------------------
#include "common.h"

struct LayerNorm_params {
    Matrix3D<float> weight;
    Matrix3D<float> bias;
};

class LayerNorm {
   public:
    LayerNorm(LayerNorm_params params_) : params(params_){};
    LayerNorm(){};
    void forward(const Matrix3D<float> &x, Matrix3D<float> &output);
    struct LayerNorm_params params;

   private:
    std::string profile_name = "LayerNorm";
};

void load_LayerNorm(LayerNorm &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/LayerNormQ.h:
--------------------------------------------------------------------------------
#include "common.h"

struct LayerNormQ_params {
    Matrix3D<float> weight;
    Matrix3D<float> bias;
};

class LayerNormQ {
   public:
    LayerNormQ(LayerNormQ_params &params_) : params(params_){};
    LayerNormQ(){};
    void forward(const Matrix3D<float> &x, Matrix3D<int8_t> &output);
    struct LayerNormQ_params params;

   private:
    std::string profile_name = "LayerNormQ";
};

void load_LayerNormQ(LayerNormQ &op, std::string prefix);
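
// Note (editorial annotation): LayerNorm and LayerNormQ share the classic
// normalization, y = (x - mean(x)) / sqrt(var(x) + eps) * weight + bias;
// LayerNormQ additionally quantizes the fp32 result to int8 so it can feed the
// W8A8 linears directly. The eps value and the quantization rounding mode live
// in the .cc implementations.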
--------------------------------------------------------------------------------
/llm/include/ops/LlamaRMSNorm.h:
--------------------------------------------------------------------------------
#include "common.h"
#include "utils.h"

class LlamaRMSNorm {
   public:
    LlamaRMSNorm(Matrix3D<float> _weight) : weight(_weight){};
    LlamaRMSNorm(){};
    void forward(const Matrix3D<float> &x, Matrix3D<float> &output, float eps);
    Matrix3D<float> weight;

   private:
    std::string profile_name = "LlamaRMSNorm";
};
--------------------------------------------------------------------------------
/llm/include/ops/RotaryPosEmb.h:
--------------------------------------------------------------------------------
#include <cmath>

#include "common.h"
#include "utils.h"

class RotaryPosEmb {
   public:
    RotaryPosEmb(Matrix3D<float> _cos, Matrix3D<float> _sin, std::string path) {
        sin = _sin;
        cos = _cos;
        read_to_array((path + "/cos_cached.bin").c_str(), cos.m_data, cos.length());
        read_to_array((path + "/sin_cached.bin").c_str(), sin.m_data, sin.length());
    };
    RotaryPosEmb(){};
    void forward(Matrix3D<float> &key, Matrix3D<float> &value, int start_idx, int len);
    Matrix3D<float> cos, sin;

   private:
    std::string profile_name = "RotaryPosEmb";
};

void load_RotaryPosEmb(RotaryPosEmb &op, std::string prefix);
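
// Note (editorial annotation): rotary position embedding rotates each pair of
// channels by a position-dependent angle using the precomputed cos/sin tables
// loaded above. For position p and a channel pair (x1, x2):
//
//     x1' = x1 * cos[p] - x2 * sin[p]
//     x2' = x2 * cos[p] + x1 * sin[p]
//
// start_idx offsets p by the number of cached tokens, so decode steps keep
// absolute positions consistent with the prefill.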
--------------------------------------------------------------------------------
/llm/include/ops/W8A8B8O8Linear.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8B8O8Linear_params {
    Matrix3D<int8_t> weight;
    Matrix3D<int8_t> bias;
    float alpha;
    float beta;
};

class W8A8B8O8Linear {
   public:
    W8A8B8O8Linear(W8A8B8O8Linear_params &params_);
    W8A8B8O8Linear(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<int8_t> &output);
    struct matmul_params params;
    float alpha;
    float beta;

   private:
    std::string profile_name = "W8A8B8O8Linear";
};

void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/W8A8B8O8LinearReLU.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8B8O8LinearReLU_params {
    Matrix3D<int8_t> weight;
    Matrix3D<int8_t> bias_int8;
    float alpha;
    float beta;
};

class W8A8B8O8LinearReLU {
   public:
    W8A8B8O8LinearReLU(W8A8B8O8LinearReLU_params &params_);
    W8A8B8O8LinearReLU(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<int8_t> &output);
    struct matmul_params params;
    float alpha;
    float beta;

   private:
    std::string profile_name = "W8A8B8O8LinearReLU";
};

void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/W8A8BFP32OFP32Linear.h:
--------------------------------------------------------------------------------
#include "common.h"

struct W8A8BFP32OFP32Linear_params {
    Matrix3D<int8_t> weight;
    Matrix3D<float> bias;
    float alpha;
};

class W8A8BFP32OFP32Linear {
   public:
    W8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear_params &params_);
    W8A8BFP32OFP32Linear(){};
    void forward(const Matrix3D<int8_t> &x, Matrix3D<float> &output);
    struct matmul_params params;
    float alpha;

   private:
    std::string profile_name = "W8A8BFP32OFP32Linear";
};

void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/arg_max.h:
--------------------------------------------------------------------------------
#include "common.h"

#define FLOAT_MIN -1000000.0

void arg_max_dim2(Matrix3D<float> &input, Matrix3D<int> &output);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/BMM_F16T.cuh:
--------------------------------------------------------------------------------
#include "common.h"

class BMM_F16T {
   public:
    BMM_F16T(half _alpha);
    BMM_F16T(){};
    void forward(const Matrix3D<float16_t> &x, const Matrix3D<float16_t> &weight,
                 Matrix3D<float16_t> &output);  // TODO: convert weight to half
    void forward_weight_untransposed(const Matrix3D<float16_t> &a, const Matrix3D<float16_t> &weight,
                                     Matrix3D<float16_t> &c);
    half alpha;

   private:
    std::string profile_name = "BMM_F16T";
};

void load_BMM_F16T(BMM_F16T &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/Embedding.cuh:
--------------------------------------------------------------------------------
#include <cassert>
#include "common.h"

class Embedding_cuda {
   public:
    Embedding_cuda(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D<float> lookup_)
        : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) {
        assert(lookup_.m_dim_y == voc_size_);
        assert(lookup_.m_dim_z == embed_dim_);
    }
    Embedding_cuda(){};
    void forward(Matrix3D<int> input_id, Matrix3D<float16_t> output);
    int embed_dim, voc_size, padding_idx;
    Matrix3D<float> lookup;

   private:
    std::string profile_name = "Embedding";
};

void load_Embedding_params_cuda(Embedding_cuda &op, std::string prefix);
--------------------------------------------------------------------------------
/llm/include/ops/cuda/LlamaRMSNorm.cuh:
--------------------------------------------------------------------------------
#include "common.h"

class LlamaRMSNorm_cuda {
   public:
    LlamaRMSNorm_cuda(Matrix3D<float> _weight) : weight(_weight){};
    LlamaRMSNorm_cuda(){};
    void forward(const Matrix3D<float16_t> &x, Matrix3D<float16_t> &output, float eps);
    Matrix3D<float> weight;
    // half half_eps = 6.10352e-05;

   private:
    std::string profile_name = "LlamaRMSNorm_cuda";
};
--------------------------------------------------------------------------------
/llm/include/ops/cuda/RotaryPosEmb.cuh:
--------------------------------------------------------------------------------
#include <cmath>

#include "utils.h"
#include "common.h"

class RotaryPosEmb_cuda {
   public:
    RotaryPosEmb_cuda(Matrix3D<float16_t> _cos, Matrix3D<float16_t> _sin, std::string path) {
        sin = _sin;
        cos = _cos;
        read_to_array_half((path + "/cos_cached_half.bin").c_str(), cos.m_data, cos.length());
        read_to_array_half((path + "/sin_cached_half.bin").c_str(), sin.m_data, sin.length());
    };
    RotaryPosEmb_cuda(){};
    void forward(Matrix3D<float16_t> &key, Matrix3D<float16_t> &value, int start_idx, int len);
    Matrix3D<float16_t> cos, sin;

   private:
    std::string profile_name = "RotaryPosEmb_cuda";
};

void load_RotaryPosEmb_cuda(RotaryPosEmb_cuda &op, std::string prefix);

__global__ void RotaryPosEmb_float_forward(Matrix3D<float> query, Matrix3D<float> key, Matrix3D<float> cos,
                                           Matrix3D<float> sin, int start_idx, int len);
__global__ void RotaryPosEmb_cuda_forward(Matrix3D<float16_t> query, Matrix3D<float16_t> key, Matrix3D<float16_t> cos,
                                          Matrix3D<float16_t> sin, int start_idx, int len);
__global__ void RotaryPosEmb_cuda_forward_shared(Matrix3D<float16_t> query, Matrix3D<float16_t> key,
                                                 Matrix3D<float16_t> cos, Matrix3D<float16_t> sin, int start_idx,
                                                 int len);
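
// Note (editorial annotation): the three kernel declarations above appear to be
// precision/optimization variants of the same rotation: a float reference, a
// half version, and a "_shared" half version that presumably stages the cos/sin
// rows in shared memory once per block. Which one is launched is decided in the
// .cu implementation.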
-------------------------------------------------------------------------------- 1 | /* 2 | Adapted from NVIDIA FasterTransformer: 3 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/reduce_kernel_utils.cuh 4 | */ 5 | 6 | #pragma once 7 | #include 8 | #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) 9 | #include 10 | #else 11 | #include 12 | #endif 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | static const float HALF_FLT_MAX = 65504.F; 19 | #define FINAL_MASK 0xffffffff 20 | 21 | 22 | template 23 | inline __device__ T add(T a, T b) { 24 | return a + b; 25 | } 26 | 27 | template<> 28 | inline __device__ half2 add(half2 a, half2 b) { 29 | return __hadd2(a, b); 30 | } 31 | 32 | template<> 33 | inline __device__ half add(half a, half b) { 34 | return __hadd(a, b); 35 | } 36 | 37 | template 38 | __inline__ __device__ T warpReduceSum(T val) 39 | { 40 | #pragma unroll 41 | for (int mask = 16; mask > 0; mask >>= 1) 42 | val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 43 | return val; 44 | } 45 | 46 | /* Calculate the sum of all elements in a block */ 47 | template 48 | __inline__ __device__ T blockReduceSum(T val) 49 | { 50 | static __shared__ T shared[32]; 51 | int lane = threadIdx.x & 0x1f; 52 | int wid = threadIdx.x >> 5; 53 | 54 | val = warpReduceSum(val); 55 | 56 | if (lane == 0) 57 | shared[wid] = val; 58 | 59 | __syncthreads(); 60 | 61 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 62 | // blockDim.x is not divided by 32 63 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 64 | val = warpReduceSum(val); 65 | 66 | return val; 67 | } 68 | 69 | 70 | template 71 | __device__ __forceinline__ T clamp_inf_for_half(const float input) 72 | { 73 | return input; 74 | } 75 | 76 | template<> 77 | __device__ __forceinline__ half clamp_inf_for_half(const float input) 78 | { 79 | // clamp inf values to enable fp16 training 80 | return input > 0.0f ? 
__float2half(min(input, 65504.F - 1000)) : __float2half(max(input, -65504.F + 1000)); 81 | } 82 | -------------------------------------------------------------------------------- /llm/include/profiler.h: -------------------------------------------------------------------------------- 1 | #include <chrono> 2 | #include <iostream> 3 | #include <map> 4 | #include <string> 5 | 6 | class Profiler { 7 | public: 8 | bool for_demo = false; 9 | static Profiler& getInstance() { 10 | static Profiler instance; 11 | return instance; 12 | } 13 | 14 | void start(const std::string& section) { start_times[section] = std::chrono::high_resolution_clock::now(); } 15 | 16 | void start(const std::string& section, const long long section_flops) { 17 | start_times[section] = std::chrono::high_resolution_clock::now(); 18 | if (flops.count(section) == 0) 19 | flops[section] = section_flops; 20 | else 21 | flops[section] += section_flops; 22 | } 23 | 24 | void reset() { 25 | start_times.clear(); 26 | durations.clear(); 27 | counts.clear(); 28 | flops.clear(); 29 | } 30 | 31 | void stop(const std::string& section) { 32 | auto end_time = std::chrono::high_resolution_clock::now(); 33 | auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_times[section]).count(); 34 | durations[section] += duration; 35 | counts[section]++; 36 | } 37 | 38 | void report_internal() const { 39 | if (for_demo) { 40 | for (const auto& entry : durations) { 41 | std::string row; 42 | std::cout << entry.first + ", "; 43 | float s = (float)(entry.second) / 1000000; 44 | float ts = (float)counts.at(entry.first); 45 | printf("Total time: %.1f s, %.1f ms/token, %.1f token/s, %d tokens\n\n", s, s / ts * 1000, ts / s, 46 | counts.at(entry.first)); 47 | } 48 | } else { 49 | std::cout << "Section, Total time(us), Average time(us), Count, GOPs:" << std::endl; 50 | for (const auto& entry : durations) { 51 | std::string row; 52 | row += entry.first + ", "; 53 | row += std::to_string(entry.second) + ", "; 54 | row += std::to_string(entry.second / counts.at(entry.first)) + ", "; 55 | if (flops.count(entry.first) == 0) 56 | row += std::to_string(counts.at(entry.first)) + ", N/A"; 57 | else { 58 | row += std::to_string(counts.at(entry.first)) + ", "; 59 | // (ops / microseconds) / 1000 = GOP/s 60 | row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); 61 | } 62 | std::cout << row << std::endl; 63 | } 64 | } 65 | } 66 | 67 | void report() const { 68 | #ifdef PROFILER 69 | report_internal(); 70 | #endif 71 | } 72 | 73 | private: 74 | Profiler() {} 75 | Profiler(const Profiler&) = delete; 76 | Profiler& operator=(const Profiler&) = delete; 77 | 78 | std::map<std::string, std::chrono::high_resolution_clock::time_point> start_times; 79 | std::map<std::string, long long> flops; 80 | std::map<std::string, long long> durations; 81 | std::map<std::string, int> counts; 82 | }; 83 | -------------------------------------------------------------------------------- /llm/mistral: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat Mistral_7B INT4 5 0 3 | -------------------------------------------------------------------------------- /llm/models/llama3_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/llama3_vocab.bin -------------------------------------------------------------------------------- /llm/models/llama_vocab.bin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/llama_vocab.bin -------------------------------------------------------------------------------- /llm/models/mistral_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/mistral_vocab.bin -------------------------------------------------------------------------------- /llm/models/starcoder_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/TinyChatEngine/80d7aff15718ae7d74e6bb3d74f06fa58a7af9b4/llm/models/starcoder_vocab.bin -------------------------------------------------------------------------------- /llm/scripts/chat-13b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat LLaMA2_13B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/chat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat LLaMA2_7B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./chat CodeLLaMA_7B_Instruct INT4 5 3 | -------------------------------------------------------------------------------- /llm/scripts/llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat LLaVA_7B INT4 6 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/vila.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA_7B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voice_llava.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v LLaVA_7B INT4 6 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voice_vila.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo
"=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v VILA_7B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/scripts/voicechat.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | ./chat -v LLaMA2_7B_chat INT4 5 3 | -------------------------------------------------------------------------------- /llm/src/interface.cc: -------------------------------------------------------------------------------- 1 | #include "interface.h" 2 | #include 3 | 4 | void set_print_black() { 5 | printf("\033[0;30m"); 6 | } 7 | 8 | void set_print_red() { 9 | printf("\033[1;31m"); 10 | } 11 | 12 | void set_print_yellow() { 13 | printf("\033[0;33m"); 14 | } 15 | 16 | void set_print_bold_yellow() { 17 | printf("\033[1;33m"); 18 | } 19 | 20 | void set_print_blue() { 21 | printf("\033[1;34m"); 22 | } 23 | 24 | void set_print_white() { 25 | printf("\033[0;37m"); 26 | } 27 | 28 | void set_print_reset() { 29 | printf("\033[0m"); 30 | } 31 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32CLIPEncoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32CLIPEncoder.h" 2 | #include "utils.h" 3 | 4 | #include 5 | #include 6 | 7 | Fp32CLIPEncoder::Fp32CLIPEncoder(std::string param_path, const struct model_config config) { 8 | // Load all the encoder layers 9 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 10 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." 
<< std::endl;) 11 | 12 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 13 | Fp32CLIPEncoderLayer layer = Fp32CLIPEncoderLayer(path, config, layer_idx); 14 | 15 | this->layers.push_back(layer); 16 | } 17 | }; 18 | 19 | // Fp32CLIPEncoder 20 | struct Fp32CLIPEncoder_output Fp32CLIPEncoder::forward(const struct Fp32CLIPEncoder_input &input) { 21 | PROFILE_START(profile_name); 22 | int sqlen = input.hidden_states.m_dim_y; 23 | 24 | // Go through each layer 25 | Matrix3D hidden_states = input.hidden_states; 26 | std::vector> past_keys, past_values; 27 | for (int i = 0; i < this->layers.size(); i++) { 28 | if (!input.has_past_keys_values) { 29 | struct Fp32CLIPEncoderLayer_input l_i = {hidden_states, input.attention_mask}; 30 | struct Fp32CLIPEncoderLayer_output l_o = this->layers[i].forward(l_i); 31 | hidden_states = l_o.hidden_states; 32 | past_keys.push_back(l_o.past_key_value.first); 33 | past_values.push_back(l_o.past_key_value.second); 34 | } else { 35 | struct Fp32CLIPEncoderLayer_input l_i = {hidden_states, input.attention_mask, input.past_keys[i], 36 | input.past_values[i]}; 37 | struct Fp32CLIPEncoderLayer_output l_o = this->layers[i].forward(l_i); 38 | hidden_states = l_o.hidden_states; 39 | past_keys.push_back(l_o.past_key_value.first); 40 | past_values.push_back(l_o.past_key_value.second); 41 | } 42 | } 43 | 44 | struct Fp32CLIPEncoder_output output = {hidden_states, past_keys, past_values}; 45 | PROFILE_END(profile_name); 46 | return output; 47 | } 48 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32GPTBigCodeForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32GPTBigCodeForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32GPTBigCodeForCausalLM::Fp32GPTBigCodeForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32GPTBigCodeDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32GPTBigCodeForCausalLM_output Fp32GPTBigCodeForCausalLM::forward(const struct Fp32GPTBigCodeForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32GPTBigCodeDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Fp32GPTBigCodeDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Fp32GPTBigCodeDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Fp32GPTBigCodeForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32OPTForCausalLM.cc: 
-------------------------------------------------------------------------------- 1 | #include "Fp32OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32OPTForCausalLM::Fp32OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32OPTForCausalLM_output Fp32OPTForCausalLM::forward(const struct Fp32OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Fp32OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Fp32OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Fp32OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32LlamaForCausalLM::Fp32LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32LlamaForCausalLM_output Fp32LlamaForCausalLM::forward(const struct Fp32LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | 20 | struct Fp32llamaDecoder_output decoder_output; 21 | 22 | // Call decoder 23 | if (input.has_past_keys_values) { 24 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | } else { 27 | struct Fp32llamaDecoder_input decoder_input; 28 | if (input.is_llava) { 29 | decoder_input = {input.input_ids, input.image_embed}; 30 | decoder_input.has_past_keys_values = false; 31 | decoder_input.is_llava = true; 32 | } else { 33 | decoder_input = {input.input_ids}; 34 | decoder_input.has_past_keys_values = false; 35 | decoder_input.is_llava = false; 36 | } 37 | decoder_output = this->decoder.forward(decoder_input); 38 | } 39 | 40 | // Get logits 41 | int sqlen; 42 | if (input.is_llava) { 43 | sqlen = input.input_ids.m_dim_z + input.image_embed.m_dim_y; 44 | } else { 45 | sqlen = input.input_ids.m_dim_z; 
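            // Text-only case: one logit row per prompt token. (In the LLaVA branch
            // above, the image-embedding rows are counted too, because the decoder's
            // last_hidden_state covers the image tokens as well.)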
46 | } 47 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 48 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 49 | 50 | struct Fp32LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 51 | PROFILE_END(profile_name); 52 | return LMoutput; 53 | } 54 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Int4GPTBigCodeForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4GPTBigCodeForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4GPTBigCodeForCausalLM::Int4GPTBigCodeForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(uint8_t) / 2); 11 | this->decoder = Int4GPTBigCodeDecoder(param_path + "/decoder", config); 12 | this->lm_head = 13 | Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), param_path + "/lm_head"); 14 | } 15 | 16 | struct Int4GPTBigCodeForCausalLM_output Int4GPTBigCodeForCausalLM::forward(std::string param_path, const struct Int4GPTBigCodeForCausalLM_input &input) { 17 | // printf(("Int4GPTBigCodeForCausalLM::forward\n"); 18 | PROFILE_START(profile_name); 19 | // printf(("Int4GPTBigCodeForCausalLM starts\n"); 20 | int sqlen = input.input_ids.m_dim_z; 21 | 22 | struct Int4GPTBigCodeDecoder_output decoder_output; 23 | // printf(("Before this->decoder.forward\n"); 24 | if (input.has_past_keys_values) { 25 | struct Int4GPTBigCodeDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | } else { 28 | // printf(("00000000\n"); 29 | struct Int4GPTBigCodeDecoder_input decoder_input = {input.input_ids}; 30 | // printf(("11111111\n"); 31 | decoder_output = this->decoder.forward(decoder_input); 32 | } 33 | 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Int4GPTBigCodeForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /llm/src/nn_modules/Int4OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4OPTForCausalLM::Int4OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(uint8_t) / 2); 11 | 12 | this->decoder = Int4OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4OPTForCausalLM_output Int4OPTForCausalLM::forward(const struct Int4OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int4OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 
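        // Generation step: only the newly sampled token ids are passed in; the
        // decoder reuses the cached past_keys/past_values instead of re-running
        // attention over the whole prompt.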
24 | struct Int4OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int4OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct Int4OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | OPTForCausalLM::OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Int8OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct OPTForCausalLM_output OPTForCausalLM::forward(const struct OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int8OPTDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Int8OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int8OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /llm/src/nn_modules/cuda/Int4llamaForCausalLM.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Int4llamaForCausalLM.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 8 | allocate_aligned_memory_gpu(logits_output_half, config.max_sqlen * config.vocsize * sizeof(float16_t)); 9 | allocate_aligned_memory_gpu(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory_gpu(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(int)) / 8); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_half_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 8), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(std::string param_path, const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = 
input.input_ids.m_dim_z; 20 | 21 | struct Int4llamaDecoder_output decoder_output; 22 | 23 | if (input.has_past_keys_values) { 24 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 26 | 27 | } else { 28 | struct Int4llamaDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 30 | } 31 | 32 | Matrix3D logits_half(logits_output_half, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits_half); 34 | 35 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 36 | int threadsPerBlock_1D = 1024; 37 | int blocksPerGrid = (sqlen * this->decoder.voc_size + threadsPerBlock_1D - 1) / threadsPerBlock_1D; 38 | half2float<<<blocksPerGrid, threadsPerBlock_1D>>>(logits_output_half, logits_output, sqlen * this->decoder.voc_size); 39 | 40 | cudaEvent_t event; 41 | cudaEventCreate(&event); 42 | cudaEventRecord(event, 0); 43 | cudaEventSynchronize(event); 44 | cudaEventDestroy(event); 45 | 46 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 47 | PROFILE_END(profile_name); 48 | 49 | return LMoutput; 50 | } 51 | 52 | void Int4LlamaForCausalLM::free_cuda_memory() { 53 | free_aligned_memory_gpu(logits_output_half); 54 | free_aligned_memory_gpu(logits_output); 55 | free_aligned_memory_gpu(lm_head_weight); 56 | } 57 | -------------------------------------------------------------------------------- /llm/src/nn_modules/non_cuda/Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(uint8_t)) / 2); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(std::string param_path, const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | 20 | struct Int4llamaDecoder_output decoder_output; 21 | 22 | // Call decoder 23 | if (input.has_past_keys_values) { 24 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 26 | } else { 27 | struct Int4llamaDecoder_input decoder_input; 28 | if (input.is_llava) { 29 | decoder_input = {input.input_ids, input.image_embed}; 30 | decoder_input.has_past_keys_values = false; 31 | decoder_input.is_llava = true; 32 | } else { 33 | decoder_input = {input.input_ids}; 34 | decoder_input.has_past_keys_values = false; 35 | decoder_input.is_llava = false; 36 | } 37 | decoder_output = this->decoder.forward(param_path + "/decoder", decoder_input); 38 | } 39 | 40 | // Get logits 41 | int sqlen; 42 | if (input.is_llava) { 43 | // sqlen = input.input_ids.m_dim_z + input.image_embed.m_dim_y + input.second_input_ids.m_dim_z;  // dead store: immediately overwritten by the next line 44 | sqlen = input.input_ids.m_dim_z +
input.image_embed.m_dim_y; 45 | } else { 46 | sqlen = input.input_ids.m_dim_z; 47 | } 48 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 49 | PROFILE_START("Int4LlamaForCausalLM::lm_head"); 50 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 51 | PROFILE_END("Int4LlamaForCausalLM::lm_head"); 52 | 53 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 54 | PROFILE_END(profile_name); 55 | return LMoutput; 56 | } 57 | -------------------------------------------------------------------------------- /llm/src/ops/BMM_S8T_S8N_F32T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_F32T::BMM_S8T_S8N_F32T(struct BMM_S8T_S8N_F32T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_F32T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | if (m == 1 && x.m_dim_x > 1) { 46 | // merge each batch 47 | params.A.row = x.m_dim_x; 48 | params.C.row = x.m_dim_x; 49 | // B is batched, need a new op for this! 
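        // Decode-time fast path: with m == 1, the b per-batch GEMVs all share one
        // shape, so the batch dim is folded into A's/C's row dimension and a single
        // *_batch kernel call covers every batch entry (the note above flags that
        // B must still be strided per output row inside that kernel).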
50 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(&params); 51 | } else { 52 | // process each batch 53 | for (int bz = 0; bz < x.m_dim_x; bz++) { 54 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(&params); 55 | params.A.int8_data_ptr += m * k; 56 | params.B.int8_data_ptr += k * n; 57 | params.C.data_ptr += m * n; 58 | } 59 | } 60 | 61 | PROFILE_END(profile_name); 62 | } 63 | -------------------------------------------------------------------------------- /llm/src/ops/BMM_S8T_S8N_S8T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_S8T::BMM_S8T_S8N_S8T(struct BMM_S8T_S8N_S8T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_S8T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.int8_data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | 46 | // process each batch 47 | if (m == 1 && x.m_dim_x > 1) { 48 | // merge each batch 49 | params.A.row = x.m_dim_x; 50 | params.C.row = x.m_dim_x; 51 | // B is batched, need a new op for this!
matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(&params); 53 | } else { 54 | for (int bz = 0; bz < x.m_dim_x; bz++) { 55 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(&params); 56 | params.A.int8_data_ptr += m * k; 57 | params.B.int8_data_ptr += k * n; 58 | params.C.int8_data_ptr += m * n; 59 | } 60 | } 61 | 62 | PROFILE_END(profile_name); 63 | } 64 | -------------------------------------------------------------------------------- /llm/src/ops/Gelu.cc: -------------------------------------------------------------------------------- 1 | #include "ops/Gelu.h" 2 | 3 | #include 4 | #include 5 | 6 | static const float GELU_COEF_A = 0.044715f; 7 | static const float GELU_QUICK_COEF = -1.702f; 8 | static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 9 | 10 | float Gelu_imp(float x) { 11 | return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x))); 12 | } 13 | 14 | void Gelu(Matrix3D a) { 15 | for (int i = 0; i < a.length(); i++) { 16 | a.m_data[i] = Gelu_imp(a.m_data[i]); 17 | } 18 | } 19 | 20 | float Gelu_quick_imp(float x) { 21 | return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x))); 22 | } 23 | 24 | void Gelu_quick(Matrix3D a) { 25 | for (int i = 0; i < a.length(); i++) { 26 | a.m_data[i] = Gelu_quick_imp(a.m_data[i]); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /llm/src/ops/LayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNorm(LayerNorm &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNorm::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 1e-5; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | float std_dev = sqrtl(squared_diff_sum / static_cast(x.m_dim_z) + eps); 39 | 40 | for (int k = 0; k < x.m_dim_z; k++) { 41 | float value = static_cast(x(i, j, k)); 42 | float fp_out = (((value - mean) / (std_dev)) * static_cast(weight(0, 0, k))) + 43 | static_cast(bias(0, 0, k)); 44 | output(i, j, k) = static_cast(fp_out); 45 | } 46 | } 47 | } 48 | PROFILE_END(profile_name); 49 | } 50 | -------------------------------------------------------------------------------- /llm/src/ops/LayerNormQ.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNormQ(LayerNormQ &op,
std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNormQ::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 0.00001; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | 39 | float var = squared_diff_sum / static_cast(x.m_dim_z); 40 | float std_dev = sqrt(var + eps); 41 | 42 | for (int k = 0; k < x.m_dim_z; k++) { 43 | float value = static_cast(x(i, j, k)); 44 | float fp_out = ((value - mean) / (std_dev) * static_cast(weight(0, 0, k))) + 45 | static_cast(bias(0, 0, k)); 46 | output(i, j, k) = static_cast(std::round(fp_out)); 47 | } 48 | } 49 | } 50 | 51 | PROFILE_END(profile_name); 52 | } 53 | -------------------------------------------------------------------------------- /llm/src/ops/LlamaRMSNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void LlamaRMSNorm::forward(const Matrix3D &x, Matrix3D &output, float eps) { 8 | PROFILE_START(profile_name); 9 | const int last_dims = 2; 10 | 11 | assert(last_dims == 2); // support the last dim for now 12 | assert(output.m_dim_x == x.m_dim_x); 13 | assert(output.m_dim_y == x.m_dim_y); 14 | assert(output.m_dim_z == x.m_dim_z); 15 | assert(x.m_dim_z == weight.m_dim_z); 16 | 17 | for (int i = 0; i < x.m_dim_x; i++) { // batches 18 | for (int j = 0; j < x.m_dim_y; j++) { // samples 19 | float var = 0; 20 | 21 | for (int k = 0; k < x.m_dim_z; k++) { // hidden states 22 | var += x(i, j, k) * x(i, j, k); 23 | } 24 | var /= static_cast(x.m_dim_z); 25 | float inv_std = 1.0 / sqrt(var + eps); 26 | 27 | for (int k = 0; k < x.m_dim_z; k++) { 28 | float value = static_cast(x(i, j, k)); 29 | float fp_out = (value * inv_std) * weight(0, 0, k); 30 | output(i, j, k) = fp_out; 31 | } 32 | } 33 | } 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /llm/src/ops/RotaryPosEmb.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | 5 | float q_buf[4096], k_buf[4096]; 6 | // TODO: optimize this with multithreading 7 | void RotaryPosEmb::forward(Matrix3D &query, Matrix3D &key, int start_idx, int len) { 8 | PROFILE_START(profile_name); 9 | int num_heads = query.m_dim_x; 10 | int num_kv_heads = key.m_dim_x; 11 | int head_embed = cos.m_dim_z; 12 | int max_sqlen = cos.m_dim_y; 13 | 14 | assert(query.m_dim_z == cos.m_dim_z); 15 | assert(key.m_dim_z == cos.m_dim_z); 16 |
assert(max_sqlen > len + start_idx); 17 | 18 | // cos, sin = self.rotary_emb(key_states, seq_len=kv_seq_len) 19 | // query_states, key_states = apply_rotary_pos_emb(query_states, key_states, 20 | // cos, sin, position_ids) cos = cos[position_ids].unsqueeze(1) # [bs, 1, 21 | // seq_len, dim] sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] 22 | // q_embed = (q * cos) + (rotate_half(q) * sin) 23 | // k_embed = (k * cos) + (rotate_half(k) * sin) 24 | // x1 = x[..., : x.shape[-1] // 2] 25 | // x2 = x[..., x.shape[-1] // 2 :] 26 | // rotate_half: torch.cat((-x2, x1), dim=-1) 27 | 28 | int half = head_embed / 2; 29 | // Query 30 | for (int b = 0; b < num_heads; b++) { 31 | for (int i = 0; i < len; i++) { 32 | // first half 33 | for (int j = 0; j < half; j++) { 34 | q_buf[j] = -1 * query(b, i, j + half); 35 | // k_buf[j] = -1 * key(b, i, j + half); 36 | } 37 | // second half 38 | for (int j = half; j < head_embed; j++) { 39 | q_buf[j] = query(b, i, j - half); 40 | // k_buf[j] = key(b, i, j - half); 41 | } 42 | 43 | for (int j = 0; j < head_embed; j++) { 44 | query(b, i, j) = ((query(b, i, j) * cos(0, i + start_idx, j)) + (q_buf[j] * sin(0, i + start_idx, j))); 45 | // key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + (k_buf[j] * sin(0, i + start_idx, j))); 46 | } 47 | } 48 | } 49 | 50 | // Key 51 | for (int b = 0; b < num_kv_heads; b++) { 52 | for (int i = 0; i < len; i++) { 53 | // first half 54 | for (int j = 0; j < half; j++) { 55 | k_buf[j] = -1 * key(b, i, j + half); 56 | } 57 | // second half 58 | for (int j = half; j < head_embed; j++) { 59 | k_buf[j] = key(b, i, j - half); 60 | } 61 | 62 | for (int j = 0; j < head_embed; j++) { 63 | key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + (k_buf[j] * sin(0, i + start_idx, j))); 64 | } 65 | } 66 | } 67 | 68 | PROFILE_END(profile_name); 69 | } 70 | -------------------------------------------------------------------------------- /llm/src/ops/W8A8B8O8Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 13 | } 14 | 15 | W8A8B8O8Linear::W8A8B8O8Linear(struct W8A8B8O8Linear_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = -128; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = n; 36 | } 37 | 38 | void 
W8A8B8O8Linear::forward(const Matrix3D &x, Matrix3D &output) { 39 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 40 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 41 | PROFILE_START_FLOPS(profile_name, ops); 42 | assert(output.m_dim_x == x.m_dim_x); 43 | assert(output.m_dim_y == x.m_dim_y); 44 | assert(output.m_dim_z == params.B.column); 45 | assert(x.m_dim_z == params.B.row); 46 | assert(output.m_dim_z == params.bias.column); 47 | 48 | params.A.row = m; 49 | params.A.column = k; 50 | params.A.int8_data_ptr = x.m_data; 51 | params.C.row = m; 52 | params.C.column = n; 53 | params.C.int8_data_ptr = output.m_data; 54 | params.A.qparams.scale = alpha; 55 | params.alpha = alpha; 56 | params.beta = beta; 57 | 58 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 59 | 60 | // printf("W8A8B8O8Linear-m,n,k: %d, %d, %d\n", m,n,k); 61 | if (m == 1) { 62 | // params.opt_params.num_thread = 8; 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(&params); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(&params); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /llm/src/ops/W8A8BFP32OFP32Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | } 11 | 12 | W8A8BFP32OFP32Linear::W8A8BFP32OFP32Linear(struct W8A8BFP32OFP32Linear_params &op_params) { 13 | Matrix3D weight = op_params.weight; 14 | Matrix3D bias = op_params.bias; 15 | alpha = op_params.alpha; 16 | 17 | int k = weight.m_dim_z, n = weight.m_dim_y; 18 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 19 | params.B.qparams.scale = 1.0; 20 | params.C.qparams.scale = 1.0; 21 | params.A.qparams.zero_point = 0; 22 | params.B.row = k; 23 | params.B.column = n; 24 | params.B.int8_data_ptr = weight.m_data; 25 | params.B.qparams.zero_point = 0; 26 | params.C.column = n; 27 | params.C.qparams.zero_point = 0; 28 | params.opt_params.blk_size = BLK_SIZE; 29 | params.opt_params.num_thread = NUM_THREAD; 30 | params.bias.data_ptr = bias.m_data; 31 | params.bias.row = 1; 32 | params.bias.column = bias.m_dim_z; 33 | } 34 | 35 | void W8A8BFP32OFP32Linear::forward(const Matrix3D &x, Matrix3D &output) { 36 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 37 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 38 | PROFILE_START_FLOPS(profile_name, ops); 39 | assert(output.m_dim_x == x.m_dim_x); 40 | assert(output.m_dim_y == x.m_dim_y); 41 | assert(output.m_dim_z == params.B.column); 42 | assert(x.m_dim_z == params.B.row); 43 | assert(output.m_dim_z ==
params.bias.column); 44 | 45 | params.A.row = m; 46 | params.A.column = k; 47 | params.A.int8_data_ptr = x.m_data; 48 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 49 | params.C.row = m; 50 | params.C.column = n; 51 | params.C.data_ptr = output.m_data; 52 | params.C.qparams.zero_point = 0; 53 | params.alpha = alpha; 54 | 55 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 56 | 57 | if (m == 1) { 58 | // let's loop over the column dim instead of row 59 | for (int bz = 0; bz < x.m_dim_x; bz++) { 60 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(&params); 61 | params.A.int8_data_ptr += m * k; 62 | params.C.data_ptr += m * n; 63 | } 64 | } else { 65 | for (int bz = 0; bz < x.m_dim_x; bz++) { 66 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(&params); 67 | params.A.int8_data_ptr += m * k; 68 | params.C.data_ptr += m * n; 69 | } 70 | } 71 | 72 | PROFILE_END(profile_name); 73 | } 74 | -------------------------------------------------------------------------------- /llm/src/ops/arg_max.cc: -------------------------------------------------------------------------------- 1 | #include "ops/arg_max.h" 2 | 3 | #include 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output) { 6 | int bz = input.m_dim_x; 7 | int sqlen = input.m_dim_y; 8 | int voc_size = input.m_dim_z; 9 | 10 | assert(sqlen == output.m_dim_z); 11 | assert(bz == output.m_dim_x); 12 | 13 | for (int b = 0; b < bz; b++) { 14 | for (int i = 0; i < sqlen; i++) { 15 | float max = FLOAT_MIN; 16 | int max_idx = -1; 17 | for (int j = 0; j < voc_size; j++) { 18 | float v = input(b, i, j); 19 | if (max < v) { 20 | max = v; 21 | max_idx = j; 22 | } 23 | } 24 | output(b, 0, i) = max_idx; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /llm/src/ops/batch_add.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output) { 4 | PROFILE_START("batch_Add"); 5 | assert(input.m_dim_y == input2.m_dim_y); 6 | assert(input.m_dim_z == input2.m_dim_z); 7 | assert(input.m_dim_x == output.m_dim_x); 8 | assert(input.m_dim_y == output.m_dim_y); 9 | assert(input.m_dim_z == output.m_dim_z); 10 | 11 | if (input.m_dim_x != input2.m_dim_x && input2.m_dim_x == 1) { 12 | // Broadcast input2 (batch size 1) across input's batch dimension 13 | for (int i = 0; i < input.m_dim_x; i++) { 14 | for (int j = 0; j < input.m_dim_y; j++) { 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | output(i, j, k) = input(i, j, k) + input2(0, j, k); 17 | } 18 | } 19 | } 20 | } else { 21 | throw("Unsupported dimension for batch_Add"); 22 | } 23 | PROFILE_END("batch_Add"); 24 | } 25 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/RotaryPosEmb.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | __global__ void RotaryPosEmb_cuda_forward(Matrix3D query, Matrix3D key, Matrix3D cos, Matrix3D sin, int start_idx, int len) { 5 | half query_buf[128], key_buf[128]; 6 | 7 | int num_heads = query.m_dim_x; 8 | int head_embed = cos.m_dim_z; 9 | int half_pos = head_embed / 2; 10 | 11 | // Map the block index to a head and the thread index to a token position 12 | int b = blockIdx.x; 13 | int i = threadIdx.x; 14 | 15 | if(b < num_heads && i < len) { 16 | for(int j = 0; j < half_pos; j++) { 17 | query_buf[j] = __hneg(query(b, i, j + half_pos)); 18 |
key_buf[j] = __hneg(key(b, i, j + half_pos)); 19 | } 20 | 21 | for(int j = half_pos; j < head_embed; j++) { 22 | query_buf[j] = query(b, i, j - half_pos); 23 | key_buf[j] = key(b, i, j - half_pos); 24 | } 25 | 26 | for(int j = 0; j < head_embed; j++) { 27 | half cos_half = cos(0, i + start_idx, j); 28 | half sin_half = sin(0, i + start_idx, j); 29 | 30 | query(b, i, j) = __hfma(query(b, i, j), cos_half, __hmul(query_buf[j], sin_half)); 31 | key(b, i, j) = __hfma(key(b, i, j), cos_half, __hmul(key_buf[j], sin_half)); 32 | } 33 | } 34 | } 35 | 36 | __global__ void RotaryPosEmb_cuda_forward_shared(Matrix3D query, Matrix3D key, Matrix3D cos, Matrix3D sin, int start_idx, int len) { 37 | extern __shared__ half shared_memory[]; 38 | 39 | half *query_buf = &shared_memory[0]; 40 | half *key_buf = &shared_memory[4096]; 41 | 42 | int num_heads = query.m_dim_x; 43 | int head_embed = cos.m_dim_z; 44 | int half_pos = head_embed / 2; 45 | 46 | int b = blockIdx.x; 47 | int i = threadIdx.x; 48 | 49 | if(b < num_heads && i < len) { 50 | // Load data into shared memory for faster access. 51 | for(int j = 0; j < half_pos; j++) { 52 | query_buf[threadIdx.x * head_embed + j] = __hneg(query(b, i, j + half_pos)); 53 | key_buf[threadIdx.x * head_embed + j] = __hneg(key(b, i, j + half_pos)); 54 | } 55 | 56 | for(int j = half_pos; j < head_embed; j++) { 57 | query_buf[threadIdx.x * head_embed + j] = query(b, i, j - half_pos); 58 | key_buf[threadIdx.x * head_embed + j] = key(b, i, j - half_pos); 59 | } 60 | 61 | __syncthreads(); // Synchronize to ensure all data is loaded before processing. 62 | 63 | for(int j = 0; j < head_embed; j++) { 64 | half cos_half = cos(0, i + start_idx, j); 65 | half sin_half = sin(0, i + start_idx, j); 66 | 67 | // Use the __hfma intrinsic function for faster multiply-add operations. 
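                // Computes q' = q*cos + rotate_half(q)*sin and k' = k*cos + rotate_half(k)*sin,
                // matching the fp32 reference implementation in RotaryPosEmb.cc.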
68 | query(b, i, j) = __hfma(query(b, i, j), cos_half, __hmul(query_buf[threadIdx.x * head_embed + j], sin_half)); 69 | key(b, i, j) = __hfma(key(b, i, j), cos_half, __hmul(key_buf[threadIdx.x * head_embed + j], sin_half)); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/batch_add.cu: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | // __global__ void batch_Add_float(Matrix3D input, Matrix3D input2, Matrix3D output) { 4 | // int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | // int j = blockIdx.y * blockDim.y + threadIdx.y; 6 | // int k = blockIdx.z * blockDim.z + threadIdx.z; 7 | 8 | // if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z) { 9 | // output(i, j, k) = input(i, j, k) + input2(0, j, k); 10 | // } 11 | // } 12 | 13 | __global__ void batch_Add_cuda(Matrix3D input, Matrix3D input2, Matrix3D output) { 14 | int i = blockIdx.x * blockDim.x + threadIdx.x; 15 | int j = blockIdx.y * blockDim.y + threadIdx.y; 16 | int k = blockIdx.z * blockDim.z + threadIdx.z; 17 | 18 | //// half version 19 | if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z) { 20 | output(i, j, k) = __hadd(input(i, j, k), input2(0, j, k)); 21 | } 22 | } 23 | 24 | __global__ void batch_Add_cuda_half2(Matrix3D input, Matrix3D input2, Matrix3D output) { 25 | int i = blockIdx.x * blockDim.x + threadIdx.x; 26 | int j = blockIdx.y * blockDim.y + threadIdx.y; 27 | int k = blockIdx.z * blockDim.z + threadIdx.z; 28 | 29 | if (i < input.m_dim_x && j < input.m_dim_y && k < input.m_dim_z / 2) { 30 | half2* input_half2 = reinterpret_cast(input.m_data); 31 | half2* input2_half2 = reinterpret_cast(input2.m_data); 32 | half2* output_half2 = reinterpret_cast(output.m_data); 33 | int input_half2_dim_y = input.m_dim_y; 34 | int input_half2_dim_z = input.m_dim_z / 2; 35 | // int input2_half2_dim_y = input2.m_dim_y; 36 | int input2_half2_dim_z = input2.m_dim_z / 2; 37 | int output_half2_dim_y = output.m_dim_y; 38 | int output_half2_dim_z = output.m_dim_z / 2; 39 | 40 | output_half2[i * output_half2_dim_y * output_half2_dim_z + j * output_half2_dim_z + k] = 41 | __hadd2(input_half2[i * input_half2_dim_y * input_half2_dim_z + j * input_half2_dim_z + k], 42 | input2_half2[j * input2_half2_dim_z + k]); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/embedding.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | __global__ void EmbeddingKernel(Matrix3D input_id, Matrix3D output, float* lookup, int embed_dim) { 7 | int i = blockIdx.x * blockDim.x + threadIdx.x; 8 | 9 | if (i < input_id.m_dim_z) { 10 | int token_id = input_id(0, 0, i); 11 | half* output_sample_ptr = &output.m_data[i * embed_dim]; 12 | float* target_embed = &lookup[token_id * embed_dim]; 13 | 14 | for (int j = 0; j < embed_dim; ++j) { 15 | output_sample_ptr[j] = __float2half(target_embed[j]); 16 | } 17 | } 18 | } 19 | 20 | void load_Embedding_params_cuda(Embedding_cuda& op, std::string prefix) { 21 | op.lookup.load((prefix + "/weight.bin").c_str()); 22 | } 23 | 24 | void Embedding_cuda::forward(Matrix3D input_id, Matrix3D output) { 25 | PROFILE_START(profile_name); 26 | assert(input_id.m_dim_x == 1); 27 | assert(input_id.m_dim_y == 1); 28 | assert(input_id.m_dim_z == output.m_dim_y); 29 | assert(output.m_dim_z == this->embed_dim); 30 | 31 | 
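    // Launch one thread per token id; the ceil-division below sizes the grid
    // (e.g. a 9-token prompt with 1024-thread blocks needs a single block).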
int threadsPerBlock = 1024; 32 | int blocksPerGrid = (input_id.m_dim_z + threadsPerBlock - 1) / threadsPerBlock; 33 | EmbeddingKernel<<<blocksPerGrid, threadsPerBlock>>>(input_id, output, this->lookup.m_data, this->embed_dim); 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/linear.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | #include "utils.h" 4 | 5 | void Linear_half_int4::forward(const Matrix3D &x, Matrix3D &output) { 6 | const int num_thread = 8; 7 | Matrix3D b = this->weight; 8 | PROFILE_START(profile_name); 9 | 10 | // a: m x k b: n x k c: m x n 11 | assert(output.m_dim_x == 1); 12 | assert(output.m_dim_y == x.m_dim_y); 13 | // assert(output.m_dim_z == weight.m_dim_y); 14 | // assert(x.m_dim_z / 8 == weight.m_dim_z); 15 | 16 | assert(output.m_dim_z > num_thread); 17 | assert(output.m_dim_z % (num_thread * 2) == 0); // unroll column by 2 18 | 19 | struct matmul_params params; 20 | params.A.row = x.m_dim_y; 21 | params.A.column = x.m_dim_z; 22 | params.A.half_data_ptr = x.m_data; 23 | params.B.row = b.m_dim_z; // k 24 | params.B.column = b.m_dim_y; // n 25 | params.B.int32_data_ptr = b.m_data; 26 | params.C.row = output.m_dim_y; 27 | params.C.column = output.m_dim_z; 28 | params.C.half_data_ptr = output.m_data; 29 | params.opt_params.num_thread = num_thread; 30 | params.half_scales = this->scale.m_data; 31 | // params.offset = this->offset.m_data; // TODO: Currently, we don't need offset 32 | params.int32_zero_point = this->zero_point.m_data; 33 | params.block_size = QK; 34 | 35 | matmul::MatmulOperator op = matmul::MatmulOperator(); 36 | op.gemv_forward_cuda(&params); 37 | 38 | PROFILE_END(profile_name); 39 | return; 40 | } 41 | 42 | 43 | void Linear_FP16_int4_ref::forward_ref(const Matrix3D &a, Matrix3D &c) { 44 | Matrix3D b = this->weight; 45 | PROFILE_START(profile_name); 46 | 47 | // a: m x k b: n x k c: m x n 48 | assert(a.m_dim_x == b.m_dim_x); // batch dim 49 | assert(a.m_dim_z == b.m_dim_z); // k 50 | assert(a.m_dim_y == c.m_dim_y); // m 51 | assert(b.m_dim_y == c.m_dim_z / 8); // n 52 | 53 | // batch dim == 1 only support MM for now 54 | assert(a.m_dim_x == 1); 55 | assert(b.m_dim_x == 1); 56 | 57 | struct matmul_params params; 58 | params.A.row = a.m_dim_y; 59 | params.A.column = a.m_dim_z; 60 | params.A.fp16_data_ptr = a.m_data; 61 | params.B.row = b.m_dim_z; 62 | params.B.column = b.m_dim_y; 63 | params.B.int32_data_ptr = b.m_data; 64 | params.C.row = c.m_dim_y; 65 | params.C.column = c.m_dim_z; 66 | params.C.fp16_data_ptr = c.m_data; 67 | params.fp16_scales = this->scale.m_data; 68 | // params.offset = this->offset.m_data; // TODO: Currently, we don't need offset 69 | params.int32_zero_point = this->zero_point.m_data; 70 | params.block_size = QK; 71 | 72 | matmul::MatmulOperator op = matmul::MatmulOperator(); 73 | op.naive_mat_mul_fp16_int4((const struct matmul_params *)&params); 74 | 75 | PROFILE_END(profile_name); 76 | return; 77 | } 78 | -------------------------------------------------------------------------------- /llm/src/ops/cuda/softmax.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | __global__ void softmax_cuda(Matrix3D input, Matrix3D output) { 5 | // Calculate indices i, j in the input array 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | int j = blockIdx.y * blockDim.y + threadIdx.y; 8 | 9 | if (i < input.m_dim_x && j <
input.m_dim_y) { 10 | // half max_value = __float2half(-INFINITY); 11 | half max_value = -65504;  // -65504 is the lowest finite half value 12 | half sum = 0; 13 | 14 | // Find the maximum value in the input array 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | half value = input(i, j, k); 17 | #if defined(__CUDA_ARCH__) 18 | #if __CUDA_ARCH__ >= 860  // Compute Capability >= 8.6 19 | max_value = __hmax(max_value, value); 20 | #else 21 | max_value = __hgt(max_value, value) ? max_value : value; 22 | #endif 23 | #endif 24 | } 25 | 26 | // Compute the sum 27 | for (int k = 0; k < input.m_dim_z; k++) { 28 | half value = input(i, j, k); 29 | // atomicAdd(&sum, value); 30 | sum = __hadd(sum, hexp(__hsub(value, max_value))); 31 | // sum = __hfma(__hsub(value, max_value), sum, sum);  // TODO: Check if this is correct and faster 32 | } 33 | 34 | // Compute the final softmax values 35 | for (int k = 0; k < input.m_dim_z; k++) { 36 | half value = input(i, j, k); 37 | output(i, j, k) = __hdiv(hexp(__hsub(value, max_value)), sum); 38 | } 39 | } 40 | } 41 |
-------------------------------------------------------------------------------- /llm/src/ops/embedding.cc: --------------------------------------------------------------------------------
1 | #include <cstring> 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_Embedding_params(Embedding& op, std::string prefix) { 7 | op.lookup.load((prefix + "/weight.bin").c_str()); 8 | // read_to_array((prefix + "/weight.bin").c_str(), op.lookup.m_data, op.lookup.length()); 9 | } 10 | 11 | void Embedding::forward(Matrix3D<int> input_id, Matrix3D<float> output) { 12 | PROFILE_START(profile_name); 13 | assert(input_id.m_dim_x == 1); 14 | assert(input_id.m_dim_y == 1); 15 | assert(input_id.m_dim_z == output.m_dim_y); 16 | assert(output.m_dim_z == this->embed_dim); 17 | 18 | for (int i = 0; i < input_id.m_dim_z; i++) { 19 | int token_id = input_id(0, 0, i); 20 | float* output_sample_ptr = &output.m_data[i * this->embed_dim]; 21 | float* target_embed = &this->lookup.m_data[token_id * this->embed_dim]; 22 | memcpy(output_sample_ptr, target_embed, sizeof(float) * this->embed_dim); 23 | } 24 | PROFILE_END(profile_name); 25 | } 26 |
-------------------------------------------------------------------------------- /llm/src/ops/softmax.cc: --------------------------------------------------------------------------------
1 | #include <cmath> 2 | 3 | #include "operators.h" 4 | 5 | void softmax(const Matrix3D<float> &input, Matrix3D<float> &output, const int dim) { 6 | PROFILE_START("softmax"); 7 | int len = input.length(); 8 | 9 | if (dim == 2) { 10 | // Apply softmax along the last dimension for each (i, j) row 11 | for (int i = 0; i < input.m_dim_x; i++) { 12 | for (int j = 0; j < input.m_dim_y; j++) { 13 | float max_value = input(i, j, 0); 14 | float sum = 0; 15 | // Find the maximum value in the input array 16 | for (int k = 0; k < input.m_dim_z; k++) { 17 | float value = input(i, j, k); 18 | if (value > max_value) { 19 | max_value = value; 20 | } 21 | } 22 | 23 | // Compute the sum of the exponentials 24 | for (int k = 0; k < input.m_dim_z; k++) { 25 | float value = input(i, j, k); 26 | sum += std::exp(value - max_value); 27 | } 28 | 29 | // Normalize the softmax values and store them in the output array 30 | for (int k = 0; k < input.m_dim_z; k++) { 31 | float value = input(i, j, k); 32 | float final_v = (std::exp(value - max_value) / (sum + 1e-10)); 33 | output(i, j, k) = final_v; 34 | } 35 | } 36 | } 37 | } else { 38 | throw("Unsupported dimension for softmax"); 39 | } 40 | PROFILE_END("softmax"); 41 | } 42 |
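A minimal usage sketch of the reference float softmax above (illustrative only; the softmax_example function and its buffers are not part of the repo):

#include "operators.h"

// Softmax over the last dimension (dim == 2) of a 1 x 1 x 4 tensor.
// Matrix3D<float> wraps a caller-owned buffer, as in the tests below.
void softmax_example() {
    float in_buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out_buf[4] = {0.0f};
    Matrix3D<float> input(in_buf, 1, 1, 4);
    Matrix3D<float> output(out_buf, 1, 1, 4);
    softmax(input, output, 2);
    // output(0, 0, k) now holds exp(in_buf[k] - 4.0f) / (sum + 1e-10)
}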
-------------------------------------------------------------------------------- /llm/tests/cuda/test_Int4llamaForCausalLM.cu: --------------------------------------------------------------------------------
1 | #include <cstring> 2 | #include <iostream> 3 | 4 | #include "Int4llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | int NUM_THREAD = 8; 9 | 10 | static void Int4LLaMAFreeMemory() { 11 | // Int4LlamaForCausalLM 12 | Int4LlamaForCausalLM LlamaForCausalLM; 13 | LlamaForCausalLM.free_cuda_memory(); 14 | 15 | // Int4llamaDecoder 16 | Int4llamaDecoder llamaDecoder; 17 | llamaDecoder.free_cuda_memory(); 18 | 19 | // Int4llamaDecoderLayer 20 | Int4llamaDecoderLayer llamaDecoderLayer; 21 | llamaDecoderLayer.free_cuda_memory(); 22 | 23 | // Int4llamaAttention 24 | Int4llamaAttention llamaAttention; 25 | llamaAttention.free_cuda_memory(); 26 | } 27 | 28 | void test_Int4LlamaForCausalLM() { 29 | struct model_config config = get_opt_model_config(LLaMA_7B); 30 | const int voc_size = config.vocsize, sqlen = 9, b = 1; 31 | 32 | // reasoning phase: 1st run 33 | int* buffer_1; 34 | cudaMallocManaged(&buffer_1, sizeof(int) * sqlen); 35 | Matrix3D<int> input_ids(buffer_1, b, 1, sqlen); 36 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 37 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 38 | 39 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("INT4/models/LLaMA_7B_2_chat", config); 40 | struct Int4LlamaForCausalLM_output output_1st = model.forward("INT4/models/LLaMA_7B_2_chat", input_1st); 41 | 42 | float* buffer_2; 43 | cudaMallocManaged(&buffer_2, sizeof(float) * b * sqlen * voc_size); 44 | Matrix3D<float> logits(buffer_2, b, sqlen, voc_size); 45 | logits.load("assets/llama/tests/model/1st_logits_cuda.bin"); 46 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 47 | 48 | Profiler::getInstance().report(); 49 | Profiler::getInstance().reset(); 50 | 51 | // generating phase: 2nd run 52 | int* buffer_3; 53 | cudaMallocManaged(&buffer_3, sizeof(int) * sqlen); 54 | Matrix3D<int> input_ids_2nd(buffer_3, b, 1, 1); 55 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 56 | 57 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 58 | struct Int4LlamaForCausalLM_output output_2nd = model.forward("INT4/models/LLaMA_7B_2_chat", input_2nd); 59 | 60 | float* buffer_4; 61 | cudaMallocManaged(&buffer_4, sizeof(float) * b * 1 * voc_size); 62 | logits = Matrix3D<float>(buffer_4, b, 1, voc_size); 63 | logits.load("assets/llama/tests/model/2nd_logits_cuda.bin"); 64 | 65 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 66 | 67 | Profiler::getInstance().report(); 68 | 69 | if (!success) 70 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 71 | else 72 | std::cout << "-------- Test of " << __func__ << ": Passed!
-------- " << std::endl; 73 | 74 | // Free memory 75 | free_aligned_memory_gpu(buffer_1); 76 | free_aligned_memory_gpu(buffer_2); 77 | free_aligned_memory_gpu(buffer_3); 78 | free_aligned_memory_gpu(buffer_4); 79 | Int4LLaMAFreeMemory(); 80 | } 81 | 82 | int main() { test_Int4LlamaForCausalLM(); } 83 | -------------------------------------------------------------------------------- /llm/tests/non_cuda/test_Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../utils_memalloc.h" 5 | #include "Int4llamaForCausalLM.h" 6 | #include "operators.h" 7 | #include "utils.h" 8 | 9 | int NUM_THREAD = 8; 10 | 11 | void test_Int4LlamaForCausalLM() { 12 | struct model_config config = get_opt_model_config(LLaMA_7B); 13 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 600, b = 1, 14 | hidden_dim = config.hidden_dim; 15 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 16 | MemoryAllocator mem_buf; 17 | 18 | // reasoning phase: 1st run 19 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 20 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 21 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 22 | 23 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("models/LLaMA_7B", config); 24 | 25 | struct Int4LlamaForCausalLM_output output_1st = model.forward("models/LLaMA_7B", input_1st); 26 | 27 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 28 | logits.load("assets/llama/tests/model/1st_logits.bin"); 29 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 30 | // print_first_k_elelment("G", logits.m_data, 20); 31 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 32 | 33 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 34 | embed_dim / num_heads); 35 | Profiler::getInstance().report(); 36 | Profiler::getInstance().reset(); 37 | 38 | // generating phase: 2nd run 39 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 40 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 41 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 42 | 43 | struct Int4LlamaForCausalLM_output output_2nd; 44 | for (int i = 0; i < 10; i++) output_2nd = model.forward("models/LLaMA_7B", input_2nd); 45 | 46 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 47 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 48 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 49 | // print_first_k_elelment("G", logits.m_data, 20); 50 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 51 | 52 | Profiler::getInstance().report(); 53 | if (!success) 54 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 55 | else 56 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 57 | } 58 | 59 | int main() { 60 | // This tests are directly from fp32 and are not completed yet! 
61 | test_Int4LlamaForCausalLM(); 62 | } 63 |
-------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTAttention.cc: --------------------------------------------------------------------------------
1 | #include "Fp32OPTAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTAttention() { 9 | const int num_heads = 12, embed_dim = 768, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32OPTAttention::initialized_memory(get_opt_model_config(OPT_125M)); 13 | Fp32OPTAttention attn = 14 | Fp32OPTAttention("FP32/models/OPT_125m/decoder/layer0/self_attn", get_opt_model_config(OPT_125M)); 15 | 16 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 17 | hidden_states.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_input.bin"); 18 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 19 | attention_mask.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_mask.bin"); 20 | struct Fp32OPTAttention_input input(hidden_states, attention_mask, 0); 21 | 22 | struct Fp32OPTAttention_output output = attn.forward(input); 23 | 24 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | attn_outputGT.load("assets/OPT/tests/attn/OPT_125m/Fp32_atten_output.bin"); 26 | 27 | bool success = check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, b * sqlen * embed_dim); 28 | if (!success) 29 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 30 | else 31 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 32 | } 33 | 34 | void test_Fp32OPTAttention_1_3B() { 35 | const int embed_dim = 2048, sqlen = 2, b = 1; 36 | MemoryAllocator mem_buf; 37 | 38 | Fp32OPTAttention::initialized_memory(get_opt_model_config(OPT_1_3B)); 39 | Fp32OPTAttention attn = 40 | Fp32OPTAttention("FP32/models/OPT_1.3B/decoder/layer0/self_attn", get_opt_model_config(OPT_1_3B)); 41 | 42 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 43 | hidden_states.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_input.bin"); 44 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 45 | attention_mask.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_mask.bin"); 46 | struct Fp32OPTAttention_input input(hidden_states, attention_mask, 0); 47 | 48 | struct Fp32OPTAttention_output output = attn.forward(input); 49 | 50 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 51 | attn_outputGT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_atten_output.bin"); 52 | 53 | bool success = check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, b * sqlen * embed_dim, 1e-5); 54 | if (!success) 55 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 56 | else 57 | std::cout << "-------- Test of " << __func__ << ": Passed!
-------- " << std::endl; 58 | } 59 | 60 | int main() { 61 | test_Fp32OPTAttention(); 62 | test_Fp32OPTAttention_1_3B(); 63 | } 64 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTDecoder.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTDecoder() { 9 | const int embed_dim = 2048, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 13 | input_ids.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_input_ids.bin"); 14 | struct Fp32OPTDecoder_input input_1st = {input_ids}; 15 | 16 | Fp32OPTDecoder decoder = Fp32OPTDecoder("FP32/models/OPT_1.3B/decoder", get_opt_model_config(OPT_1_3B)); 17 | 18 | struct Fp32OPTDecoder_output output_1st = decoder.forward(input_1st); 19 | 20 | // reasoning phase: 1st run 21 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 22 | last_hidden_state1_GT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_last_hidden_state.bin"); 23 | 24 | print_first_k_elelment("Fp32_decoder_1st_last_hidden_state", last_hidden_state1_GT.m_data, 20); 25 | print_first_k_elelment("output_1st", output_1st.last_hidden_state.m_data, 20); 26 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 27 | last_hidden_state1_GT.length(), 1e-5); 28 | 29 | if (!success) 30 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 31 | else 32 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 33 | } 34 | 35 | int main() { test_Fp32OPTDecoder(); } 36 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTDecoderLayer() { 9 | const int num_heads = 12, embed_dim = 768, sqlen = 2, b = 1; 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32OPTDecoderLayer layer = 13 | Fp32OPTDecoderLayer("FP32/models/OPT_125m/decoder/layer0", get_opt_model_config(OPT_125M), 0); 14 | 15 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 16 | hidden_states.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_input.bin"); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_mask.bin"); 19 | struct Fp32OPTDecoderLayer_input input(hidden_states, attention_mask); 20 | 21 | struct Fp32OPTDecoderLayer_output output = layer.forward(input); 22 | 23 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | attn_outputGT.load("assets/OPT/tests/attn/OPT_125m/Fp32_layer_output.bin"); 25 | 26 | bool success = check_two_equal(attn_outputGT.m_data, output.hidden_states.m_data, b * sqlen * embed_dim, 1e-4); 27 | if (!success) 28 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 29 | else 30 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 31 | } 32 | 33 | void test_Fp32OPTDecoderLayer_1_3B() { 34 | const int embed_dim = 2048, sqlen = 2, b = 1; 35 | MemoryAllocator mem_buf; 36 | 37 | Fp32OPTDecoderLayer layer = 38 | Fp32OPTDecoderLayer("FP32/models/OPT_1.3B/decoder/layer0", get_opt_model_config(OPT_1_3B), 0); 39 | 40 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 41 | hidden_states.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_input.bin"); 42 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 43 | attention_mask.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_mask.bin"); 44 | struct Fp32OPTDecoderLayer_input input(hidden_states, attention_mask); 45 | 46 | struct Fp32OPTDecoderLayer_output output = layer.forward(input); 47 | 48 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 49 | attn_outputGT.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_layer_output.bin"); 50 | 51 | bool success = check_two_equal(attn_outputGT.m_data, output.hidden_states.m_data, b * sqlen * embed_dim, 1e-4); 52 | if (!success) 53 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 54 | else 55 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 56 | } 57 | 58 | int main() { 59 | test_Fp32OPTDecoderLayer(); 60 | test_Fp32OPTDecoderLayer_1_3B(); 61 | } 62 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32OPTForCausalLM.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | int NUM_THREAD = 8; 7 | 8 | void test_Fp32OPTForCausalLM() { 9 | struct model_config config = get_opt_model_config(OPT_1_3B); 10 | const int embed_dim = config.embed_dim, sqlen = 2, b = 1; 11 | const int voc_size = config.vocsize; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | input_ids.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_decoder_1st_input_ids.bin"); 16 | struct Fp32OPTForCausalLM_input input_1st = {input_ids}; 17 | 18 | Fp32OPTForCausalLM model = Fp32OPTForCausalLM("FP32/models/OPT_1.3B", get_opt_model_config(OPT_1_3B)); 19 | 20 | struct Fp32OPTForCausalLM_output output_1st = model.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 24 | logits.load("assets/OPT/tests/attn/OPT_1.3B/Fp32_causallm_logits.bin"); 25 | 26 | // print_first_k_elelment("logits", logits.m_data, 20); 27 | // print_first_k_elelment("output_1st.logits.m_data", output_1st.logits.m_data, 20); 28 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-5); 29 | 30 | if (!success) 31 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 32 | else 33 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 34 | } 35 | 36 | int main() { test_Fp32OPTForCausalLM(); } 37 | -------------------------------------------------------------------------------- /llm/tests/test_Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Fp32llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | int NUM_THREAD = 8; 10 | 11 | void test_Fp32LlamaForCausalLM() { 12 | struct model_config config = get_opt_model_config(LLaMA_7B); 13 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 14 | hidden_dim = config.hidden_dim; 15 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 16 | MemoryAllocator mem_buf; 17 | 18 | // reasoning phase: 1st run 19 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 20 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 21 | struct Fp32LlamaForCausalLM_input input_1st = {input_ids}; 22 | 23 | Fp32LlamaForCausalLM model = Fp32LlamaForCausalLM("models/LLaMA_7B", config); 24 | 25 | struct Fp32LlamaForCausalLM_output output_1st = model.forward(input_1st); 26 | 27 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 28 | logits.load("assets/llama/tests/model/1st_logits.bin"); 29 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 30 | // print_first_k_elelment("G", logits.m_data, 20); 31 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 32 | 33 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 34 | embed_dim / num_heads); 35 | Profiler::getInstance().report(); 36 | Profiler::getInstance().reset(); 37 | 38 | // generating phase: 2nd run 39 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 40 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 41 | struct Fp32LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 42 | 43 | struct Fp32LlamaForCausalLM_output output_2nd = model.forward(input_2nd); 44 | 45 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 46 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 47 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 48 | // print_first_k_elelment("G", logits.m_data, 20); 49 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 50 | 51 | Profiler::getInstance().report(); 52 | if (!success) 53 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 54 | else 55 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 56 | } 57 | 58 | int main() { test_Fp32LlamaForCausalLM(); } 59 | -------------------------------------------------------------------------------- /llm/tests/test_LLaMATokenizer.cc: -------------------------------------------------------------------------------- 1 | #include "LLaMATokenizer.h" 2 | 3 | int NUM_THREAD = 8; 4 | 5 | static const std::map> &test_LLaMATokenizer() { 6 | static std::map> llama_answer = { 7 | /* 1. */ { 8 | "Hello World", 9 | { 10 | 1, 11 | 10994, 12 | 2787, 13 | }, 14 | }, 15 | /* 2. */ 16 | { 17 | " Hello World!", 18 | { 19 | 1, 20 | 15043, 21 | 2787, 22 | 29991, 23 | }, 24 | }, 25 | /* 3. 
*/ 26 | { 27 | "This is Tiny LLM Engine.", 28 | { 29 | 1, 30 | 4013, 31 | 338, 32 | 323, 33 | 4901, 34 | 365, 35 | 26369, 36 | 10863, 37 | 29889, 38 | }, 39 | }, 40 | /* 4. */ 41 | { 42 | "Please introduce Massachusetts Institute of Technology (MIT)", 43 | { 44 | 1, 45 | 12148, 46 | 14944, 47 | 16167, 48 | 8907, 49 | 310, 50 | 17968, 51 | 313, 52 | 26349, 53 | 29897, 54 | }, 55 | }, 56 | /* 5. */ 57 | { 58 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume " 59 | "they don't have basic concepts.", 60 | { 61 | 1, 8893, 292, 263, 4700, 508, 367, 2309, 297, 29871, 29896, 29900, 2560, 6576, 29889, 910, 2643, 62 | 338, 363, 2498, 2305, 29892, 577, 591, 5251, 896, 1016, 29915, 29873, 505, 6996, 22001, 29889, 63 | }, 64 | }, 65 | }; 66 | 67 | return llama_answer; 68 | }; 69 | 70 | int main(int argc, char **argv) { 71 | // load the vocab 72 | const std::string fname = "models/llama_vocab.bin"; 73 | llama_vocab vocab = llama_init_vocab(fname.c_str()); 74 | 75 | bool is_equal = true; 76 | int test_count = 1; 77 | for (const auto &llama_answer : test_LLaMATokenizer()) { 78 | std::vector<int> input_ids(llama_answer.first.size()); 79 | const int n = llama_tokenize(vocab, llama_answer.first.c_str(), input_ids.data(), input_ids.size(), true); 80 | input_ids.resize(n); 81 | 82 | is_equal = is_equal && (input_ids.size() == llama_answer.second.size()); 83 | 84 | for (int i = 0; i < (int)input_ids.size() && is_equal; ++i) { 85 | if (input_ids[i] != llama_answer.second[i]) { 86 | is_equal = false; 87 | } 88 | } 89 | 90 | test_count++; 91 | } 92 | 93 | if (!is_equal) 94 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 95 | else 96 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 97 | 98 | return 0; 99 | } 100 |
-------------------------------------------------------------------------------- /llm/tests/test_OPTGenerate.cc: --------------------------------------------------------------------------------
1 | #include <iostream> 2 | 3 | #include "Generate.h" 4 | 5 | int NUM_THREAD = 8; 6 | 7 | int main() { 8 | // std::vector<int> input_ids = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 9 | //                               13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 10 | std::string vocab_file = "./models/OPT_125m/vocab.json"; 11 | std::string bpe_file = "./models/OPT_125m/merges.txt"; 12 | 13 | Encoder encoder = get_encoder(vocab_file, bpe_file); 14 | std::vector<int> input_ids = encoder.encode("John went to MIT and study Computer Science."); 15 | 16 | std::string decoded = encoder.decode(input_ids); 17 | std::cout << "input:" << decoded << std::endl; 18 | 19 | OPTForCausalLM model = OPTForCausalLM("models/OPT_125m", get_opt_model_config(OPT_125M)); 20 | const struct opt_params generation_config; 21 | std::vector<int> generated_ids = OPTGenerate(&model, OPT_INT8, input_ids, generation_config); 22 | 23 | decoded = encoder.decode(generated_ids); 24 | std::cout << "generated:" << decoded << std::endl; 25 | }; 26 |
-------------------------------------------------------------------------------- /llm/tests/test_OPTTokenizer.cc: --------------------------------------------------------------------------------
1 | #include <iostream> 2 | 3 | #include "OPTTokenizer.h" 4 | 5 | int NUM_THREAD = 8; 6 | 7 | void test_OPTEncode() { 8 | std::string bpe_file = "models/opt_merges.txt"; 9 | std::string vocab_file = "models/opt_vocab.json"; 10 | 11 | Encoder encoder = get_encoder(vocab_file, bpe_file); 12 | std::vector<int> encoded = encoder.encode( 13 | "Building
a website can be done in 10 simple steps. This message is for general people, so we assume they " 14 | "don't have basic concepts."); 15 | std::vector<int> encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 16 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 17 | bool is_equal = true; 18 | for (int i = 0; i < (int)encoded.size(); i++) { 19 | if (encoded[i] != encoded_answer[i]) { 20 | is_equal = false; 21 | break; 22 | } 23 | } 24 | if (!is_equal) 25 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 26 | else 27 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 28 | } 29 | 30 | void test_OPTDecode() { 31 | std::string bpe_file = "models/opt_merges.txt"; 32 | std::string vocab_file = "models/opt_vocab.json"; 33 | 34 | 35 | Encoder encoder = get_encoder(vocab_file, bpe_file); 36 | std::vector<int> encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 37 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 38 | std::string decoded = encoder.decode(encoded_answer); 39 | std::string decoded_answer = 40 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 41 | "don't have basic concepts."; 42 | bool is_equal = true; 43 | if (decoded != decoded_answer) is_equal = false; 44 | if (!is_equal) 45 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 46 | else 47 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 48 | } 49 | 50 | int main() { 51 | test_OPTEncode(); 52 | test_OPTDecode(); 53 | }; 54 |
-------------------------------------------------------------------------------- /llm/tests/utils_memalloc.h: --------------------------------------------------------------------------------
1 | #include "utils.h" 2 | class MemoryAllocator { 3 | // TODO: use allocate_aligned_memory instead!
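// Note: this test-only allocator never frees the buffers it hands out; each
// get_*buffer() call below allocates fresh aligned memory, and `counter` is unused.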
4 | public: 5 | MemoryAllocator() { this->counter = 0; } 6 | float* get_fpbuffer(int size) { 7 | float* ptr; 8 | allocate_aligned_memory(ptr, size * sizeof(float)); 9 | return ptr; 10 | } 11 | int8_t* get_int8buffer(int size) { 12 | int8_t* ptr; 13 | allocate_aligned_memory(ptr, size * sizeof(int8_t)); 14 | return ptr; 15 | } 16 | int* get_intbuffer(int size) { 17 | int* ptr; 18 | allocate_aligned_memory(ptr, size * sizeof(int)); 19 | return ptr; 20 | } 21 | 22 | private: 23 | int counter; 24 | }; 25 | -------------------------------------------------------------------------------- /llm/tools/copy_rotary_emb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copy from layer 0 to layer 31 4 | for i in {0..31}; do 5 | cp -r INT4/models/CodeLLaMA_7B_Instruct/decoder/layer${i}/self_attn/rotary_emb/* INT4/models/Mistral_7B/decoder/layer${i}/self_attn/rotary_emb/ 6 | done -------------------------------------------------------------------------------- /llm/tools/download_assets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of files to download, their corresponding MD5 checksums, and target local paths 4 | files_and_checksums=( 5 | "https://huggingface.co/mit-han-lab/tinychatengine-model-zoo/resolve/main/assets.zip?download=true 8527788105acccfada9c89d075fa8764 assets.zip" 6 | ) 7 | 8 | OS=`uname` 9 | 10 | # Function to download a file if it doesn't exist or if its MD5 checksum is incorrect 11 | download_if_needed() { 12 | url="$1" 13 | expected_md5="$2" 14 | target_path="$3" 15 | 16 | # Ensure the target directory exists 17 | target_dir=$(dirname "$target_path") 18 | mkdir -p "$target_dir" 19 | 20 | # Download the file if it does not exist 21 | if [ ! -e "$target_path" ]; then 22 | echo "File '$target_path' does not exist. Downloading..." 23 | wget -q -O "$target_path" "$url" 24 | fi 25 | 26 | # Use md5 on MacOS 27 | if [ $OS = "Darwin" ] 28 | then 29 | actual_md5=$(md5 -q "$target_path") 30 | # Use md5sum on Ubuntu 31 | elif [ $OS = "Linux" ] 32 | then 33 | actual_md5=$(md5sum "$target_path" | cut -d ' ' -f1) 34 | fi 35 | 36 | if [ "$actual_md5" != "$expected_md5" ]; then 37 | echo "MD5 checksum for '$target_path' is incorrect. Downloading again..." 38 | wget -q -O "$target_path" "$url" 39 | else 40 | echo "File '$target_path' exists and its MD5 checksum is correct." 
41 | fi 42 | } 43 | 44 | # Process each file, its corresponding MD5 checksum, and target local path 45 | for file_and_checksum in "${files_and_checksums[@]}"; do 46 | url=$(echo "$file_and_checksum" | awk '{ print $1 }') 47 | expected_md5=$(echo "$file_and_checksum" | awk '{ print $2 }') 48 | target_path=$(echo "$file_and_checksum" | awk '{ print $3 }') 49 | 50 | download_if_needed "$url" "$expected_md5" "$target_path" 51 | unzip "$target_path" 52 | done 53 | -------------------------------------------------------------------------------- /llm/tools/export_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # # E.g., Quantize and export Mistral-7B model 4 | # python tools/mistral_exporter.py --model ../../llm-awq-mistral/quant_cache/mistral-7b-w4-g32-awq-v2.pt --output models/Mistral_7B 5 | # python tools/rotary_emb_exporter.py 6 | # # For x86 7 | # python tools/model_quantizer.py --model_path models/Mistral_7B --method QM_x86 8 | # mkdir Mistral_7B_for_x86 9 | # mkdir Mistral_7B_for_x86/INT4 10 | # mkdir Mistral_7B_for_x86/INT4/models 11 | # mv INT4/models/Mistral_7B Mistral_7B_for_x86/INT4/models 12 | # cd Mistral_7B_for_x86/ 13 | # zip -r Mistral_7B_v0.2_Instruct.zip INT4 14 | # cd .. 15 | # # For ARM 16 | # python tools/model_quantizer.py --model_path models/Mistral_7B --method QM_ARM 17 | # mkdir Mistral_7B_for_ARM 18 | # mkdir Mistral_7B_for_ARM/INT4 19 | # mkdir Mistral_7B_for_ARM/INT4/models 20 | # mv INT4/models/Mistral_7B Mistral_7B_for_ARM/INT4/models 21 | # cd Mistral_7B_for_ARM/ 22 | # zip -r Mistral_7B_v0.2_Instruct.zip INT4 23 | # cd .. 24 | # # fp32 25 | # mkdir Mistral_7B_FP32 26 | # mkdir Mistral_7B_FP32/models 27 | # mv models/Mistral_7B Mistral_7B_FP32/models 28 | # cd Mistral_7B_FP32/ 29 | # zip -r Mistral_7B_v0.2_Instruct.zip models 30 | # cd .. 31 | 32 | 33 | # E.g., Quantize and export LLaMA3-8B model 34 | python tools/llama3_exporter.py --model ../../llm-awq/quant_cache/llama3-8b-w4-g32-awq-v2.pt --output models/LLaMA_3_8B_Instruct 35 | python tools/rotary_emb_exporter.py 36 | # For ARM 37 | python tools/model_quantizer.py --model_path models/LLaMA_3_8B_Instruct --method QM_ARM 38 | mkdir LLaMA_3_8B_Instruct_for_ARM 39 | mkdir LLaMA_3_8B_Instruct_for_ARM/INT4 40 | mkdir LLaMA_3_8B_Instruct_for_ARM/INT4/models 41 | mv INT4/models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_ARM/INT4/models 42 | cd LLaMA_3_8B_Instruct_for_ARM/ 43 | zip -r LLaMA_3_8B_Instruct.zip INT4 44 | cd .. 45 | # For x86 46 | python tools/model_quantizer.py --model_path models/LLaMA_3_8B_Instruct --method QM_x86 47 | mkdir LLaMA_3_8B_Instruct_for_x86 48 | mkdir LLaMA_3_8B_Instruct_for_x86/INT4 49 | mkdir LLaMA_3_8B_Instruct_for_x86/INT4/models 50 | mv INT4/models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_x86/INT4/models 51 | cd LLaMA_3_8B_Instruct_for_x86/ 52 | zip -r LLaMA_3_8B_Instruct.zip INT4 53 | cd .. 54 | # fp32 55 | mkdir LLaMA_3_8B_Instruct_FP32 56 | mkdir LLaMA_3_8B_Instruct_FP32/models 57 | mv models/LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_FP32/models 58 | cd LLaMA_3_8B_Instruct_FP32/ 59 | zip -r LLaMA_3_8B_Instruct.zip models 60 | cd .. 
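# The per-target sections above repeat the same mkdir/mv/zip pattern. A possible
# consolidation (untested sketch; the package_model name and its arguments are
# illustrative, not part of this repo):
#   package_model() {  # usage: package_model <model_name> <out_dir> <subdir>
#       mkdir -p "$2/$3/models"
#       mv "$3/models/$1" "$2/$3/models"
#       (cd "$2" && zip -r "$1.zip" "$3")
#   }
#   e.g. package_model LLaMA_3_8B_Instruct LLaMA_3_8B_Instruct_for_ARM INT4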
61 |
-------------------------------------------------------------------------------- /llm/tools/profile.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'profile_' 6 | for file in profile_*; do 7 |     # Check if the file is executable 8 |     if [ -x "$file" ]; then 9 |         echo "Running '$file'..." 10 |         ./"$file" 11 |         exit_code=$? 12 |     fi 13 | done 14 |
-------------------------------------------------------------------------------- /llm/tools/quantize_and_upload.py: --------------------------------------------------------------------------------
1 | """A script to quantize supported models and upload them to the model zoo. 2 | 3 | Example usage: 4 |    python quantize_and_upload.py --method <method> --token <token> 5 | 6 | Note: This script is for developers. 7 | """ 8 | import argparse 9 | import hashlib 10 | import os 11 | 12 | from upload import subebackups 13 | 14 | model_paths = ["models/LLaMA_13B_2_chat"] 15 | 16 | quantized_dir = "INT4" 17 | db_prefix = "/MIT/transformer_assets/" 18 | 19 | 20 | def _get_md5sum(file_path): 21 |     hash_md5 = hashlib.md5() 22 |     with open(file_path, "rb") as f: 23 |         for chunk in iter(lambda: f.read(4096), b""): 24 |             hash_md5.update(chunk) 25 |     return hash_md5.hexdigest() 26 | 27 | 28 | def main(): 29 |     """Take arguments and quantize all models and upload to dropbox.""" 30 | 31 |     def _get_parser(): 32 |         parser = argparse.ArgumentParser(description="Quantize model") 33 |         parser.add_argument("--model_path", type=str, help="Path of the model to quantize", default=None) 34 |         parser.add_argument("--method", type=str, help="Quantization method") 35 |         parser.add_argument("--token", help="Your Dropbox OAuth2 token.") 36 |         return parser 37 | 38 |     parser = _get_parser() 39 |     args = parser.parse_args() 40 | 41 |     if args.method not in ["QM_x86", "QM_ARM", "QM_CUDA", "FP32", "INT8"]: 42 |         raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM', 'QM_CUDA', 'FP32', 'INT8']") 43 |     QM_method = args.method 44 | 45 |     if args.model_path: 46 |         target_paths = [args.model_path] 47 |     else: 48 |         target_paths = model_paths 49 | 50 |     for model_path in target_paths: 51 |         # quantize 52 |         if args.method in ["QM_x86", "QM_CUDA", "QM_ARM"]: 53 |             out_dir = quantized_dir 54 |             quantize_cmd = ( 55 |                 f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {out_dir}" 56 |             ) 57 |             os.system(quantize_cmd) 58 |         else: 59 |             out_dir = "./" 60 |         # zip 61 |         print("zipping...") 62 |         model_name_size = model_path.rsplit("/", maxsplit=1)[-1] 63 |         zip_path = "/tmp/" + model_name_size + ".zip" 64 |         zip_cmd = f"zip -qq -r {zip_path} {os.path.join(out_dir, model_path)}" 65 |         os.system(zip_cmd) 66 |         # md5sum 67 |         print(f"md5sum is {_get_md5sum(zip_path)}.") 68 |         print("uploading...") 69 |         # upload 70 |         upload_path = os.path.join(db_prefix, QM_method, model_name_size + ".zip") 71 |         subebackups(zip_path, upload_path, args.token) 72 |         print("removing temporary zip file...") 73 |         # rm zip 74 |         os.system(f"rm {zip_path}") 75 | 76 | 77 | if __name__ == "__main__": 78 |     main() 79 |
-------------------------------------------------------------------------------- /llm/tools/quantize_constants.py: --------------------------------------------------------------------------------
1 | STORE_FP16 = False 2 | 3 | QK4_0 = 32 4 | QK4_1 = 32 5 | QK4_2 = 32 6 | QK4_3 = 32 7 | QK4_5 = 128 8 | QK4_6 = 128 9 |
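The QK4_* values above are presumably the group (block) sizes used by the INT4 quantization tools (cf. params.block_size = QK in linear.cu). As a rough illustration of what a group size of 32 means, this standalone sketch quantizes one group of weights with a single shared scale; the quantize_group name and the symmetric scheme are illustrative assumptions, not the repo's actual packing logic:

import numpy as np

QK = 32  # group size, matching QK4_0 above


def quantize_group(weights):
    """Quantize one group of QK floats to int4 values sharing one scale (sketch)."""
    assert weights.shape == (QK,)
    scale = float(np.max(np.abs(weights))) / 7.0  # symmetric int4 range is [-8, 7]
    if scale == 0.0:
        scale = 1.0
    q = np.clip(np.round(weights / scale), -8, 7).astype(np.int8)
    return q, scale  # approximate dequantization: q * scale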
-------------------------------------------------------------------------------- /llm/tools/test.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'test_' 6 | for file in test_*; do 7 |     # Check if the file is executable 8 |     if [ -x "$file" ]; then 9 |         echo "Running '$file'..." 10 |         ./"$file" 11 |         exit_code=$? 12 |     fi 13 | done 14 |
-------------------------------------------------------------------------------- /llm/tools/upload.py: --------------------------------------------------------------------------------
1 | """ Python script to upload models to Hugging Face. 2 | 3 | Usage: 4 |    python tools/upload.py --filename <filename> --QM <QM> --hf_token <hf_token> 5 | 6 | Example commandline: 7 |    python tools/upload.py --filename LLaMA_3_8B_Instruct.zip --QM QM_ARM --hf_token <hf_token> 8 | """ 9 | import argparse 10 | import hashlib 11 | import os 12 | import zipfile 13 | 14 | import requests 15 | from tqdm import tqdm 16 | from huggingface_hub import HfApi 17 | 18 | 19 | def _upload_file_to_HF(filename, folder_name, hf_token): 20 |     # Check if the file is a zip file 21 |     if zipfile.is_zipfile(filename): 22 |         print(f"Start uploading the model to Huggingface: mit-han-lab/tinychatengine-model-zoo/{folder_name}/{filename}") 23 |         api = HfApi() 24 |         api.upload_file( 25 |             path_or_fileobj=filename, 26 |             path_in_repo=f"{folder_name}/{filename}", 27 |             repo_id="mit-han-lab/tinychatengine-model-zoo", 28 |             repo_type="model", 29 |             commit_message="Upload models", 30 |             token=hf_token 31 |         ) 32 |         print(f"File uploaded successfully: mit-han-lab/tinychatengine-model-zoo/{folder_name}/{filename}") 33 |     else: 34 |         print(f"The file is not a zip file: {filename}") 35 | 36 | def _remove_file(filepath): 37 |     if os.path.isfile(filepath): 38 |         os.remove(filepath) 39 |         print(f"File removed successfully: {filepath}") 40 |     else: 41 |         print(f"Error: {filepath} not a valid filename") 42 | 43 | def _main(): 44 |     parser = argparse.ArgumentParser(description="Upload a model zip file to Hugging Face") 45 |     parser.add_argument("--filename", help="The name of the file to upload.") 46 |     parser.add_argument("--QM", default="FP32", help="Quantization method.") 47 |     parser.add_argument("--hf_token", help="Huggingface write token.") 48 |     parser.add_argument("--remove_file", action="store_true", help="Remove the file after uploading.") 49 |     args = parser.parse_args() 50 | 51 |     Qmodels = ["FP32", "QM_ARM", "QM_x86", "QM_CUDA", "INT8"] 52 | 53 |     if args.QM not in Qmodels: 54 |         raise NotImplementedError(f"{args.QM} is not supported.") 55 | 56 |     _upload_file_to_HF(args.filename, args.QM, args.hf_token)  # Upload the file to Huggingface 57 | 58 |     if args.remove_file: 59 |         _remove_file(args.filename)  # Remove the zip file 60 | 61 | 62 | if __name__ == "__main__": 63 |     _main() 64 |
-------------------------------------------------------------------------------- /llm/tools/upload_to_dropbox.py: --------------------------------------------------------------------------------
1 | """ DEPRECATED: This script is deprecated. Please use `upload.py` to upload models to Hugging Face instead. 2 | 3 | Uploading models and assets to the Dropbox storage.
4 | 5 | Example commandline: 6 |    python upload_to_dropbox.py <your_dropbox_token> 7 | """ 8 | import argparse 9 | import os 10 | 11 | import dropbox 12 | 13 | files_to_upload = [ 14 |     "CodeLLaMA_13B_Instruct.zip", 15 |     "CodeLLaMA_7B_Instruct.zip", 16 |     # "LLaMA_13B_2_chat.zip", 17 |     # "LLaMA_7B_2_chat.zip", 18 |     # "assets.zip", 19 | ] 20 | 21 | 22 | def subebackups(file_path, target_path, token): 23 |     """Upload a file to the Dropbox storage.""" 24 |     dbx = dropbox.Dropbox(token, timeout=36000) 25 |     file_size = os.path.getsize(file_path) 26 |     CHUNK_SIZE = 50 * 1024 * 1024 27 |     dest_path = target_path 28 | 29 |     with open(file_path, "rb") as f: 30 |         if file_size <= CHUNK_SIZE: 31 |             dbx.files_upload(f.read(), dest_path) 32 | 33 |         else: 34 |             upload_session_start_result = dbx.files_upload_session_start(f.read(CHUNK_SIZE)) 35 |             cursor = dropbox.files.UploadSessionCursor( 36 |                 session_id=upload_session_start_result.session_id, offset=f.tell() 37 |             ) 38 |             commit = dropbox.files.CommitInfo(path=dest_path, mode=dropbox.files.WriteMode("overwrite")) 39 | 40 |             while f.tell() < file_size: 41 |                 if (file_size - f.tell()) <= CHUNK_SIZE: 42 |                     print(dbx.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)) 43 |                 else: 44 |                     dbx.files_upload_session_append(f.read(CHUNK_SIZE), cursor.session_id, cursor.offset) 45 |                     cursor.offset = f.tell() 46 | 47 | 48 | if __name__ == "__main__": 49 |     parser = argparse.ArgumentParser(description="Upload a file to Dropbox.") 50 |     parser.add_argument("token", help="Your Dropbox OAuth2 token.") 51 |     args = parser.parse_args() 52 | 53 |     db_prefix = "/HAN Lab Public Space/Projects/TinyChatEngine/assets and models/QM_CUDA/" 54 |     local_prefix = "uploads" 55 | 56 |     for file in files_to_upload: 57 |         subebackups(file, db_prefix + file, args.token) 58 |
-------------------------------------------------------------------------------- /llm/tools/zip_assets.sh: --------------------------------------------------------------------------------
1 | zip -r assets.zip assets 2 |
-------------------------------------------------------------------------------- /llm/vila: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 70 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA1.5_8B INT4 5 $image_path 8 |
-------------------------------------------------------------------------------- /llm/vila_2.7b: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat VILA_2.7B INT4 5 $image_path 8 |
-------------------------------------------------------------------------------- /llm/voice_mistral: --------------------------------------------------------------------------------
1 | #!/bin/bash 2 | ./chat -v Mistral_7B INT4 5 0 3 |
-------------------------------------------------------------------------------- /llm/voice_vila: --------------------------------------------------------------------------------
1 | #!/bin/bash 2
| echo "=============================================================================================================================" 3 | image_path="$1" 4 | termvisage $image_path -w 75 5 | echo "=============================================================================================================================" 6 | 7 | ./chat -v VILA1.5_8B INT4 5 $image_path 8 | -------------------------------------------------------------------------------- /llm/voicechat_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Clone whisper.cpp and checkout the specific commit 4 | git clone https://github.com/ggerganov/whisper.cpp 5 | cd whisper.cpp 6 | git checkout a4bb2df 7 | 8 | # Determine the platform 9 | OS="$(uname)" 10 | if [ "$OS" = "Linux" ]; then 11 | # Install SDL2 on Linux 12 | sudo apt-get install libsdl2-dev 13 | elif [ "$OS" = "Darwin" ]; then 14 | # Install SDL2 on Mac OS 15 | brew install sdl2 16 | else 17 | echo "Unsupported operating system: $OS" 18 | exit 1 19 | fi 20 | 21 | # Apply patch and download model 22 | git apply ../application/sts_utils/clean_up.patch 23 | bash ./models/download-ggml-model.sh base.en 24 | 25 | # Check for NVIDIA GPU 26 | if lspci | grep -i nvidia > /dev/null; then 27 | # Compile with CUDA support 28 | WHISPER_CUBLAS=1 make -j stream 29 | else 30 | # Compile without CUDA support 31 | make -j stream 32 | fi 33 | 34 | # Set up TTS 35 | cd ../ 36 | mkdir TTS 37 | cd TTS 38 | wget "https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_arm64.tar.gz" 39 | tar -xvzf piper_arm64.tar.gz 40 | rm piper_arm64.tar.gz 41 | 42 | # Download default voice 43 | wget "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx?download=true" -O en_US-amy-medium.onnx 44 | wget "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx.json?download=true" -O en_US-amy-medium.onnx.json 45 | 46 | # Return to the parent directory and compile chat 47 | cd ../ 48 | make clean 49 | make -j chat 50 | 51 | echo "" 52 | echo "TinyChatEngine's speech-to-speech chatbot setup completed successfully!" 53 | echo "Use './chat -v' on Linux/MacOS or 'chat.exe -v' on Windows." 54 | echo "" 55 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | extend-exclude = "codegen/.*" 5 | 6 | [tool.isort] 7 | profile = "black" 8 | known_first_party = ["code_generator"] 9 | extend_skip = ["codegen"] 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 120 16 | 17 | [tool.pylint] 18 | [tool.pylint.master] 19 | ignore-paths = ["codegen"] 20 | [tool.pylint.messages_control] 21 | disable = [ 22 | "C0103", 23 | "C0114", 24 | "C0115", 25 | "C0116", 26 | "C0123", 27 | "C0209", 28 | "C0330", 29 | "C0301", 30 | "C0302", 31 | "C0411", 32 | "C0415", 33 | "E0401", 34 | "E1121", 35 | "E1123", 36 | "E1101", 37 | "R", 38 | "W" 39 | ] 40 | [tool.pylint.basic] 41 | good-names-rgxs = "^[_a-z][_a-z0-9]?$" # allow 1 or 2 character names 42 | [tool.pylint.format] 43 | max-line-length = 120 44 | max-module-lines = 5000 45 | [tool.pylint.design] 46 | max-args = 10 47 | max-attributes = 15 48 | max-parents = 10 49 | 50 | [tool.mypy] 51 | files = "." 
52 | exclude ="codegen/.*" 53 | install_types = true 54 | non_interactive = true 55 | show_error_codes = true 56 | disable_error_code = [ 57 | "import", 58 | "assignment", 59 | "operator", 60 | "has-type", 61 | "var-annotated", 62 | "operator", 63 | "call-arg", 64 | ] 65 | explicit_package_bases = true 66 | namespace_packages = true 67 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | torch 4 | transformers 5 | pillow 6 | huggingface_hub --------------------------------------------------------------------------------