├── .flake8 ├── transformer ├── quantize_constants.py ├── zip_assets.sh ├── models │ └── llama_vocab.bin ├── tests │ ├── assets │ │ ├── input.bin │ │ └── output.bin │ ├── utils_memalloc.h │ ├── test_OPTGenerate.cc │ ├── test_linear.cc │ ├── test_OPTTokenizer.cc │ ├── test_Fp32llamaForCausalLM.cc │ ├── test_Int4llamaForCausalLM.cc │ ├── test_LLaMATokenizer.cc │ ├── test_Fp32llamaDecoder.cc │ ├── test_Int4llamaDecoder.cc │ ├── test_Fp32llamaDecoderLayer.cc │ ├── test_Int4llamaDecoderLayer.cc │ ├── test_Int4llamaAttention.cc │ └── test_Fp32llamaAttention.cc ├── include │ ├── ops │ │ ├── arg_max.h │ │ ├── LlamaRMSNorm.h │ │ ├── LayerNorm.h │ │ ├── BMM_F32T.h │ │ ├── LayerNormQ.h │ │ ├── BMM_S8T_S8N_S8T.h │ │ ├── BMM_S8T_S8N_F32T.h │ │ ├── W8A8B8O8Linear.h │ │ ├── W8A8BFP32OFP32Linear.h │ │ ├── W8A8B8O8LinearReLU.h │ │ ├── Embedding.h │ │ ├── RotaryPosEmb.h │ │ └── linear.h │ ├── operators.h │ ├── nn_modules │ │ ├── OPTForCausalLM.h │ │ ├── Fp32llamaForCausalLM.h │ │ ├── Int4llamaForCausalLM.h │ │ ├── Fp32llamaDecoder.h │ │ ├── Int4llamaDecoder.h │ │ ├── Int8OPTDecoder.h │ │ ├── Fp32llamaAttention.h │ │ ├── Int4llamaAttention.h │ │ ├── Fp32llamaDecoderLayer.h │ │ ├── Int4llamaDecoderLayer.h │ │ ├── Int8OPTAttention.h │ │ └── Int8OPTDecoderLayer.h │ ├── model.h │ ├── LLaMATokenizer.h │ ├── OPTTokenizer.h │ ├── utils.h │ ├── profiler.h │ ├── common.h │ └── Generate.h ├── test.sh ├── profile.sh ├── src │ ├── ops │ │ ├── arg_max.cc │ │ ├── batch_add.cc │ │ ├── embedding.cc │ │ ├── LlamaRMSNorm.cc │ │ ├── softmax.cc │ │ ├── RotaryPosEmb.cc │ │ ├── LayerNorm.cc │ │ ├── LayerNormQ.cc │ │ ├── BMM_S8T_S8N_S8T.cc │ │ ├── BMM_S8T_S8N_F32T.cc │ │ ├── W8A8BFP32OFP32Linear.cc │ │ ├── W8A8B8O8LinearReLU.cc │ │ ├── W8A8B8O8Linear.cc │ │ └── BMM_F32T.cc │ └── nn_modules │ │ ├── OPTForCausalLM.cc │ │ ├── Fp32llamaForCausalLM.cc │ │ ├── Int4llamaForCausalLM.cc │ │ ├── Int8OPTDecoderLayer.cc │ │ ├── Fp32llamaDecoder.cc │ │ └── Int4llamaDecoder.cc ├── evaluate.sh ├── .pre-commit-config.yaml ├── download_assets.sh ├── upload.py ├── quantize_and_upload.py ├── Makefile └── llama_exporter.py ├── assets └── figures │ └── chat.gif ├── .clang-format ├── kernels ├── metal │ ├── download_metal-cpp.sh │ ├── include │ │ ├── opParams.h │ │ └── MetalMatmulInt4.hpp │ ├── Makefile │ ├── matmul_ref_fp32.cc │ ├── matmul_metal_int4_imp.h │ └── matmul_metal_int4.cc ├── cuda │ ├── gemm_cuda.h │ ├── matmul.cu │ └── dequantize.cuh ├── ref │ ├── matmul_ref_fp32.cc │ └── matmul_ref_int4.cc ├── neon │ └── matmul_ref_fp32.cc ├── matmul_int8.cc ├── matmul_imp.cc ├── matmul.h ├── avx │ └── matmul_avx_fp32.cc ├── starter_code │ ├── reference.cc │ └── multithreading.cc └── quantizer.cc ├── .gitmodules ├── .gitignore ├── .pre-commit-config.yaml ├── pyproject.toml └── README.md /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /transformer/quantize_constants.py: -------------------------------------------------------------------------------- 1 | STORE_FP16 = False 2 | -------------------------------------------------------------------------------- /transformer/zip_assets.sh: -------------------------------------------------------------------------------- 1 | zip -r assets.zip assets 2 | zip -r models.zip models 3 | -------------------------------------------------------------------------------- /assets/figures/chat.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/assets/figures/chat.gif -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | ContinuationIndentWidth: 4 4 | IndentWidth: 4 5 | TabWidth: 4 6 | -------------------------------------------------------------------------------- /transformer/models/llama_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/models/llama_vocab.bin -------------------------------------------------------------------------------- /transformer/tests/assets/input.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/tests/assets/input.bin -------------------------------------------------------------------------------- /transformer/tests/assets/output.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/tests/assets/output.bin -------------------------------------------------------------------------------- /kernels/metal/download_metal-cpp.sh: -------------------------------------------------------------------------------- 1 | wget https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13_iOS16.zip 2 | unzip metal-cpp_macOS13_iOS16.zip 3 | -------------------------------------------------------------------------------- /transformer/include/ops/arg_max.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #define FLOAT_MIN -1000000.0 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output); 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "json"] 2 | path = json 3 | url = https://github.com/nlohmann/json 4 | [submodule "transformer/json"] 5 | path = transformer/json 6 | url = https://github.com/nlohmann/json 7 | -------------------------------------------------------------------------------- /kernels/metal/include/opParams.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | unsigned int m; 5 | unsigned int n; 6 | unsigned int k; 7 | unsigned int group_size; 8 | } MetalMatMulParams; 9 | -------------------------------------------------------------------------------- /kernels/cuda/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, 4 | torch::Tensor _zeros, int split_k_iters); 5 | -------------------------------------------------------------------------------- /transformer/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'test_' 6 | for file in test_*; do 7 | # Check if the file is executable 8 | if [ -x "$file" ]; then 9 | echo "Running 
'$file'..." 10 | ./"$file" 11 | exit_code=$? 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /transformer/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'profile_' 6 | for file in profile_*; do 7 | # Check if the file is executable 8 | if [ -x "$file" ]; then 9 | echo "Running '$file'..." 10 | ./"$file" 11 | exit_code=$? 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /transformer/include/ops/LlamaRMSNorm.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class LlamaRMSNorm { 4 | public: 5 | LlamaRMSNorm(Matrix3D _weight) : weight(_weight){}; 6 | LlamaRMSNorm(){}; 7 | void forward(const Matrix3D &x, Matrix3D &output); 8 | Matrix3D weight; 9 | float eps = 1e-6; 10 | 11 | private: 12 | std::string profile_name = "LlamaRMSNorm"; 13 | }; 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | !main_int8.o 3 | !main.o 4 | !matmul_avx_int8.o 5 | !matmul_imp.o 6 | !matmul_int8.o 7 | !matmul_int4.o 8 | !matmul_onednn.o 9 | !utils.o 10 | *.a 11 | .DS_Store 12 | .build/ 13 | .cache/ 14 | .direnv/ 15 | .envrc 16 | .swiftpm 17 | .venv 18 | .vs/ 19 | .vscode/ 20 | 21 | assets/ 22 | *.bin 23 | !ggml-vocab.bin 24 | *.zip 25 | *.txt 26 | *.json 27 | test_* 28 | !test_*.cc 29 | demo 30 | profile_* 31 | !profile_*.cc 32 | libtorch/ 33 | -------------------------------------------------------------------------------- /transformer/include/ops/LayerNorm.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct LayerNorm_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | }; 7 | 8 | class LayerNorm { 9 | public: 10 | LayerNorm(LayerNorm_params ¶ms_): params(params_) {}; 11 | LayerNorm(){}; 12 | void forward(const Matrix3D &x, Matrix3D &output); 13 | struct LayerNorm_params params; 14 | private: 15 | std::string profile_name = "LayerNorm"; 16 | }; 17 | 18 | void load_LayerNorm(LayerNorm &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class BMM_F32T { 4 | public: 5 | BMM_F32T(float _alpha); 6 | BMM_F32T(){}; 7 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 8 | void forward_weight_untransposed(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 9 | float alpha; 10 | 11 | private: 12 | std::string profile_name = "BMM_F32T"; 13 | }; 14 | 15 | void load_BMM_F32T(BMM_F32T &op, std::string prefix); 16 | -------------------------------------------------------------------------------- /transformer/include/ops/LayerNormQ.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct LayerNormQ_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | }; 7 | 8 | class LayerNormQ { 9 | public: 10 | LayerNormQ(LayerNormQ_params ¶ms_): params(params_) {}; 11 | LayerNormQ(){}; 12 | void forward(const Matrix3D &x, Matrix3D &output); 13 | struct LayerNormQ_params params; 14 | private: 15 | 
std::string profile_name = "LayerNormQ"; 16 | }; 17 | 18 | void load_LayerNormQ(LayerNormQ &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_S8T_S8N_S8T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_S8T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_S8T{ 8 | public: 9 | BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T_params ¶ms_); 10 | BMM_S8T_S8N_S8T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | private: 15 | std::string profile_name = "BMM_S8T_S8N_S8T"; 16 | }; 17 | 18 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_S8T_S8N_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_F32T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_F32T{ 8 | public: 9 | BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T_params ¶ms_); 10 | BMM_S8T_S8N_F32T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | private: 15 | std::string profile_name = "BMM_S8T_S8N_F32T"; 16 | }; 17 | 18 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/W8A8B8O8Linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8B8O8Linear_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | float alpha; 7 | float beta; 8 | }; 9 | 10 | class W8A8B8O8Linear { 11 | public: 12 | W8A8B8O8Linear(W8A8B8O8Linear_params ¶ms_); 13 | W8A8B8O8Linear(){}; 14 | void forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | float beta; 18 | 19 | private: 20 | std::string profile_name = "W8A8B8O8Linear"; 21 | }; 22 | 23 | void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix); 24 | -------------------------------------------------------------------------------- /transformer/include/ops/W8A8BFP32OFP32Linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8BFP32OFP32Linear_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | float alpha; 7 | }; 8 | 9 | 10 | class W8A8BFP32OFP32Linear{ 11 | public: 12 | W8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear_params ¶ms_); 13 | W8A8BFP32OFP32Linear(){}; 14 | void forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | private: 18 | std::string profile_name = "W8A8BFP32OFP32Linear"; 19 | }; 20 | 21 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/W8A8B8O8LinearReLU.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8B8O8LinearReLU_params { 4 | Matrix3D weight; 5 | Matrix3D bias_int8; 6 | float alpha; 7 | float beta; 8 | }; 9 | 10 | class W8A8B8O8LinearReLU { 11 | public: 12 | W8A8B8O8LinearReLU(W8A8B8O8LinearReLU_params ¶ms_); 13 | W8A8B8O8LinearReLU(){}; 14 | void 
forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | float beta; 18 | 19 | private: 20 | std::string profile_name = "W8A8B8O8LinearReLU"; 21 | }; 22 | 23 | void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix); 24 | -------------------------------------------------------------------------------- /transformer/tests/utils_memalloc.h: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | class MemoryAllocator { 3 | // TODO: use allocate_aligned_memory instead! 4 | public: 5 | MemoryAllocator() { this->counter = 0; } 6 | float* get_fpbuffer(int size) { 7 | float* ptr; 8 | allocate_aligned_memory(ptr, size * sizeof(float)); 9 | return ptr; 10 | } 11 | int8_t* get_int8buffer(int size) { 12 | int8_t* ptr; 13 | allocate_aligned_memory(ptr, size * sizeof(int8_t)); 14 | return ptr; 15 | } 16 | int* get_intbuffer(int size) { 17 | int* ptr; 18 | allocate_aligned_memory(ptr, size * sizeof(int)); 19 | return ptr; 20 | } 21 | 22 | private: 23 | int counter; 24 | }; 25 | -------------------------------------------------------------------------------- /transformer/include/ops/Embedding.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | 4 | 5 | 6 | class Embedding { 7 | public: 8 | Embedding(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D lookup_) 9 | : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) { 10 | assert(lookup_.m_dim_y == voc_size_); 11 | assert(lookup_.m_dim_z == embed_dim_); 12 | } 13 | Embedding(){}; 14 | void forward(Matrix3D input_id, Matrix3D output); 15 | int embed_dim, voc_size, padding_idx; 16 | Matrix3D lookup; 17 | private: 18 | std::string profile_name = "Embedding"; 19 | }; 20 | 21 | 22 | void load_Embedding_params(Embedding &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/RotaryPosEmb.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include "utils.h" 4 | 5 | class RotaryPosEmb 6 | { 7 | public: 8 | RotaryPosEmb(Matrix3D _cos, Matrix3D _sin, std::string path) 9 | { 10 | sin = _sin; 11 | cos = _cos; 12 | read_to_array((path + "/cos_cached.bin").c_str(), cos.m_data, cos.length()); 13 | read_to_array((path + "/sin_cached.bin").c_str(), sin.m_data, sin.length()); 14 | }; 15 | RotaryPosEmb(){}; 16 | void forward(Matrix3D &key, Matrix3D &value, int start_idx, int len); 17 | Matrix3D cos, sin; 18 | 19 | private: 20 | std::string profile_name = "RotaryPosEmb"; 21 | }; 22 | 23 | void load_RotaryPosEmb(RotaryPosEmb &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/src/ops/arg_max.cc: -------------------------------------------------------------------------------- 1 | #include "ops/arg_max.h" 2 | 3 | #include 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output) { 6 | int bz = input.m_dim_x; 7 | int sqlen = input.m_dim_y; 8 | int voc_size = input.m_dim_z; 9 | 10 | assert(sqlen == output.m_dim_z); 11 | assert(bz == output.m_dim_x); 12 | 13 | for (int b = 0; b < bz; b++) { 14 | for (int i = 0; i < sqlen; i++) { 15 | float max = FLOAT_MIN; 16 | int max_idx = -1; 17 | for (int j = 0; j < voc_size; j++) { 18 | float v = input(b, i, j); 19 | if (max < v) { 20 | max = v; 21 | max_idx = j; 22 | } 23 | } 24 | output(b, 0, 
i) = max_idx; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /kernels/metal/Makefile: -------------------------------------------------------------------------------- 1 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 2 | CXXFLAGS = -std=c++17 -stdlib=libc++ -O3 3 | 4 | # Executable and source files 5 | TEST_TARGET = benchmark 6 | TARGET = $(TEST_TARGET) 7 | KERNEL_SRC = $(wildcard ./src/*.cpp) 8 | 9 | SRC = $(KERNEL_SRC) 10 | INCLUDE_DIRS = -I./metal-cpp -I./include 11 | LIB = -framework Metal -framework Foundation -framework MetalKit 12 | 13 | 14 | # Default target 15 | all: $(TARGET) 16 | 17 | # Linking 18 | benchmark: build_metallib 19 | $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o benchmark.x app/main.cpp $(SRC) $(LIB) $(LDFLAGS) 20 | 21 | build_air: 22 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(INCLUDE_DIRS) -c kernel/op.metal -o library.air 23 | 24 | build_metallib: build_air 25 | xcrun -sdk macosx metallib library.air -o default.metallib 26 | 27 | # Clean up 28 | clean: 29 | rm -f benchmark.x library.air library.metallib default.metallib 30 | -------------------------------------------------------------------------------- /transformer/include/operators.h: -------------------------------------------------------------------------------- 1 | #ifndef OPERATORS_H 2 | #define OPERATORS_H 3 | #include 4 | 5 | #include "common.h" 6 | #include "matmul.h" 7 | 8 | #define BLK_SIZE 16 9 | #define NUM_THREAD 4 10 | 11 | // include all ops 12 | #include "ops/BMM_F32T.h" 13 | #include "ops/BMM_S8T_S8N_F32T.h" 14 | #include "ops/BMM_S8T_S8N_S8T.h" 15 | #include "ops/Embedding.h" 16 | #include "ops/LayerNorm.h" 17 | #include "ops/LayerNormQ.h" 18 | #include "ops/LlamaRMSNorm.h" 19 | #include "ops/RotaryPosEmb.h" 20 | #include "ops/W8A8B8O8Linear.h" 21 | #include "ops/W8A8B8O8LinearReLU.h" 22 | #include "ops/W8A8BFP32OFP32Linear.h" 23 | #include "ops/arg_max.h" 24 | #include "ops/linear.h" 25 | 26 | void softmax(const Matrix3D &input, Matrix3D &output, int dim); 27 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output); 28 | template 29 | void linear(Matrix3D &a, Matrix3D &b, Matrix3D &c); 30 | 31 | #endif // OPERATORS_H 32 | -------------------------------------------------------------------------------- /transformer/src/ops/batch_add.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | void batch_Add(const Matrix3D &input, const Matrix3D &input2,Matrix3D &output) { 4 | PROFILE_START("batch_Add"); 5 | assert(input.m_dim_y == input2.m_dim_y); 6 | assert(input.m_dim_z == input2.m_dim_z); 7 | assert(input.m_dim_x == output.m_dim_x); 8 | assert(input.m_dim_y == output.m_dim_y); 9 | assert(input.m_dim_z == output.m_dim_z); 10 | 11 | if (input.m_dim_x != input2.m_dim_x && input2.m_dim_x == 1) { 12 | // Find the maximum value in the input array 13 | for (int i = 0; i < input.m_dim_x; i++) { 14 | for (int j = 0; j < input.m_dim_y; j++) { 15 | for (int k = 0; k < input.m_dim_z; k++){ 16 | output(i, j, k) = input(i, j, k) + input2(0, j, k); 17 | } 18 | } 19 | } 20 | } else { 21 | throw("Unsupported dimension for softmax"); 22 | } 23 | PROFILE_END("batch_Add"); 24 | } -------------------------------------------------------------------------------- /transformer/src/ops/embedding.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void 
load_Embedding_params(Embedding& op, std::string prefix) { 7 | op.lookup.load((prefix + "/weight.bin").c_str()); 8 | // read_to_array((prefix + "/weight.bin").c_str(), op.lookup.m_data, op.lookup.length()); 9 | } 10 | 11 | void Embedding::forward(Matrix3D input_id, Matrix3D output) { 12 | PROFILE_START(profile_name); 13 | assert(input_id.m_dim_x == 1); 14 | assert(input_id.m_dim_y == 1); 15 | assert(input_id.m_dim_z == output.m_dim_y); 16 | assert(output.m_dim_z == this->embed_dim); 17 | 18 | for (int i = 0; i < input_id.m_dim_z; i++) { 19 | int token_id = input_id(0, 0, i); 20 | float* output_sample_ptr = &output.m_data[i * this->embed_dim]; 21 | float* target_embed = &this->lookup.m_data[token_id * this->embed_dim]; 22 | memcpy(output_sample_ptr, target_embed, sizeof(float) * this->embed_dim); 23 | } 24 | PROFILE_END(profile_name); 25 | } 26 | -------------------------------------------------------------------------------- /transformer/tests/test_OPTGenerate.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Generate.h" 4 | 5 | int main() { 6 | // std::vector input_ids = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 7 | // 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 8 | std::string vocab_file = "./models/OPT_125m/vocab.json"; 9 | std::string bpe_file = "./models/OPT_125m/merges.txt"; 10 | 11 | Encoder encoder = get_encoder(vocab_file, bpe_file); 12 | std::vector input_ids = encoder.encode("John went to MIT and study Computer Science."); 13 | 14 | std::string decoded = encoder.decode(input_ids); 15 | std::cout << "input:" << decoded << std::endl; 16 | 17 | OPTForCausalLM model = OPTForCausalLM("models/OPT_125m", get_opt_model_config(OPT_125M)); 18 | const struct opt_params generation_config; 19 | std::vector generated_ids = OPTGenerate(model, input_ids, generation_config); 20 | 21 | decoded = encoder.decode(generated_ids); 22 | std::cout << "generated:" << decoded << std::endl; 23 | }; 24 | -------------------------------------------------------------------------------- /kernels/ref/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/metal/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params 
*params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/neon/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/metal/include/MetalMatmulInt4.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Foundation/Foundation.hpp" 4 | #include "Metal/Metal.hpp" 5 | #include "opParams.h" 6 | 7 | class MetalMatmulInt4 { 8 | public: 9 | MTL::Device *_mDevice; 10 | 11 | // The compute pipeline generated from the compute kernel in the .metal shader file. 12 | MTL::ComputePipelineState *_mMatmulFunctionPSO; 13 | 14 | // The command queue used to pass commands to the device. 15 | MTL::CommandQueue *_mCommandQueue; 16 | 17 | // Buffers to hold data. 
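// (Inferred roles, not stated in the original header: judging from MetalMatMulParams
// {m, n, k, group_size} and the fp32/int4 buffer types used by the Metal kernels,
// _mBufferA presumably holds the fp32 activations, _mBufferB the packed 4-bit weights,
// _mBufferScales the per-group dequantization scales, and _mBufferResult the fp32 output.)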
18 | MTL::Buffer *_mBufferA; 19 | MTL::Buffer *_mBufferB; 20 | MTL::Buffer *_mBufferScales; 21 | MTL::Buffer *_mBufferResult; 22 | MTL::Buffer *_mParams; 23 | 24 | // Matmul params 25 | MetalMatMulParams *_mParamsPtr; 26 | 27 | MetalMatmulInt4(MTL::Device *device, MetalMatMulParams param); 28 | ~MetalMatmulInt4(); 29 | 30 | void prepareData(); 31 | void sendComputeCommand(); 32 | void verifyResults(); 33 | 34 | private: 35 | void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 36 | void generateRandomFloatData(MTL::Buffer *buffer, int length); 37 | void generateRandomIn4Data(MTL::Buffer *buffer, int length); 38 | }; 39 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/OPTForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Int8OPTDecoder.h" 2 | 3 | struct OPTForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct OPTForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | OPTForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 13 | OPTForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 14 | std::vector> past_values_) 15 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 16 | has_past_keys_values = true; 17 | } 18 | }; 19 | 20 | class OPTForCausalLM { 21 | public: 22 | OPTForCausalLM(std::string param_path, const struct model_config config); 23 | struct OPTForCausalLM_output forward(const struct OPTForCausalLM_input& input); 24 | 25 | private: 26 | Int8OPTDecoder decoder; 27 | Linear_FP lm_head; 28 | std::string profile_name = "OPTForCausalLM"; 29 | float* logits_output; 30 | float* lm_head_weight; 31 | }; 32 | -------------------------------------------------------------------------------- /kernels/matmul_int8.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_int8(const struct matmul_params *params) { 9 | int i, j, k; 10 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 11 | int32_t A_zp = A->qparams.zero_point, C_zp = C->qparams.zero_point; 12 | float A_sc = A->qparams.scale, B_sc = B->qparams.scale, C_sc = C->qparams.scale; 13 | float effective_scale = A_sc * B_sc / C_sc; 14 | int8_t *data_A = A->int8_data_ptr, *data_B = B->int8_data_ptr, *data_C = C->int8_data_ptr; 15 | const int8_t q_min = C->qparams.q_min, q_max = C->qparams.q_max; 16 | CHECK_MATRICES(A, B, C); 17 | 18 | for (i = 0; i < C->row; i++) 19 | for (j = 0; j < C->column; j++) { 20 | int acc = 0; 21 | for (k = 0; k < A->column; k++) 22 | acc += ((int32_t)data_A[i * A->column + k] - A_zp) * data_B[k * B->column + j]; 23 | 24 | acc = (int32_t)((float)acc * effective_scale); 25 | acc -= C_zp; 26 | acc = MAX(acc, q_min); 27 | acc = MIN(acc, q_max); 28 | data_C[i * C->column + j] = (int8_t)acc; 29 | } 30 | } 31 | } // namespace matmul 32 | -------------------------------------------------------------------------------- /transformer/src/ops/LlamaRMSNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void LlamaRMSNorm::forward(const Matrix3D &x, Matrix3D &output) { 8 | PROFILE_START(profile_name); 9 | const int 
last_dims = 2; 10 | 11 | assert(last_dims == 2); // support the last dim for now 12 | assert(output.m_dim_x == x.m_dim_x); 13 | assert(output.m_dim_y == x.m_dim_y); 14 | assert(output.m_dim_z == x.m_dim_z); 15 | assert(x.m_dim_z == weight.m_dim_z); 16 | 17 | for (int i = 0; i < x.m_dim_x; i++) { // batches 18 | for (int j = 0; j < x.m_dim_y; j++) { // samples 19 | float var = 0; 20 | 21 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 22 | var += x(i, j, k) * x(i, j, k); 23 | } 24 | var /= static_cast(x.m_dim_z); 25 | float variance = 1.0 / sqrt(var + eps); 26 | 27 | for (int k = 0; k < x.m_dim_z; k++) { 28 | float value = static_cast(x(i, j, k)); 29 | float fp_out = (value * variance) * weight(0, 0, k); 30 | output(i, j, k) = fp_out; 31 | } 32 | } 33 | } 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoder.h" 2 | 3 | struct Fp32LlamaForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct Fp32LlamaForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | Fp32LlamaForCausalLM_input() {} 13 | Fp32LlamaForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 14 | Fp32LlamaForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 15 | std::vector> past_values_) 16 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 17 | has_past_keys_values = true; 18 | } 19 | }; 20 | 21 | class Fp32LlamaForCausalLM { 22 | public: 23 | Fp32LlamaForCausalLM(std::string param_path, const struct model_config config); 24 | 25 | struct Fp32LlamaForCausalLM_output forward(const struct Fp32LlamaForCausalLM_input& input); 26 | 27 | private: 28 | Fp32llamaDecoder decoder; 29 | Linear_FP lm_head; 30 | std::string profile_name = "Fp32LlamaForCausalLM"; 31 | float* logits_output; 32 | float* lm_head_weight; 33 | }; 34 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoder.h" 2 | 3 | struct Int4LlamaForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct Int4LlamaForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | Int4LlamaForCausalLM_input() {} 13 | Int4LlamaForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 14 | Int4LlamaForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 15 | std::vector> past_values_) 16 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 17 | has_past_keys_values = true; 18 | } 19 | }; 20 | 21 | class Int4LlamaForCausalLM { 22 | public: 23 | Int4LlamaForCausalLM(std::string param_path, const struct model_config config); 24 | struct Int4LlamaForCausalLM_output forward(const struct Int4LlamaForCausalLM_input& input); 25 | 26 | private: 27 | Int4llamaDecoder decoder; 28 | Linear_FP_int4 lm_head; 29 | std::string profile_name = "Int4LlamaForCausalLM"; 30 | float* logits_output; 31 | uint8_t* lm_head_weight; 32 | }; 33 | 
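The three *ForCausalLM headers share the same forward() contract: pass the token ids (plus any cached keys/values from the previous step) and receive logits together with the updated cache. The sketch below illustrates how that contract can be driven in a greedy decoding loop against the int4 LLaMA interface declared above. It is illustrative only — the repository's actual generation loop lives in Generate.h — and the weight path "models/LLaMA_7B", the helper name greedy_decode_sketch, and the literal eos id 2 are assumptions, not code from this tree.

#include <vector>
#include "Int4llamaForCausalLM.h"
#include "model.h"

// Illustrative sketch only: greedy decoding through the declared forward() interface.
std::vector<int> greedy_decode_sketch(std::vector<int> token_ids, int n_new_tokens) {
    Int4LlamaForCausalLM model("models/LLaMA_7B", get_opt_model_config(LLaMA_7B));  // assumed weight path
    std::vector<Matrix3D<float>> past_keys, past_values;

    for (int step = 0; step < n_new_tokens; step++) {
        // First pass feeds the whole prompt; later passes feed only the newest token
        // together with the key/value cache returned by the previous forward() call.
        int n_feed = past_keys.empty() ? (int)token_ids.size() : 1;
        Matrix3D<int> input_ids(token_ids.data() + token_ids.size() - n_feed, 1, 1, n_feed);
        Int4LlamaForCausalLM_input input =
            past_keys.empty() ? Int4LlamaForCausalLM_input(input_ids)
                              : Int4LlamaForCausalLM_input(input_ids, past_keys, past_values);

        Int4LlamaForCausalLM_output out = model.forward(input);
        past_keys = out.past_keys;
        past_values = out.past_values;

        // Greedy pick: argmax over the vocabulary at the last position of the logits.
        int last = out.logits.m_dim_y - 1, best = 0;
        for (int v = 1; v < out.logits.m_dim_z; v++)
            if (out.logits(0, last, v) > out.logits(0, last, best)) best = v;
        token_ids.push_back(best);
        if (best == 2) break;  // assumed eos id, cf. llama_token_eos() in LLaMATokenizer.h
    }
    return token_ids;
}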
-------------------------------------------------------------------------------- /transformer/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of implementation 4 | # 0: reference 5 | # 1: loop_unrolling 6 | # 2: multithreading 7 | # 3: simd_programming 8 | # 4: multithreading_loop_unrolling 9 | # 5: all_techniques 10 | keys=("reference" "loop_unrolling" "multithreading" "simd_programming" "multithreading_loop_unrolling" "all_techniques") 11 | values=("0" "1" "2" "3" "4" "5") 12 | 13 | # If a implementation is provided to the script, map it to the corresponding argument 14 | if [ "$#" -eq 1 ]; then 15 | found=0 16 | for i in "${!keys[@]}"; do 17 | if [ "${keys[$i]}" = "$1" ]; then 18 | test_args=("${values[$i]}") 19 | found=1 20 | break 21 | fi 22 | done 23 | if [ "$found" -eq 0 ]; then 24 | echo "Invalid implementation. Please provide a valid key from the mapping." 25 | exit 1 26 | fi 27 | else 28 | # If no argument is provided, use all values 29 | test_args=("${values[@]}") 30 | fi 31 | 32 | 33 | # Run the program with different arguments 34 | for arg in "${test_args[@]}"; do 35 | make clean 36 | make chat test_linear -j IMP="$arg" 37 | # Check if make was successful 38 | if [ $? -ne 0 ]; then 39 | echo "Compilation failed!" 40 | exit 1 41 | fi 42 | ./test_linear 43 | echo "" 44 | done 45 | 46 | echo "All tests completed!" 47 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4_imp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "Foundation/Foundation.hpp" 6 | #include "Metal/Metal.hpp" 7 | #include "include/opParams.h" 8 | 9 | typedef struct { 10 | float *A, *C, *scales, *offset; 11 | unsigned char *B; 12 | } MetalMatmulBuffers; 13 | 14 | class MetalMatmulInt4IMP { 15 | public: 16 | static MTL::Device *_mDevice; 17 | 18 | // The compute pipeline generated from the compute kernel in the .metal shader file. 19 | static MTL::ComputePipelineState *_mMatmulFunctionPSO; 20 | 21 | // The command queue used to pass commands to the device. 22 | static MTL::CommandQueue *_mCommandQueue; 23 | 24 | // Buffers to hold data. 
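// (Note, inferred from the declarations that follow: in contrast to the MetalMatmulInt4
// benchmark class, all state here is static; the device, pipeline state and command queue
// are presumably set up once in init(), guarded by has_init, and then reused by every
// run() call rather than rebuilt per matmul.)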
25 | static MTL::Buffer *_mBufferA; 26 | static MTL::Buffer *_mBufferB; 27 | static MTL::Buffer *_mBufferScales; 28 | static MTL::Buffer *_mBufferResult; 29 | static MTL::Buffer *_mParams; 30 | 31 | static std::unordered_map _mumap; 32 | 33 | static bool has_init; 34 | static void init(); 35 | static void run(MetalMatMulParams param, MetalMatmulBuffers *bufferParams); 36 | static void *allocateSharedMem(size_t size); 37 | 38 | static MetalMatMulParams *_mParamsPtr; 39 | static void sendComputeCommand(); 40 | static void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 41 | static MTL::Buffer *getBufferfromPtr(void *ptr); 42 | }; 43 | -------------------------------------------------------------------------------- /kernels/matmul_imp.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "matmul.h" 10 | 11 | namespace matmul { 12 | 13 | void MatmulOperator::CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 14 | assert(A->column == B->row); 15 | assert(C->column == B->column); 16 | assert(C->row == A->row); 17 | } 18 | 19 | void MatmulOperator::CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 20 | assert(B->row * B->column == A->column * C->column / 2); 21 | assert(C->row == A->row); 22 | } 23 | 24 | void MatmulOperator::mat_mul_transposed(const struct matmul_params *params) { 25 | int i, j, k; 26 | 27 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 28 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 29 | 30 | for (i = 0; i < C->row; i++) 31 | for (j = 0; j < C->column; j++) { 32 | float acc = 0; 33 | for (k = 0; k < A->column; k++) acc += data_A[i * A->column + k] * data_B[j * B->column + k]; 34 | data_C[i * C->column + j] = acc; 35 | } 36 | } 37 | 38 | float interval_to_ms(struct timeval *start, struct timeval *end) { 39 | float us_seconds = (end->tv_sec - start->tv_sec) * 1000000 + (end->tv_usec - start->tv_usec); 40 | return us_seconds / 1000; 41 | } 42 | 43 | } // namespace matmul 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", "pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | 
- id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /transformer/src/ops/softmax.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | void softmax(const Matrix3D &input, Matrix3D &output, const int dim) { 5 | PROFILE_START("softmax"); 6 | int len = input.length(); 7 | 8 | if (dim == 2) { 9 | // Find the maximum value in the input array 10 | for (int i = 0; i < input.m_dim_x; i++) { 11 | for (int j = 0; j < input.m_dim_y; j++) { 12 | float max_value = input.m_data[0]; 13 | float sum = 0; 14 | // Find the maximum value in the input array 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | float value = input(i, j, k); 17 | if (value > max_value) { 18 | max_value = value; 19 | } 20 | } 21 | 22 | // Compute the softmax values 23 | for (int k = 0; k < input.m_dim_z; k++) { 24 | float value = input(i, j, k); 25 | sum += std::exp(value - max_value); 26 | } 27 | 28 | // Normalize the softmax values and store them in the output array 29 | for (int k = 0; k < input.m_dim_z; k++) { 30 | float value = input(i, j, k); 31 | output(i, j, k) = (std::exp(value - max_value) / sum); 32 | } 33 | } 34 | } 35 | } else { 36 | throw("Unsupported dimension for softmax"); 37 | } 38 | PROFILE_END("softmax"); 39 | } -------------------------------------------------------------------------------- /transformer/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", "pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | - id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | extend-exclude = "codegen/.*" 5 | 6 | [tool.isort] 7 | profile = "black" 8 | known_first_party = ["code_generator"] 9 | extend_skip = ["codegen"] 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 120 16 | 17 | 
[tool.pylint] 18 | [tool.pylint.master] 19 | ignore-paths = ["codegen"] 20 | [tool.pylint.messages_control] 21 | disable = [ 22 | "C0103", 23 | "C0114", 24 | "C0115", 25 | "C0116", 26 | "C0123", 27 | "C0209", 28 | "C0330", 29 | "C0301", 30 | "C0302", 31 | "C0411", 32 | "C0415", 33 | "E0401", 34 | "E1121", 35 | "E1123", 36 | "E1101", 37 | "R", 38 | "W" 39 | ] 40 | [tool.pylint.basic] 41 | good-names-rgxs = "^[_a-z][_a-z0-9]?$" # allow 1 or 2 character names 42 | [tool.pylint.format] 43 | max-line-length = 120 44 | max-module-lines = 5000 45 | [tool.pylint.design] 46 | max-args = 10 47 | max-attributes = 15 48 | max-parents = 10 49 | 50 | [tool.mypy] 51 | files = "." 52 | exclude ="codegen/.*" 53 | install_types = true 54 | non_interactive = true 55 | show_error_codes = true 56 | disable_error_code = [ 57 | "import", 58 | "assignment", 59 | "operator", 60 | "has-type", 61 | "var-annotated", 62 | "operator", 63 | "call-arg", 64 | ] 65 | explicit_package_bases = true 66 | namespace_packages = true 67 | -------------------------------------------------------------------------------- /kernels/ref/matmul_ref_int4.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 12 | int i, j, k; 13 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 14 | const int block_size = params->block_size; 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | assert(params->block_size == 32); // support block size 32 for now 18 | 19 | for (i = 0; i < C->row; i++) { 20 | for (j = 0; j < C->column; j++) { 21 | float acc = 0; 22 | for (k = 0; k < B->row; k += block_size) { 23 | float s = scale[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 24 | float o = offset[j * (B->row / 16) + k / 32]; 25 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 26 | float *x_ptr = &A->data_ptr[i * A->column + k]; 27 | for (int qi = 0; qi < block_size / 2; qi++) { 28 | uint8_t packed_int4 = weight_32_int4[qi]; 29 | float deq_0 = (float)(packed_int4 & 0x0F) * s + o; 30 | float deq_1 = (float)(packed_int4 >> 4) * s + o; 31 | acc += *x_ptr++ * deq_0; 32 | acc += *x_ptr++ * deq_1; 33 | } 34 | } 35 | C->data_ptr[i * C->column + j] = acc; 36 | } 37 | } 38 | }; 39 | 40 | } // namespace matmul 41 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | OPTForCausalLM::OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Int8OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct OPTForCausalLM_output OPTForCausalLM::forward(const struct OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int8OPTDecoder_output decoder_output; 22 | 23 | if 
(input.has_past_keys_values) { 24 | struct Int8OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int8OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32llamaDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32llamaDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Fp32llamaDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32llamaDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Fp32llamaDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Fp32llamaDecoder { 27 | public: 28 | Fp32llamaDecoder(std::string param_path, const struct model_config config); 29 | Fp32llamaDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | struct Fp32llamaDecoder_output forward(const struct Fp32llamaDecoder_input& input); 32 | Embedding embed_tokens; 33 | LlamaRMSNorm norm; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | std::string profile_name = "Fp32llamaDecoder"; 37 | 38 | private: 39 | float* attention_mask_buf; 40 | float* pos_embeds_buf; 41 | float* last_hidden_states_buf; 42 | float* hidden_states_buf; 43 | }; 44 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Int4llamaDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Int4llamaDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Int4llamaDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Int4llamaDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Int4llamaDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Int4llamaDecoder { 27 | public: 28 | Int4llamaDecoder(std::string param_path, const struct model_config config); 29 | Int4llamaDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | struct Int4llamaDecoder_output forward(const struct 
Int4llamaDecoder_input& input); 32 | Embedding embed_tokens; 33 | LlamaRMSNorm norm; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | std::string profile_name = "Int4llamaDecoder"; 37 | 38 | private: 39 | float* attention_mask_buf; 40 | float* pos_embeds_buf; 41 | float* last_hidden_states_buf; 42 | float* hidden_states_buf; 43 | }; 44 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32LlamaForCausalLM::Fp32LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32LlamaForCausalLM_output Fp32LlamaForCausalLM::forward(const struct Fp32LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32llamaDecoder_output decoder_output; 22 | 23 | // Call decoder 24 | if (input.has_past_keys_values) { 25 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | 28 | } else { 29 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids}; 30 | decoder_output = this->decoder.forward(decoder_input); 31 | } 32 | 33 | // Get logits 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Fp32LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Int8OPTDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Int8OPTDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Int8OPTDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Int8OPTDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Int8OPTDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Int8OPTDecoder { 27 | public: 28 | Int8OPTDecoder(std::string param_path, const struct model_config config); 29 | Int8OPTDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | Matrix3D get_position_embed(int sql_length, int past_length); 32 | struct Int8OPTDecoder_output forward(const struct Int8OPTDecoder_input& input); 33 | 
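    // Note the contrast with the LLaMA decoders above: OPT carries a second, learned
    // absolute-position embedding table (embed_positions, fetched per step via
    // get_position_embed), whereas the LLaMA path applies RotaryPosEmb inside attention.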
Embedding embed_tokens, embed_positions; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | LayerNorm final_layer_norm; 37 | std::string profile_name = "Int8OPTDecoder"; 38 | 39 | private: 40 | float* attention_mask_buf; 41 | float* pos_embeds_buf; 42 | float* last_hidden_states_buf; 43 | float* hidden_states_buf; 44 | }; 45 | -------------------------------------------------------------------------------- /transformer/download_assets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of files to download, their corresponding MD5 checksums, and target local paths 4 | files_and_checksums=( 5 | "https://www.dropbox.com/s/8q5cupqw00twvoa/assets.zip 6014d43716e6516a4f7b7161088d3e74 assets.zip" 6 | ) 7 | 8 | OS=`uname` 9 | 10 | # Function to download a file if it doesn't exist or if its MD5 checksum is incorrect 11 | download_if_needed() { 12 | url="$1" 13 | expected_md5="$2" 14 | target_path="$3" 15 | 16 | # Ensure the target directory exists 17 | target_dir=$(dirname "$target_path") 18 | mkdir -p "$target_dir" 19 | 20 | # Download the file if it does not exist 21 | if [ ! -e "$target_path" ]; then 22 | echo "File '$target_path' does not exist. Downloading..." 23 | wget -q -O "$target_path" "$url" 24 | fi 25 | 26 | # Use md5 on MacOS 27 | if [ $OS = "Darwin" ] 28 | then 29 | actual_md5=$(md5 -q "$target_path") 30 | # Use md5sum on Ubuntu 31 | elif [ $OS = "Linux" ] 32 | then 33 | actual_md5=$(md5sum "$target_path" | cut -d ' ' -f1) 34 | fi 35 | 36 | if [ "$actual_md5" != "$expected_md5" ]; then 37 | echo "MD5 checksum for '$target_path' is incorrect. Downloading again..." 38 | wget -q -O "$target_path" "$url" 39 | else 40 | echo "File '$target_path' exists and its MD5 checksum is correct." 
41 | fi 42 | } 43 | 44 | # Process each file, its corresponding MD5 checksum, and target local path 45 | for file_and_checksum in "${files_and_checksums[@]}"; do 46 | url=$(echo "$file_and_checksum" | awk '{ print $1 }') 47 | expected_md5=$(echo "$file_and_checksum" | awk '{ print $2 }') 48 | target_path=$(echo "$file_and_checksum" | awk '{ print $3 }') 49 | 50 | download_if_needed "$url" "$expected_md5" "$target_path" 51 | unzip "$target_path" 52 | done 53 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(uint8_t)) / 2); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int4llamaDecoder_output decoder_output; 22 | 23 | // Call decoder 24 | if (input.has_past_keys_values) { 25 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | 28 | } else { 29 | struct Int4llamaDecoder_input decoder_input = {input.input_ids}; 30 | decoder_output = this->decoder.forward(decoder_input); 31 | } 32 | 33 | // Get logits 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /transformer/include/model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_H 2 | #define MODEL_H 3 | #include 4 | 5 | struct model_config { 6 | int batch; 7 | int num_heads; 8 | int num_layers; 9 | int max_sqlen; 10 | int embed_dim; 11 | int hidden_dim; 12 | int vocsize; 13 | int padding_idx; 14 | int qk; // group size 15 | 16 | model_config() : model_config(1, 12, 12, 512, 768, 3072, 50272, 1) {} 17 | model_config(int batch, int num_heads, int num_layers, int max_sqlen, int embed_dim, int hidden_dim, int vocsize, 18 | int padding_idx) 19 | : batch(batch), 20 | num_heads(num_heads), 21 | num_layers(num_layers), 22 | max_sqlen(max_sqlen), 23 | embed_dim(embed_dim), 24 | hidden_dim(hidden_dim), 25 | vocsize(vocsize), 26 | padding_idx(padding_idx) {} 27 | }; 28 | 29 | enum { OPT_125M, OPT_1_3B, OPT_6_7B, LLaMA_7B }; 30 | enum { FP32, INT8, INT4 }; 31 | 32 | const struct model_config opt_6_7B(1, 32, 32, 2048, 4096, 16384, 50272, 1); 33 | const struct model_config opt_1_3B(1, 32, 24, 2048, 2048, 8192, 50272, 1); 34 | const struct model_config opt_125m(1, 12, 12, 2048, 768, 3072, 50272, 1); 35 | 
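// Positional arguments follow the model_config constructor above:
// (batch, num_heads, num_layers, max_sqlen, embed_dim, hidden_dim, vocsize, padding_idx);
// e.g. llama_7B below: 32 heads, 32 layers, 2048 max sequence length, 4096 embedding dim,
// 11008 FFN hidden dim, 32000-token vocabulary.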
const struct model_config llama_7B(1, 32, 32, 2048, 4096, 11008, 32000, 1); 36 | static struct model_config get_opt_model_config(int choise) { 37 | struct model_config ret; 38 | switch (choise) { 39 | case OPT_125M: 40 | ret = opt_125m; 41 | break; 42 | case OPT_1_3B: 43 | ret = opt_1_3B; 44 | break; 45 | case OPT_6_7B: 46 | ret = opt_6_7B; 47 | break; 48 | case LLaMA_7B:; 49 | ret = llama_7B; 50 | break; 51 | default: 52 | throw("Unsupported model choise."); 53 | break; 54 | } 55 | return ret; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /transformer/include/LLaMATokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LLaMA_TOKENIZER_H 2 | #define LLaMA_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | static int llama_token_bos() { return 1; } 14 | 15 | static int llama_token_eos() { return 2; } 16 | 17 | static int llama_token_nl() { return 13; } 18 | 19 | struct llama_vocab { 20 | struct token_score { 21 | std::string tok; 22 | float score; 23 | }; 24 | 25 | std::unordered_map token_to_id; 26 | std::vector id_to_token; 27 | }; 28 | 29 | /* 30 | * Tokenizer 31 | */ 32 | static size_t utf8_len(char src) { 33 | const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; 34 | uint8_t highbits = static_cast(src) >> 4; 35 | 36 | return lookup[highbits]; 37 | } 38 | 39 | struct llama_sp_symbol { 40 | using index = int; 41 | index prev; 42 | index next; 43 | const char* text; 44 | size_t n; 45 | }; 46 | 47 | struct llama_sp_bigram { 48 | struct comparator { 49 | bool operator()(llama_sp_bigram& l, llama_sp_bigram& r) { 50 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); 51 | } 52 | }; 53 | using queue_storage = std::vector; 54 | using queue = std::priority_queue; 55 | llama_sp_symbol::index left; 56 | llama_sp_symbol::index right; 57 | float score; 58 | size_t size; 59 | }; 60 | 61 | llama_vocab llama_init_vocab(const char* vocab_file); 62 | 63 | const char* llama_id_to_token(const llama_vocab& vocab, int id); 64 | 65 | int llama_tokenize(const llama_vocab& vocab, const char* text, int* tokens, int n_max_tokens, bool add_bos); 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /transformer/include/OPTTokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef OPT_TOKENIZER_H 2 | #define OPT_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | // #include // Tricky to support this in windows 22 | #include 23 | 24 | // std::vector OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos); 25 | 26 | struct pair_hash { 27 | template 28 | std::size_t operator()(const std::pair &p) const { 29 | auto h1 = std::hash{}(p.first); 30 | auto h2 = std::hash{}(p.second); 31 | return h1 ^ h2; 32 | } 33 | }; 34 | 35 | class Encoder { 36 | public: 37 | Encoder(std::map encoder, std::vector> bpe_merges); 38 | std::unordered_map bytes_to_unicode(); 39 | std::set> get_pairs(std::vector word); 40 | std::string bpe(std::string token); 41 | std::vector encode(std::string text); 42 | std::string decode(std::vector tokens); 43 | 44 | private: 45 | std::map encoder; 46 | 
std::map decoder; 47 | std::unordered_map byte_encoder; 48 | std::unordered_map byte_decoder; 49 | std::unordered_map, int, pair_hash> bpe_ranks; 50 | std::unordered_map cache; 51 | }; 52 | 53 | Encoder get_encoder(std::string vocab_file, std::string bpe_file); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /transformer/upload.py: -------------------------------------------------------------------------------- 1 | """Uploading models and asset to the dropbox storage. 2 | 3 | Example commandline: 4 | python upload.py 5 | """ 6 | import argparse 7 | import os 8 | 9 | import dropbox 10 | 11 | files_to_upload = [ 12 | "assets.zip", 13 | "models.zip", 14 | ] 15 | 16 | 17 | def subebackups(file_path, target_path, token): 18 | """Upload a file to the dropbox storage.""" 19 | dbx = dropbox.Dropbox(token, timeout=36000) 20 | file_size = os.path.getsize(file_path) 21 | CHUNK_SIZE = 50 * 1024 * 1024 22 | dest_path = target_path 23 | 24 | with open(file_path, "rb") as f: 25 | if file_size <= CHUNK_SIZE: 26 | dbx.files_upload(f.read(), dest_path) 27 | 28 | else: 29 | upload_session_start_result = dbx.files_upload_session_start(f.read(CHUNK_SIZE)) 30 | cursor = dropbox.files.UploadSessionCursor( 31 | session_id=upload_session_start_result.session_id, offset=f.tell() 32 | ) 33 | commit = dropbox.files.CommitInfo(path=dest_path, mode=dropbox.files.WriteMode("overwrite")) 34 | 35 | while f.tell() < file_size: 36 | if (file_size - f.tell()) <= CHUNK_SIZE: 37 | print(dbx.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)) 38 | else: 39 | dbx.files_upload_session_append(f.read(CHUNK_SIZE), cursor.session_id, cursor.offset) 40 | cursor.offset = f.tell() 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser(description="Upload a file to Dropbox.") 45 | parser.add_argument("token", help="Your Dropbox OAuth2 token.") 46 | args = parser.parse_args() 47 | 48 | db_prefix = "/MIT/transformer_assets/" 49 | local_prefix = "uploads" 50 | 51 | for file in files_to_upload: 52 | subebackups(file, db_prefix + file, args.token) 53 | -------------------------------------------------------------------------------- /transformer/include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "profiler.h" 11 | 12 | #define STATS_START(x) Profiler::getInstance().start(x) 13 | #define STATS_FLOPS(x, y) Profiler::getInstance().start(x, y) 14 | #define STATS_END(x) Profiler::getInstance().stop(x) 15 | 16 | #ifdef PROFILER 17 | #define PROFILE_START(x) Profiler::getInstance().start(x) 18 | #define PROFILE_START_FLOPS(x, y) Profiler::getInstance().start(x, y) 19 | #define PROFILE_END(x) Profiler::getInstance().stop(x) 20 | #else 21 | #define PROFILE_START(x) 22 | #define PROFILE_START_FLOPS(x, y) 23 | #define PROFILE_END(x) 24 | #endif 25 | 26 | #define MAX_SQ_ERROR_MAX 5e-6 27 | #define ERROR_MAX 1e-9 28 | #define INT_ERROR_MAX 1e-5 29 | 30 | template 31 | void read_to_array(const char* path, T* array, int size); 32 | 33 | template 34 | bool check_two_equal(T* array, T* array2, int size); 35 | 36 | template <> 37 | bool check_two_equal(int8_t* array, int8_t* array2, int size); 38 | 39 | bool check_two_equal(int8_t* array, int8_t* array2, int size, float error); 40 | 41 | bool check_two_equal(float* array, float* array2, int size, float error); 42 | bool check_two_exact_equal(int8_t* array, 
int8_t* array2, int size); 43 | void print_MSE_max_diff(float* a, float* a2, int size); 44 | 45 | void print_first_k_elelment(std::string name, const int8_t* arr, int k, int start_idx = 0); 46 | void print_first_k_elelment(std::string name, const int32_t* arr, int k, int start_idx = 0); 47 | void print_first_k_elelment(std::string name, const float* arr, int k, int start_idx = 0); 48 | 49 | #ifdef QM_METAL 50 | template 51 | void allocate_aligned_memory(T*& ptr, size_t size); 52 | #else 53 | template 54 | void allocate_aligned_memory(T*& ptr, size_t size); 55 | #endif 56 | 57 | void deallocate_memory(void* ptr); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32llamaAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Fp32llamaAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32llamaAttention { 32 | public: 33 | Fp32llamaAttention(std::string param_path, const struct model_config config); 34 | Fp32llamaAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32llamaAttention_output forward(const struct Fp32llamaAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape(Matrix3D unshape, Matrix3D shaped, int sqlen); 41 | int embed_dim, num_heads, head_dim; 42 | Linear_FP k_proj, v_proj, q_proj, o_proj; 43 | RotaryPosEmb rotary_pos_emb; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | std::string profile_name = "Fp32llamaAttention"; 46 | }; 47 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Int4llamaAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Int4llamaAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Int4llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Int4llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 
| : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Int4llamaAttention { 32 | public: 33 | Int4llamaAttention(std::string param_path, const struct model_config config); 34 | Int4llamaAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Int4llamaAttention_output forward(const struct Int4llamaAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape(Matrix3D unshape, Matrix3D shaped, int sqlen); 41 | int embed_dim, num_heads, head_dim; 42 | Linear_FP_int4 k_proj, v_proj, q_proj, o_proj; 43 | RotaryPosEmb rotary_pos_emb; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | std::string profile_name = "Int4llamaAttention"; 46 | }; 47 | -------------------------------------------------------------------------------- /transformer/src/ops/RotaryPosEmb.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | #include 3 | 4 | float q_buf[4096], k_buf[4096]; 5 | // TODO: optimize this with multithreading 6 | void RotaryPosEmb::forward(Matrix3D &query, Matrix3D &key, 7 | int start_idx, int len) { 8 | PROFILE_START(profile_name); 9 | int num_heads = query.m_dim_x; 10 | int head_embed = cos.m_dim_z; 11 | int max_sqlen = cos.m_dim_y; 12 | 13 | assert(query.m_dim_z == cos.m_dim_z); 14 | assert(key.m_dim_z == cos.m_dim_z); 15 | assert(max_sqlen > len + start_idx); 16 | 17 | // cos, sin = self.rotary_emb(key_states, seq_len=kv_seq_len) 18 | // query_states, key_states = apply_rotary_pos_emb(query_states, key_states, 19 | // cos, sin, position_ids) cos = cos[position_ids].unsqueeze(1) # [bs, 1, 20 | // seq_len, dim] sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] 21 | // q_embed = (q * cos) + (rotate_half(q) * sin) 22 | // k_embed = (k * cos) + (rotate_half(k) * sin) 23 | // x1 = x[..., : x.shape[-1] // 2] 24 | // x2 = x[..., x.shape[-1] // 2 :] 25 | // rotate_half: torch.cat((-x2, x1), dim=-1) 26 | 27 | int half = head_embed / 2; 28 | for (int b = 0; b < num_heads; b++) { 29 | for (int i = 0; i < len; i++) { 30 | // first half 31 | for (int j = 0; j < half; j++) { 32 | q_buf[j] = -1 * query(b, i, j + half); 33 | k_buf[j] = -1 * key(b, i, j + half); 34 | } 35 | // second half 36 | for (int j = half; j < head_embed; j++) { 37 | q_buf[j] = query(b, i, j - half); 38 | k_buf[j] = key(b, i, j - half); 39 | } 40 | 41 | for (int j = 0; j < head_embed; j++) { 42 | query(b, i, j) = ((query(b, i, j) * cos(0, i + start_idx, j)) + 43 | (q_buf[j] * sin(0, i + start_idx, j))); 44 | key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + 45 | (k_buf[j] * sin(0, i + start_idx, j))); 46 | } 47 | } 48 | } 49 | 50 | PROFILE_END(profile_name); 51 | } 52 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaDecoderLayer.h: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Fp32llamaDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Fp32llamaDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 
14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Fp32llamaDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Fp32llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Fp32llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Fp32llamaDecoderLayer { 40 | public: 41 | Fp32llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Fp32llamaDecoderLayer_output forward(const struct Fp32llamaDecoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LlamaRMSNorm input_layernorm, post_attention_layernorm; 46 | Linear_FP gate_proj, down_proj, up_proj; 47 | Fp32llamaAttention attn; 48 | std::string profile_name = "Fp32llamaDecoderLayer"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaDecoderLayer.h: -------------------------------------------------------------------------------- 1 | #include "Int4llamaAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Int4llamaDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Int4llamaDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Int4llamaDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Int4llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Int4llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Int4llamaDecoderLayer { 40 | public: 41 | Int4llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Int4llamaDecoderLayer_output forward(const struct Int4llamaDecoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LlamaRMSNorm input_layernorm, post_attention_layernorm; // from torch_int.nn 46 | Linear_FP_int4 gate_proj, down_proj, up_proj; 47 | Int4llamaAttention attn; 48 | std::string profile_name = "Int4llamaDecoderLayer"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/src/ops/LayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void 
load_LayerNorm(LayerNorm &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNorm::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 1e-5; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | float std_dev = sqrtl(squared_diff_sum / static_cast(x.m_dim_z) + eps); 39 | 40 | for (int k = 0; k < x.m_dim_z; k++) { 41 | float value = static_cast(x(i, j, k)); 42 | float fp_out = (((value - mean) / (std_dev)) * static_cast(weight(0, 0, k))) + 43 | static_cast(bias(0, 0, k)); 44 | output(i, j, k) = static_cast(fp_out); 45 | } 46 | } 47 | } 48 | PROFILE_END(profile_name); 49 | } 50 | -------------------------------------------------------------------------------- /transformer/src/ops/LayerNormQ.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNormQ(LayerNormQ &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNormQ::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 0.00001; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | 39 | float var = squared_diff_sum / static_cast(x.m_dim_z); 40 | float std_dev = sqrt(var + eps); 41 | 42 | for (int k = 0; k < x.m_dim_z; k++) { 43 | float value = static_cast(x(i, j, k)); 44 | float fp_out = ((value - mean) / (std_dev) * static_cast(weight(0, 0, k))) + 45 | static_cast(bias(0, 0, k)); 46 | output(i, j, k) = static_cast(std::round(fp_out)); 47 | } 48 | } 49 | } 50 | 51 | 
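// At this point every (batch, token) row has been normalized and quantized:
// out = round((x - mean) / sqrt(var + eps) * weight + bias), with mean and variance
// taken over the hidden dimension and the result cast to the quantized output type.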
PROFILE_END(profile_name); 52 | } 53 | -------------------------------------------------------------------------------- /transformer/tests/test_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | #include "utils_memalloc.h" 6 | 7 | void test_FPLinear_int4() { 8 | const int m = 1, n = 32000, k = 4096; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Matrix3D hidden_states(mem_buf.get_fpbuffer(m * k), 1, m, k); 13 | Matrix3D weight(mem_buf.get_fpbuffer(n * k), 1, n, k); 14 | Matrix3D outputGT(mem_buf.get_fpbuffer(m * n), 1, m, n); 15 | Matrix3D output(mem_buf.get_fpbuffer(m * n), 1, m, n); 16 | 17 | hidden_states.load("tests/assets/input.bin"); 18 | outputGT.load("tests/assets/output.bin"); 19 | 20 | // quantize the weight to int4 21 | Matrix3D int4_weight((uint8_t *)mem_buf.get_int8buffer(n * k / 2), 1, n, k / 2); 22 | // Linear_FP_int4 int4_op; 23 | Linear_FP_int4 int4_op = Linear_FP_int4(int4_weight, "INT4/models/LLaMA_7B_2_chat/lm_head/"); 24 | 25 | Matrix3D outputQ(mem_buf.get_fpbuffer(m * n), 1, m, n); 26 | Matrix3D outputQ_simd(mem_buf.get_fpbuffer(m * n), 1, m, n); 27 | Matrix3D outputQ_fast(mem_buf.get_fpbuffer(m * n), 1, m, n); 28 | 29 | // warm up 30 | for (int i = 0; i < 1; i++) { 31 | int4_op.forward(hidden_states, outputQ_fast); 32 | } 33 | 34 | const int flops = k * m * n * 2; 35 | int4_op.forward_ref(hidden_states, outputQ); 36 | 37 | for (int i = 0; i < 10; i++) { 38 | STATS_FLOPS(int4_op.profile_name, flops); 39 | int4_op.forward(hidden_states, outputQ_fast); 40 | STATS_END(int4_op.profile_name); 41 | } 42 | bool success = check_two_equal(outputQ.m_data, outputQ_fast.m_data, outputQ_fast.length(), 1e-3); 43 | 44 | if (!success) { 45 | std::cout << "-------- Sanity check of " << int4_op.profile_name << " implementation: Fail! -------- " 46 | << std::endl; 47 | exit(-1); 48 | } else 49 | std::cout << "-------- Sanity check of " << int4_op.profile_name << " implementation: Passed! 
-------- " 50 | << std::endl; 51 | } 52 | 53 | int main() { 54 | test_FPLinear_int4(); 55 | Profiler::getInstance().report_internal(); 56 | } 57 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Int8OPTAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Int8OPTAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Int8OPTAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Int8OPTAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Int8OPTAttention { 32 | public: 33 | Int8OPTAttention(std::string param_path, const struct model_config config, BMM_S8T_S8N_F32T &qk_bmm, 34 | BMM_S8T_S8N_S8T &pv_bmm, W8A8B8O8Linear &k_proj, W8A8B8O8Linear &v_proj, W8A8B8O8Linear &q_proj, 35 | W8A8BFP32OFP32Linear &out_proj); 36 | Int8OPTAttention() {} 37 | static void initialized_memory(const struct model_config config); 38 | struct Int8OPTAttention_output forward(const struct Int8OPTAttention_input &input); 39 | 40 | private: 41 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 42 | void shpae(Matrix3D unshape, Matrix3D shaped, int sqlen); 43 | int embed_dim, num_heads, head_dim; 44 | BMM_S8T_S8N_F32T qk_bmm; 45 | BMM_S8T_S8N_S8T pv_bmm; 46 | W8A8B8O8Linear k_proj, v_proj, q_proj; 47 | W8A8BFP32OFP32Linear out_proj; 48 | std::string profile_name = "Int8OPTAttention"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/tests/test_OPTTokenizer.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "OPTTokenizer.h" 4 | 5 | void test_OPTEncode() { 6 | std::string bpe_file = "models/opt_merges.txt"; 7 | std::string vocab_file = "models/opt_vocab.json"; 8 | 9 | Encoder encoder = get_encoder(vocab_file, bpe_file); 10 | std::vector encoded = encoder.encode( 11 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 12 | "don't have basic concepts."); 13 | std::vector encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 14 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 15 | bool is_equal = true; 16 | for (int i = 0; i < encoded.size(); i++) { 17 | if (encoded[i] != encoded_answer[i]) { 18 | is_equal = false; 19 | break; 20 | } 21 | } 22 | if (!is_equal) 23 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 24 | else 25 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 26 | } 27 | 28 | void test_OPTDecode() { 29 | std::string bpe_file = "models/opt_merges.txt"; 30 | std::string vocab_file = "models/opt_vocab.json"; 31 | ; 32 | 33 | Encoder encoder = get_encoder(vocab_file, bpe_file); 34 | std::vector encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 35 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 36 | std::string decoded = encoder.decode(encoded_answer); 37 | std::string decoded_answer = 38 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 39 | "don't have basic concepts."; 40 | bool is_equal = true; 41 | if (decoded != decoded_answer) is_equal = false; 42 | if (!is_equal) 43 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 44 | else 45 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 46 | } 47 | 48 | int main() { 49 | test_OPTEncode(); 50 | test_OPTDecode(); 51 | }; 52 | -------------------------------------------------------------------------------- /transformer/quantize_and_upload.py: -------------------------------------------------------------------------------- 1 | """A script to quantize supported models and updload to model zoo. 2 | 3 | Example usage: 4 | python quantize_and_upload.py --method --token 5 | 6 | Note: This script is for developers. 7 | """ 8 | import argparse 9 | import hashlib 10 | import os 11 | 12 | from upload import subebackups 13 | 14 | model_paths = ["models/LLaMA_7B", "models/LLaMA_7B_2_chat", "models/LLaMA_7B_AWQ"] 15 | 16 | quantized_dir = "INT4" 17 | db_prefix = "/MIT/transformer_assets/" 18 | 19 | 20 | def _get_md5sum(file_path): 21 | hash_md5 = hashlib.md5() 22 | with open(file_path, "rb") as f: 23 | for chunk in iter(lambda: f.read(4096), b""): 24 | hash_md5.update(chunk) 25 | return hash_md5.hexdigest() 26 | 27 | 28 | def main(): 29 | """Take arguments and quantize all models and upload to dropbox.""" 30 | 31 | def _get_parser(): 32 | parser = argparse.ArgumentParser(description="Quantize model") 33 | parser.add_argument("--method", type=str, help="Quantization method") 34 | parser.add_argument("--token", help="Your Dropbox OAuth2 token.") 35 | return parser 36 | 37 | parser = _get_parser() 38 | args = parser.parse_args() 39 | 40 | if args.method not in ["QM_x86", "QM_ARM"]: 41 | raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM']") 42 | QM_method = args.method 43 | 44 | for model_path in model_paths: 45 | # quantize 46 | quantize_cmd = ( 47 | f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {quantized_dir}" 48 | ) 49 | os.system(quantize_cmd) 50 | # zip 51 | print("zipping...") 52 | model_name_size = model_path.rsplit("/", maxsplit=1)[-1] 53 | zip_path = model_name_size + ".zip" 54 | zip_cmd = f"zip -qq -r {zip_path} {os.path.join(quantized_dir, model_path)}" 55 | os.system(zip_cmd) 56 | # md5sum 57 | print(f"md5sum is {_get_md5sum(zip_path)}.") 58 | print("uploading...") 59 | # upload 60 | upload_path = os.path.join(db_prefix, QM_method, zip_path) 61 | subebackups(zip_path, upload_path, args.token) 62 | print("removing temporary zip file...") 63 | # rm zip 64 | os.system(f"rm {zip_path}") 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTDecoderLayer.h: 
-------------------------------------------------------------------------------- 1 | #include "Int8OPTAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Int8OPTDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Int8OPTDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Int8OPTDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Int8OPTDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Int8OPTDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Int8OPTDecoderLayer { 40 | public: 41 | Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx, 42 | LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, W8A8B8O8LinearReLU fc1, 43 | W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, BMM_S8T_S8N_S8T pv_bmm, 44 | W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, W8A8B8O8Linear q_proj, 45 | W8A8BFP32OFP32Linear out_proj); 46 | struct Int8OPTDecoderLayer_output forward(const struct Int8OPTDecoderLayer_input &input); 47 | 48 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 49 | LayerNormQ self_attn_layer_norm, final_layer_norm; // from torch_int.nn 50 | W8A8B8O8LinearReLU fc1; 51 | W8A8BFP32OFP32Linear fc2; 52 | Int8OPTAttention attn; 53 | std::string profile_name = "Int8OPTDecoderLayer"; 54 | }; 55 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_S8T_S8N_S8T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_S8T::BMM_S8T_S8N_S8T(struct BMM_S8T_S8N_S8T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_S8T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | 
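// Note: the quantization zero-points in this block all stay at 0 (symmetric int8),
// so the only runtime scaling the kernel has to apply is the single effective factor
// alpha = s_A * s_B / s_C that was folded into params.A.qparams.scale above.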
params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.int8_data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | 46 | // process each batch 47 | if (m == 1 && x.m_dim_x > 1) { 48 | // merge each batch 49 | params.A.row = x.m_dim_x; 50 | params.C.row = x.m_dim_x; 51 | // B is batched, need a new op for this! 52 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(¶ms); 53 | } else { 54 | for (int bz = 0; bz < x.m_dim_x; bz++) { 55 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(¶ms); 56 | params.A.int8_data_ptr += m * k; 57 | params.B.int8_data_ptr += k * n; 58 | params.C.int8_data_ptr += m * n; 59 | } 60 | } 61 | 62 | PROFILE_END(profile_name); 63 | } 64 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_S8T_S8N_F32T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_F32T::BMM_S8T_S8N_F32T(struct BMM_S8T_S8N_F32T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_F32T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | if (m == 1 && x.m_dim_x > 1) { 46 | // merge each batch 47 | params.A.row = x.m_dim_x; 48 | params.C.row = x.m_dim_x; 49 | // B is batched, need a new op for this! 
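// In the generation phase each batch contributes a single activation row (m == 1),
// so the rows are packed into one (b x k) activation matrix; because the weight B
// differs per batch, the dedicated *_batch kernel below is used instead of the
// per-batch pointer-advancing loop in the else branch.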
50 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(¶ms); 51 | } else { 52 | // process each batch 53 | for (int bz = 0; bz < x.m_dim_x; bz++) { 54 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(¶ms); 55 | params.A.int8_data_ptr += m * k; 56 | params.B.int8_data_ptr += k * n; 57 | params.C.data_ptr += m * n; 58 | } 59 | } 60 | 61 | PROFILE_END(profile_name); 62 | } 63 | -------------------------------------------------------------------------------- /transformer/include/profiler.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | class Profiler { 7 | public: 8 | static Profiler& getInstance() { 9 | static Profiler instance; 10 | return instance; 11 | } 12 | 13 | void start(const std::string& section) { start_times[section] = std::chrono::high_resolution_clock::now(); } 14 | 15 | void start(const std::string& section, const long long section_flops) { 16 | start_times[section] = std::chrono::high_resolution_clock::now(); 17 | if (flops.count(section) == 0) 18 | flops[section] = section_flops; 19 | else 20 | flops[section] += section_flops; 21 | } 22 | 23 | void reset() { 24 | start_times.clear(); 25 | durations.clear(); 26 | counts.clear(); 27 | flops.clear(); 28 | } 29 | 30 | void stop(const std::string& section) { 31 | auto end_time = std::chrono::high_resolution_clock::now(); 32 | auto duration = std::chrono::duration_cast(end_time - start_times[section]).count(); 33 | durations[section] += duration; 34 | counts[section]++; 35 | } 36 | 37 | void report_internal() const { 38 | std::cout << "Section, Total time(ms), Average time(ms), Count, GOPs" << std::endl; 39 | for (const auto& entry : durations) { 40 | std::string row; 41 | row += entry.first + ", "; 42 | row += std::to_string((float)(entry.second) / 1000) + ", "; 43 | row += std::to_string((float)(entry.second / counts.at(entry.first)) / 1000) + ", "; 44 | if (flops.count(entry.first) == 0) 45 | row += std::to_string(counts.at(entry.first)) + ", N/A"; 46 | else { 47 | row += std::to_string(counts.at(entry.first)) + ", "; 48 | // ops and microsecond 49 | row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); 50 | } 51 | std::cout << row << std::endl; 52 | } 53 | } 54 | 55 | void report() const { 56 | #ifdef PROFILER 57 | report_internal(); 58 | #endif 59 | } 60 | 61 | private: 62 | Profiler() {} 63 | Profiler(const Profiler&) = delete; 64 | Profiler& operator=(const Profiler&) = delete; 65 | 66 | std::map start_times; 67 | std::map flops; 68 | std::map durations; 69 | std::map counts; 70 | }; 71 | -------------------------------------------------------------------------------- /transformer/Makefile: -------------------------------------------------------------------------------- 1 | # Compiler and flags 2 | CXX = g++ 3 | CXXFLAGS = -std=c++11 -pthread -g -O0 -w 4 | CXXFLAGS += -DIMP=$(IMP) 5 | 6 | # Executable and source files 7 | TEST_TARGET = test_linear 8 | PROFILE_TARGET = 9 | APP_TARGET = chat 10 | TARGET = $(TEST_TARGET) $(PROFILE_TARGET) $(APP_TARGET) 11 | 12 | BUILDDIR := build/transformer 13 | PROFILEDIR := build_profile/transformer 14 | LIB_DIR = ../kernels 15 | LIB_SRC = $(wildcard $(LIB_DIR)/*.cc) 16 | INCLUDE_DIRS = -I$(LIB_DIR) -I./include -I./include/nn_modules -I./json/single_include/ 17 | LIB = 18 | 19 | ifeq ($(shell uname -m),x86_64) 20 | # For Intel machines with AVX 21 | CXXFLAGS += -mavx2 -mfma -ffast-math -fpermissive -DQM_x86 22 | LIB_SRC += 
$(wildcard $(LIB_DIR)/avx/*.cc) 23 | else ifeq ($(shell uname -p),arm) 24 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 25 | LIB += -L/opt/homebrew/opt/boost/lib 26 | CXXFLAGS += -march=native -DQM_ARM -fPIC -march=armv8.2-a+dotprod 27 | INCLUDE_DIRS += -I/opt/homebrew/opt/boost/include 28 | LIB_SRC += $(wildcard $(LIB_DIR)/neon/*.cc) 29 | else 30 | @echo "Device unsupported!. 31 | LIB_REF_SRC = $(wildcard $(LIB_DIR)/ref/*.cc) 32 | LIB_SRC += $(LIB_REF_SRC) 33 | endif 34 | LIB_REF_SRC = $(wildcard $(LIB_DIR)/starter_code/*.cc) 35 | LIB_SRC += $(LIB_REF_SRC) 36 | 37 | SRC_DIR = src 38 | SRC = $(wildcard src/*.cc) 39 | SRC += $(wildcard src/nn_modules/*.cc) 40 | OPS = $(wildcard src/ops/*.cc) 41 | SRC += $(OPS) 42 | SRC += $(LIB_SRC) 43 | 44 | # Default target 45 | all: $(TARGET) 46 | 47 | # Phony targets 48 | .PHONY: all clean 49 | 50 | # Metal lib 51 | library.air: $(LIB_DIR)/metal/kernel/op.metal 52 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(LIB_ACC_INC) -c $< -o library.air 53 | default.metallib: library.air 54 | xcrun -sdk macosx metallib library.air -o default.metallib 55 | 56 | OBJS = $(addprefix $(BUILDDIR)/,$(SRC:.cc=.o)) 57 | PROFILE_OBJS = $(addprefix $(PROFILEDIR)/,$(SRC:.cc=.o)) 58 | 59 | # Pattern rules 60 | $(BUILDDIR)/%.o: %.cc 61 | @mkdir -p $(dir $@) 62 | @$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -c $< -o $@ 63 | 64 | $(PROFILEDIR)/%.o: %.cc 65 | @mkdir -p $(dir $@) 66 | @$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -DPROFILER -c $< -o $@ 67 | 68 | # Linking 69 | # Rule for TEST_TARGET 70 | $(TEST_TARGET): %: tests/%.cc $(OBJS) 71 | @ $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS) 72 | 73 | # Rule for APP_TARGET 74 | $(APP_TARGET): %: application/%.cc $(OBJS) 75 | @ $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS) 76 | 77 | # Clean up 78 | clean: 79 | @ rm -f $(TARGET) 80 | @ rm -rf *.dSYM 81 | @ rm -rf build/kernels 82 | @ rm -rf $(BUILDDIR) 83 | @ rm -rf $(PROFILEDIR) 84 | -------------------------------------------------------------------------------- /transformer/include/ops/linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | class Linear_FP { 4 | public: 5 | Linear_FP(Matrix3D weight_, std::string weight_path) : weight(weight_) { 6 | read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length()); 7 | }; 8 | Linear_FP(){}; 9 | void forward(const Matrix3D &x, Matrix3D &output); 10 | Matrix3D weight; 11 | 12 | private: 13 | std::string profile_name = "Linear_FP"; 14 | }; 15 | 16 | class Linear_FP_int4 { 17 | public: 18 | Linear_FP_int4(Matrix3D weight_, std::string weight_path) : weight(weight_) { 19 | float *scale_ptr, *zero_point_ptr; 20 | float *offset_ptr; 21 | // length of int8_t weight = elements / 2 22 | // length of scales/offset = elements / QK = weight / (QK/2) 23 | // length of zero_point = 1 24 | assert((weight.m_dim_z * 2) % (QK) == 0); 25 | allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK); 26 | allocate_aligned_memory(offset_ptr, (this->weight.length() * 2 * sizeof(float)) / QK); 27 | allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float)); 28 | 29 | int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK; 30 | scale = Matrix3D(scale_ptr, x, y, z); 31 | offset = Matrix3D(offset_ptr, x, y, z); 32 | zero_point = Matrix3D(zero_point_ptr, 1, 1, 1); 33 | weight.load((weight_path + "/weight_int4.bin").c_str()); 34 | offset.load((weight_path + 
"/offset_int4.bin").c_str()); 35 | scale.load((weight_path + "/scaling_factor_int4.bin").c_str()); 36 | zero_point.load((weight_path + "/zero_point_int4.bin").c_str()); 37 | }; 38 | Linear_FP_int4(){}; 39 | void forward(const Matrix3D &x, Matrix3D &output); 40 | void forward_ref(const Matrix3D &x, Matrix3D &output); 41 | void forward_fast(const Matrix3D &x, Matrix3D &output); 42 | static void initialize_memory(const int block_size); 43 | Matrix3D weight; 44 | Matrix3D scale, zero_point; 45 | Matrix3D offset; 46 | 47 | #if IMP == 0 48 | std::string profile_name = "reference"; 49 | #elif IMP == 1 50 | std::string profile_name = "loop_unrolling"; 51 | #elif IMP == 2 52 | std::string profile_name = "multithreading"; 53 | #elif IMP == 3 54 | std::string profile_name = "simd_programming"; 55 | #elif IMP == 4 56 | std::string profile_name = "multithreading_loop_unrolling"; 57 | #elif IMP == 5 58 | std::string profile_name = "all_techniques"; 59 | #else 60 | std::string profile_name = "Unkown"; 61 | #endif 62 | }; 63 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Fp32llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | void test_Fp32LlamaForCausalLM() { 10 | struct model_config config = get_opt_model_config(LLaMA_7B); 11 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 12 | hidden_dim = config.hidden_dim; 13 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 14 | MemoryAllocator mem_buf; 15 | 16 | // reasoning phase: 1st run 17 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 18 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 19 | struct Fp32LlamaForCausalLM_input input_1st = {input_ids}; 20 | 21 | Fp32LlamaForCausalLM model = Fp32LlamaForCausalLM("models/LLaMA_7B", config); 22 | 23 | struct Fp32LlamaForCausalLM_output output_1st = model.forward(input_1st); 24 | 25 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 26 | logits.load("assets/llama/tests/model/1st_logits.bin"); 27 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 28 | // print_first_k_elelment("G", logits.m_data, 20); 29 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | Profiler::getInstance().report(); 34 | Profiler::getInstance().reset(); 35 | 36 | // generating phase: 2nd run 37 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 38 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 39 | struct Fp32LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 40 | 41 | struct Fp32LlamaForCausalLM_output output_2nd = model.forward(input_2nd); 42 | 43 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 44 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 45 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 46 | // print_first_k_elelment("G", logits.m_data, 20); 47 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 48 | 49 | Profiler::getInstance().report(); 50 | if (!success) 51 | 
std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 52 | else 53 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 54 | } 55 | 56 | int main() { test_Fp32LlamaForCausalLM(); } 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tutorial for TinyChat: Optimizing LLM on Edge Devices 2 | 3 | This is a lab for [efficientml.ai course](https://efficientml.ai/). 4 | 5 | Running large language models (LLMs) on the edge is of great importance. By embedding LLMs directly into real-world systems such as in-car entertainment systems or spaceship control interfaces, users can access instant responses and services without relying on a stable internet connection. Moreover, this approach alleviates the inconvenience of queuing delays often associated with cloud services. As such, running LLMs on the edge not only enhances user experience but also addresses privacy concerns, as sensitive data remains localized and reduces the risk of potential breaches. 6 | 7 | However, despite their impressive capabilities, LLMs have traditionally been quite resource-intensive. They require considerable computational power and memory resources, which makes it challenging to run these models on edge devices with limited capabilities. 8 | 9 | In this lab, you will learn the following: 10 | * How to deploy an LLaMA2-7B-chat with TinyChatEngine on your computer. 11 | * Implement different optimization techniques (loop unrolling, multithreading, and SIMD programming) for the linear kernel. 12 | * Observe the end-to-end latency improvement achieved by each technique. 13 | 14 | 15 | ## TinyChatEngine 16 | 17 | This tutorial is based on [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine), a powerful neural network library specifically designed for the efficient deployment of quantized large language models (LLMs) on edge devices. 18 | 19 | ![demo](assets/figures/chat.gif) 20 | 21 | ## Tutorial document 22 | 23 | Please check this document and follow the instructions which will walk you through the tutorial: https://docs.google.com/document/d/13IaTfPKjp0KiSBEhPdX9IxgXMIAZfiFjor37OWQJhMM/edit?usp=sharing 24 | 25 | ## Submission 26 | 27 | * Report: Please write a report ([form](https://docs.google.com/document/d/17Z_ab8EhDvjcigLXdDqMqd2LTVsZ4CnpOYNkRTrnTmU/edit?usp=sharing)) that includes your code and the performance improvement for each starter code. 28 | * Code: Use `git diff` to generate a patch for your implementation. We will use this patch to test the correctness of your code. Please name your patch as `{studentID}-{ISA}.patch` where {ISA} should be one of x86 and ARM, depending on your computer. 29 | 30 | ## Related Projects 31 | 32 | [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine). 33 | 34 | [TinyEngine](https://github.com/mit-han-lab/tinyengine). 35 | 36 | [Smoothquant](https://github.com/mit-han-lab/smoothquant). 
37 | 38 | [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://github.com/mit-han-lab/llm-awq) 39 | 40 | ## Acknowledgement 41 | 42 | [llama.cpp](https://github.com/ggerganov/llama.cpp) 43 | 44 | [transformers](https://github.com/huggingface/transformers) 45 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Int4llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | void test_Int4LlamaForCausalLM() { 10 | struct model_config config = get_opt_model_config(LLaMA_7B); 11 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 12 | hidden_dim = config.hidden_dim; 13 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 14 | MemoryAllocator mem_buf; 15 | 16 | // reasoning phase: 1st run 17 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 18 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 19 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 20 | 21 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("models/LLaMA_7B", config); 22 | 23 | struct Int4LlamaForCausalLM_output output_1st = model.forward(input_1st); 24 | 25 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 26 | logits.load("assets/llama/tests/model/1st_logits.bin"); 27 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 28 | // print_first_k_elelment("G", logits.m_data, 20); 29 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | Profiler::getInstance().report(); 34 | Profiler::getInstance().reset(); 35 | 36 | // generating phase: 2nd run 37 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 38 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 39 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 40 | 41 | struct Int4LlamaForCausalLM_output output_2nd; 42 | for (int i = 0; i < 10; i++) output_2nd = model.forward(input_2nd); 43 | 44 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 45 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 46 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 47 | // print_first_k_elelment("G", logits.m_data, 20); 48 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 49 | 50 | Profiler::getInstance().report(); 51 | if (!success) 52 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 53 | else 54 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 55 | } 56 | 57 | int main() { 58 | // This tests are directly from fp32 and are not completed yet! 
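// (The golden logits loaded above come from the fp32 model under
// assets/llama/tests/model/, so the int4 path may not match them to the 1e-8
// tolerance used in check_two_equal — hence the "not completed yet" note above.)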
59 | test_Int4LlamaForCausalLM(); 60 | } 61 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8BFP32OFP32Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | } 11 | 12 | W8A8BFP32OFP32Linear::W8A8BFP32OFP32Linear(struct W8A8BFP32OFP32Linear_params &op_params) { 13 | Matrix3D weight = op_params.weight; 14 | Matrix3D bias = op_params.bias; 15 | alpha = op_params.alpha; 16 | 17 | int k = weight.m_dim_z, n = weight.m_dim_y; 18 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 19 | params.B.qparams.scale = 1.0; 20 | params.C.qparams.scale = 1.0; 21 | params.A.qparams.zero_point = 0; 22 | params.B.row = k; 23 | params.B.column = n; 24 | params.B.int8_data_ptr = weight.m_data; 25 | params.B.qparams.zero_point = 0; 26 | params.C.column = n; 27 | params.C.qparams.zero_point = 0; 28 | params.opt_params.blk_size = BLK_SIZE; 29 | params.opt_params.num_thread = NUM_THREAD; 30 | params.bias.data_ptr = bias.m_data; 31 | params.bias.row = 1; 32 | params.bias.column = bias.m_dim_z; 33 | } 34 | 35 | void W8A8BFP32OFP32Linear::forward(const Matrix3D &x, Matrix3D &output) { 36 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 37 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 38 | PROFILE_START_FLOPS(profile_name, ops); 39 | assert(output.m_dim_x == x.m_dim_x); 40 | assert(output.m_dim_y == x.m_dim_y); 41 | assert(output.m_dim_z == params.B.column); 42 | assert(x.m_dim_z == params.B.row); 43 | assert(output.m_dim_z == params.bias.column); 44 | 45 | params.A.row = m; 46 | params.A.column = k; 47 | params.A.int8_data_ptr = x.m_data; 48 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 49 | params.C.row = m; 50 | params.C.column = n; 51 | params.C.data_ptr = output.m_data; 52 | params.C.qparams.zero_point = 0; 53 | params.alpha = alpha; 54 | 55 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 56 | 57 | if (m == 1) { 58 | // let's loop over the column dim instead of row 59 | for (int bz = 0; bz < x.m_dim_x; bz++) { 60 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(¶ms); 61 | params.A.int8_data_ptr += m * k; 62 | params.C.data_ptr += m * n; 63 | } 64 | } else { 65 | for (int bz = 0; bz < x.m_dim_x; bz++) { 66 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(¶ms); 67 | params.A.int8_data_ptr += m * k; 68 | params.C.data_ptr += m * n; 69 | } 70 | } 71 | 72 | PROFILE_END(profile_name); 73 | } 74 | -------------------------------------------------------------------------------- /transformer/tests/test_LLaMATokenizer.cc: -------------------------------------------------------------------------------- 1 | #include "LLaMATokenizer.h" 2 | 3 | static const std::map> &test_LLaMATokenizer() { 4 | static std::map> llama_answer = { 5 | /* 1. */ { 6 | "Hello World", 7 | { 8 | 1, 9 | 10994, 10 | 2787, 11 | }, 12 | }, 13 | /* 2. 
*/ 14 | { 15 | " Hello World!", 16 | { 17 | 1, 18 | 15043, 19 | 2787, 20 | 29991, 21 | }, 22 | }, 23 | /* 3. */ 24 | { 25 | "This is Tiny LLM Engine.", 26 | { 27 | 1, 28 | 4013, 29 | 338, 30 | 323, 31 | 4901, 32 | 365, 33 | 26369, 34 | 10863, 35 | 29889, 36 | }, 37 | }, 38 | /* 4. */ 39 | { 40 | "Please introduce Massachusetts Institute of Technology (MIT)", 41 | { 42 | 1, 43 | 12148, 44 | 14944, 45 | 16167, 46 | 8907, 47 | 310, 48 | 17968, 49 | 313, 50 | 26349, 51 | 29897, 52 | }, 53 | }, 54 | /* 5. */ 55 | { 56 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume " 57 | "they don't have basic concepts.", 58 | { 59 | 1, 8893, 292, 263, 4700, 508, 367, 2309, 297, 29871, 29896, 29900, 2560, 6576, 29889, 910, 2643, 60 | 338, 363, 2498, 2305, 29892, 577, 591, 5251, 896, 1016, 29915, 29873, 505, 6996, 22001, 29889, 61 | }, 62 | }, 63 | }; 64 | 65 | return llama_answer; 66 | }; 67 | 68 | int main(int argc, char **argv) { 69 | // load the vocab 70 | const std::string fname = "models/llama_vocab.bin"; 71 | llama_vocab vocab = llama_init_vocab(fname.c_str()); 72 | 73 | bool is_equal; 74 | int test_count = 1; 75 | for (const auto &llama_answer : test_LLaMATokenizer()) { 76 | std::vector input_ids(llama_answer.first.size()); 77 | const int n = llama_tokenize(vocab, llama_answer.first.c_str(), input_ids.data(), input_ids.size(), true); 78 | input_ids.resize(n); 79 | 80 | is_equal = input_ids.size() == llama_answer.second.size(); 81 | 82 | for (int i = 0; i < (int)input_ids.size() && is_equal; ++i) { 83 | if (input_ids[i] != llama_answer.second[i]) { 84 | is_equal = false; 85 | } 86 | } 87 | 88 | test_count++; 89 | } 90 | 91 | if (!is_equal) 92 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 93 | else 94 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8B8O8LinearReLU.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 13 | } 14 | 15 | W8A8B8O8LinearReLU::W8A8B8O8LinearReLU(struct W8A8B8O8LinearReLU_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias_int8; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = 0; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = bias.m_dim_z; 36 | params.alpha = alpha; 37 | params.beta = op_params.beta; 38 | } 39 | 40 | void W8A8B8O8LinearReLU::forward(const Matrix3D &x, Matrix3D &output) { 41 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 42 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 43 | PROFILE_START_FLOPS(profile_name, ops); 44 | assert(output.m_dim_x == x.m_dim_x); 45 | assert(output.m_dim_y == x.m_dim_y); 46 | assert(output.m_dim_z == params.B.column); 47 | assert(x.m_dim_z == params.B.row); 48 | assert(output.m_dim_z == params.bias.column); 49 | 50 | params.A.row = m; 51 | params.A.column = k; 52 | params.A.int8_data_ptr = x.m_data; 53 | params.C.row = m; 54 | params.C.column = n; 55 | params.C.int8_data_ptr = output.m_data; 56 | params.A.qparams.scale = alpha; 57 | params.alpha = alpha; 58 | params.beta = beta; 59 | 60 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 61 | 62 | if (m == 1) { 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(¶ms); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(¶ms); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8B8O8Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | 
void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 13 | } 14 | 15 | W8A8B8O8Linear::W8A8B8O8Linear(struct W8A8B8O8Linear_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = -128; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = n; 36 | } 37 | 38 | void W8A8B8O8Linear::forward(const Matrix3D &x, Matrix3D &output) { 39 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 40 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 41 | PROFILE_START_FLOPS(profile_name, ops); 42 | assert(output.m_dim_x == x.m_dim_x); 43 | assert(output.m_dim_y == x.m_dim_y); 44 | assert(output.m_dim_z == params.B.column); 45 | assert(x.m_dim_z == params.B.row); 46 | assert(output.m_dim_z == params.bias.column); 47 | 48 | params.A.row = m; 49 | params.A.column = k; 50 | params.A.int8_data_ptr = x.m_data; 51 | params.C.row = m; 52 | params.C.column = n; 53 | params.C.int8_data_ptr = output.m_data; 54 | params.A.qparams.scale = alpha; 55 | params.alpha = alpha; 56 | params.beta = beta; 57 | 58 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 59 | 60 | // printf("W8A8B8O8Linear-m,n,k: %d, %d, %d\n", m,n,k); 61 | if (m == 1) { 62 | // params.opt_params.num_thread = 8; 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(¶ms); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(¶ms); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #define NS_PRIVATE_IMPLEMENTATION 10 | #define CA_PRIVATE_IMPLEMENTATION 11 | #define MTL_PRIVATE_IMPLEMENTATION 12 | #include "matmul_metal_int4_imp.h" 13 | 14 | namespace matmul { 15 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 16 | int i, j, k; 17 | const 
struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 18 | const int block_size = params->block_size; 19 | float *scale = params->scales, *offset = params->offset; 20 | 21 | assert(params->block_size == 32); // support block size 32 for now 22 | 23 | for (i = 0; i < C->row; i++) { 24 | for (j = 0; j < C->column; j++) { 25 | float acc = 0; 26 | for (k = 0; k < B->row * 2; k += block_size) { 27 | float s = scale[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 28 | float o = offset[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 29 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 30 | float *x_ptr = &A->data_ptr[i * A->column + k]; 31 | for (int qi = 0; qi < block_size / 2; qi += 4) { 32 | uint8_t packed_int4_0 = weight_32_int4[qi]; 33 | uint8_t packed_int4_1 = weight_32_int4[qi + 1]; 34 | uint8_t packed_int4_2 = weight_32_int4[qi + 2]; 35 | uint8_t packed_int4_3 = weight_32_int4[qi + 3]; 36 | float deq_0 = (float)((packed_int4_0 & 0x0F) - 8.0) * s + o; 37 | float deq_1 = (float)((packed_int4_1 & 0x0F) - 8.0) * s + o; 38 | float deq_2 = (float)((packed_int4_2 & 0x0F) - 8.0) * s + o; 39 | float deq_3 = (float)((packed_int4_3 & 0x0F) - 8.0) * s + o; 40 | float deq_4 = (float)((packed_int4_0 >> 4) - 8.0) * s + o; 41 | float deq_5 = (float)((packed_int4_1 >> 4) - 8.0) * s + o; 42 | float deq_6 = (float)((packed_int4_2 >> 4) - 8.0) * s + o; 43 | float deq_7 = (float)((packed_int4_3 >> 4) - 8.0) * s + o; 44 | acc += *x_ptr++ * deq_0; 45 | acc += *x_ptr++ * deq_1; 46 | acc += *x_ptr++ * deq_2; 47 | acc += *x_ptr++ * deq_3; 48 | acc += *x_ptr++ * deq_4; 49 | acc += *x_ptr++ * deq_5; 50 | acc += *x_ptr++ * deq_6; 51 | acc += *x_ptr++ * deq_7; 52 | } 53 | } 54 | C->data_ptr[i * C->column + j] = acc; 55 | } 56 | } 57 | }; 58 | 59 | void MatmulOperator::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { 60 | int i, j, k; 61 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 62 | const int block_size = params->block_size; 63 | float *scale = params->scales, *offset = params->offset; 64 | 65 | assert(params->block_size % 32 == 0); // support block size to be multiply of 32 66 | assert(A->row == C->row); // support block size to be multiply of 32 67 | 68 | MetalMatMulParams matmulparams = {(unsigned int)A->row, (unsigned int)C->column, (unsigned int)A->column, 69 | (unsigned int)block_size}; 70 | MetalMatmulBuffers bufferparams = {A->data_ptr, C->data_ptr, scale, offset, B->int4_data_ptr}; 71 | MetalMatmulInt4IMP::run(matmulparams, &bufferparams); 72 | }; 73 | } // namespace matmul 74 | -------------------------------------------------------------------------------- /transformer/include/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "model.h" 10 | 11 | #define MAX_LINEAR_LENGTH 1024 * 1024 * 16 // 16MB, TO BE REMOVED with better memory allocation! 
12 | #define DEBUG false 13 | 14 | #define DEBUG_INS(x) \ 15 | if (DEBUG) x 16 | 17 | #define QK 32 18 | 19 | struct pack_q4_tensor { 20 | uint8_t qx[QK / 2]; 21 | float scale; 22 | }; 23 | 24 | struct pack_q8_tensor { 25 | int8_t qx[QK]; 26 | float scale; 27 | }; 28 | 29 | template 30 | class Matrix3D { 31 | public: 32 | Matrix3D(T *data, int dim_x, int dim_y, int dim_z) : m_data(data), m_dim_x(dim_x), m_dim_y(dim_y), m_dim_z(dim_z) {} 33 | 34 | T &operator()(int x, int y, int z) { 35 | if (x < 0 || x >= m_dim_x || y < 0 || y >= m_dim_y || z < 0 || z >= m_dim_z) { 36 | printf("%d, %d, %d\n", x, y, z); 37 | printf("%d, %d, %d\n", m_dim_x, m_dim_y, m_dim_z); 38 | throw std::out_of_range("Matrix3D: Indices out of range."); 39 | } 40 | return m_data[x * m_dim_y * m_dim_z + y * m_dim_z + z]; 41 | } 42 | 43 | const T &operator()(int x, int y, int z) const { 44 | if (x < 0 || x >= m_dim_x || y < 0 || y >= m_dim_y || z < 0 || z >= m_dim_z) { 45 | printf("%d, %d, %d\n", x, y, z); 46 | printf("%d, %d, %d\n", m_dim_x, m_dim_y, m_dim_z); 47 | throw std::out_of_range("Matrix3D: Indices out of range."); 48 | } 49 | return m_data[x * m_dim_y * m_dim_z + y * m_dim_z + z]; 50 | } 51 | 52 | bool operator==(const Matrix3D &other) const { 53 | if (m_dim_x != other.m_dim_x || m_dim_y != other.m_dim_y || m_dim_z != other.m_dim_z) { 54 | return false; 55 | } 56 | 57 | for (int x = 0; x < m_dim_x; ++x) { 58 | for (int y = 0; y < m_dim_y; ++y) { 59 | for (int z = 0; z < m_dim_z; ++z) { 60 | if ((*this)(x, y, z) != other(x, y, z)) { 61 | return false; 62 | } 63 | } 64 | } 65 | } 66 | 67 | return true; 68 | } 69 | 70 | int length() const { return m_dim_x * m_dim_y * m_dim_z; } 71 | T sum() const { 72 | T sum = 0; 73 | for (int i = 0; i < this->length(); i++) { 74 | sum += this->m_data[i]; 75 | } 76 | return sum; 77 | } 78 | T sum(int size) const { 79 | T sum = 0; 80 | for (int i = 0; i < size; i++) { 81 | sum += this->m_data[i]; 82 | } 83 | return sum; 84 | } 85 | 86 | T sum(int size, int start_idx) const { 87 | T sum = 0; 88 | for (int i = 0; i < size; i++) { 89 | sum += this->m_data[start_idx + i]; 90 | } 91 | return sum; 92 | } 93 | 94 | void load(const char *path) { 95 | std::ifstream infile(path, std::ios::binary | std::ios::in); 96 | if (infile.fail()) { 97 | std::cout << strerror(errno) << ": " << path << std::endl; 98 | throw("Expected error..."); 99 | } else { 100 | infile.read(reinterpret_cast(this->m_data), this->length() * sizeof(T)); 101 | infile.close(); 102 | } 103 | } 104 | T *m_data; 105 | int m_dim_x, m_dim_y, m_dim_z; 106 | 107 | // Default constructor 108 | Matrix3D() { m_data = NULL; } 109 | }; 110 | 111 | static inline void debug_info(std::string s) { 112 | #ifdef DEBUG 113 | std::cout << s << std::endl; 114 | #endif 115 | } 116 | #endif 117 | -------------------------------------------------------------------------------- /kernels/cuda/matmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../matmul.h" 8 | 9 | #include 10 | #include 11 | #include "gemm_cuda.h" 12 | #include "dequantize.cuh" 13 | #include 14 | #include 15 | 16 | const int threadDim = 32; 17 | const int TILE_SIZE = threadDim; 18 | 19 | __global__ void matrixMul_blockC(float *A, float *B, float *C, int A_row, int A_column, int B_column){ 20 | int i = blockIdx.x * blockDim.x + threadIdx.x; 21 | int j = blockIdx.y * blockDim.y + threadIdx.y; 22 | 23 | float acc = 0; 24 | for (int k = 0; k < A_column; k++) 25 | 
acc += A[j * A_column + k] * B[k * B_column + i]; 26 | C[j * B_column +i] = acc; 27 | } 28 | 29 | __global__ void matrixMultiplyShared(const float *A, const float *B, float *C, int A_row, int A_column, int B_column) { 30 | int row = blockIdx.y * blockDim.y + threadIdx.y; 31 | int col = blockIdx.x * blockDim.x + threadIdx.x; 32 | 33 | __shared__ float As[TILE_SIZE][TILE_SIZE]; 34 | __shared__ float Bs[TILE_SIZE][TILE_SIZE]; 35 | 36 | float value = 0; 37 | 38 | for (int i = 0; i < A_column / TILE_SIZE; i++){ 39 | As[threadIdx.y][threadIdx.x] = A[(blockIdx.y * TILE_SIZE + threadIdx.y) * A_column + TILE_SIZE * i + threadIdx.x]; 40 | Bs[threadIdx.y][threadIdx.x] = B[(i * TILE_SIZE + threadIdx.y) * B_column + blockIdx.x * TILE_SIZE + threadIdx.x]; 41 | 42 | __syncthreads(); 43 | 44 | for (int k = 0; k < TILE_SIZE; k++) 45 | value += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 46 | 47 | __syncthreads(); 48 | } 49 | 50 | 51 | C[row * B_column + col] = value; 52 | } 53 | 54 | namespace matmul{ 55 | 56 | void MatmulOperator::mat_mul_cuda(const struct matmul_params *params){ 57 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 58 | assert(A->column == B->row); 59 | assert(C->column == B->column); 60 | assert(C->row == A->row); 61 | 62 | float *d_A; 63 | float *d_B; 64 | float *d_C; 65 | 66 | // Initailize C 67 | /*for (int i = 0; i < C->row; i++) 68 | for (int j = 0; j < C->column; j++) 69 | C->data_ptr[j + C->column * i] = 0;*/ 70 | 71 | // Allocate memory 72 | cudaMalloc(&d_A, A->column*A->row*sizeof(float)); 73 | cudaMalloc(&d_B, B->column*B->row*sizeof(float)); 74 | cudaMalloc(&d_C, C->column*C->row*sizeof(float)); 75 | 76 | // Copy data to GPU 77 | cudaMemcpy(d_A, A->data_ptr, A->column*A->row*sizeof(float), cudaMemcpyHostToDevice); 78 | cudaMemcpy(d_B, B->data_ptr, B->column*B->row*sizeof(float), cudaMemcpyHostToDevice); 79 | cudaMemcpy(d_C, C->data_ptr, C->column*C->row*sizeof(float), cudaMemcpyHostToDevice); 80 | 81 | // Make sure we can break the input matrix into blocks 82 | assert(A->column % threadDim == 0); 83 | assert(A->row % threadDim == 0); 84 | assert(B->column % threadDim == 0); 85 | const dim3 threadsPerBlock(threadDim, threadDim); 86 | const dim3 numBlocks(C->column / threadsPerBlock.x, C->row / threadsPerBlock.y); 87 | 88 | // Invoke the cuda imp. 
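// Annotation (not in the original source): launch configuration is one thread per element
// of C, grouped into 32x32 (threadDim) blocks, so the grid is (C->column / 32) x (C->row / 32);
// the asserts above guarantee the dimensions divide evenly. matrixMultiplyShared stages
// TILE_SIZE x TILE_SIZE tiles of A and B in shared memory to reduce global-memory traffic
// compared with the naive matrixMul_blockC kernel.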
89 | 90 | // struct timeval start, end; 91 | // gettimeofday(&start, NULL); 92 | //matrixMul_blockC<<< numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, A->row, A->column, B->column); 93 | matrixMultiplyShared<<< numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, A->row, A->column, B->column); 94 | cudaDeviceSynchronize(); 95 | // gettimeofday(&end, NULL); 96 | // int us = interval_to_us(&start, &end); 97 | // std::cout << "cuda kernel: " << us / 1000 << " ms" << std::endl; 98 | 99 | // Get the result back 100 | cudaMemcpy(C->data_ptr, d_C, C->column*C->row*sizeof(float), cudaMemcpyDeviceToHost); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /kernels/matmul.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void quantize_fp32_to_int8(float *A, int8_t *qA, float *sA, int size, int block_size); 5 | 6 | // Data structures 7 | struct quantization_params { 8 | float scale; 9 | bool per_channel = false; 10 | int32_t zero_point; 11 | int8_t q_min = -128, q_max = 127; 12 | }; 13 | 14 | struct matrix { 15 | int row; 16 | int column; 17 | float *data_ptr; 18 | int32_t *int32_data_ptr; 19 | int8_t *int8_data_ptr; 20 | uint8_t *uint8_data_ptr; 21 | uint8_t *int4_data_ptr; 22 | struct quantization_params qparams; 23 | int length() { return row * column; } 24 | }; 25 | 26 | struct optimization_params { 27 | int blk_size; 28 | int num_thread = 8; 29 | }; 30 | 31 | struct matmul_params { 32 | struct matrix A, B, C, bias; 33 | struct optimization_params opt_params; 34 | float alpha, beta; 35 | // for int4 36 | float *scales, *offset, *zero_point; 37 | int block_size; 38 | // for int8 activation 39 | float *A_scales; 40 | int8_t A_zero_point; 41 | }; 42 | 43 | struct thread_args { 44 | const struct matrix *A; 45 | const struct matrix *B; 46 | const struct matrix *C; 47 | const struct matmul_params *params; 48 | int start_i, end_i, blk_size; 49 | }; 50 | 51 | #define MAX(A, B) ((A) > (B) ? (A) : (B)) 52 | #define MIN(A, B) ((A) < (B) ? 
(A) : (B)) 53 | namespace matmul { 54 | class MatmulOperator { 55 | public: 56 | void mat_mul_transposed(const struct matmul_params *params); 57 | void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params); 58 | // int8 59 | void naive_mat_mul_int8(const struct matmul_params *params); 60 | void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params); 61 | void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params); 62 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params); 63 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params); 64 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params); 65 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params); 66 | void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params); 67 | void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params *params); 68 | // int4 69 | void mat_mul_accelerator_int4_fast(const struct matmul_params *params); 70 | void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params); 71 | void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params); 72 | void naive_mat_mul_int4(const struct matmul_params *params); 73 | void naive_mat_mul_int4_with_offset(const struct matmul_params *params); 74 | // w8a4 code template functions 75 | void mat_mul_reference(struct matmul_params *params); 76 | void mat_mul_loop_unrolling(struct matmul_params *params); 77 | void mat_mul_multithreading(struct matmul_params *params); 78 | void mat_mul_multithreading_loop_unrolling(struct matmul_params *params); 79 | void mat_mul_simd_programming(struct matmul_params *params); 80 | void mat_mul_all_techniques(struct matmul_params *params); 81 | // cuda 82 | void mat_mul_cuda(const struct matmul_params *params); 83 | 84 | private: 85 | float interval_to_us(struct timeval *start, struct timeval *end); 86 | void CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C); 87 | void CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C); 88 | }; 89 | } // namespace matmul 90 | -------------------------------------------------------------------------------- /kernels/avx/matmul_avx_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include // AVX intrinsic 3 | #include 4 | #include 5 | #include 6 | #include // intel SSE intrinsic 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | 12 | inline void simd_mul_fp_128(const float *a, const float *b, float *c) { 13 | __m128 val = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)); 14 | __m128 acc = _mm_add_ps(_mm_load_ps(c), val); 15 | _mm_store_ps(c, acc); 16 | } 17 | 18 | void *mat_mul_transposed_fastover_column_func(void *args) { 19 | int i, j, k; 20 | struct thread_args *mat_args = (struct thread_args *)args; 21 | const struct matrix *A = mat_args->A; 22 | const struct matrix *B = mat_args->B; 23 | const struct matrix *C = mat_args->C; 24 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 25 | int start_i = mat_args->start_i, end_i = mat_args->end_i; 26 | 27 | __m256 zero256 = _mm256_setzero_ps(); 28 | for (i = 0; i < C->row; i++) { 29 | for (j = start_i; j + 1 < end_i; j += 2) { 30 | __m256 acc 
= zero256, acc1 = zero256; 31 | __m256 *A256 = (__m256 *)&data_A[i * A->column]; 32 | __m256 *B256 = (__m256 *)&data_B[j * B->row]; 33 | __m256 *B256_1 = (__m256 *)&data_B[(j + 1) * B->row]; 34 | for (k = 0; k < A->column; k += 8) { 35 | __m256 Aik = _mm256_load_ps((const float *)A256++); 36 | __m256 Bjk = _mm256_load_ps((const float *)B256++); 37 | __m256 Bj1k = _mm256_load_ps((const float *)B256_1++); 38 | acc = _mm256_add_ps(acc, _mm256_mul_ps(Aik, Bjk)); 39 | acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(Aik, Bj1k)); 40 | } 41 | float *ptr = (float *)&acc; 42 | data_C[i * C->column + j] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 43 | ptr = (float *)&acc1; 44 | data_C[i * C->column + j + 1] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 45 | } 46 | // leftover 47 | if (j < end_i) { 48 | __m256 acc = zero256; 49 | for (k = 0; k < A->column; k += 8) { 50 | __m256 Aik = _mm256_load_ps(&data_A[i * A->column + k]); 51 | __m256 Bjk = _mm256_load_ps(&data_B[j * B->row + k]); 52 | acc = _mm256_add_ps(acc, _mm256_mul_ps(Aik, Bjk)); 53 | } 54 | float *ptr = (float *)&acc; 55 | data_C[i * C->column + j] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 56 | j++; 57 | } 58 | } 59 | 60 | return NULL; 61 | } 62 | 63 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 64 | int i, j, k; 65 | 66 | int num_thread = params->opt_params.num_thread; 67 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 68 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 69 | 70 | assert(A->column % 8 == 0); 71 | 72 | if (num_thread > C->column) num_thread = C->column; 73 | 74 | pthread_t thread_pool[num_thread]; 75 | struct thread_args threads_args[num_thread]; 76 | 77 | // Thread creation 78 | for (j = 0; j < num_thread; j++) { 79 | threads_args[j].start_i = j * (C->column / num_thread); 80 | threads_args[j].end_i = (j + 1) * (C->column / num_thread); 81 | threads_args[j].blk_size = params->opt_params.blk_size; 82 | threads_args[j].A = A; 83 | threads_args[j].B = B; 84 | threads_args[j].C = C; 85 | pthread_create(&thread_pool[j], NULL, mat_mul_transposed_fastover_column_func, &threads_args[j]); 86 | } 87 | // Join threads 88 | for (j = 0; j < num_thread; j++) { 89 | pthread_join(thread_pool[j], NULL); 90 | } 91 | } 92 | 93 | } // namespace matmul 94 | -------------------------------------------------------------------------------- /transformer/include/Generate.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "Fp32llamaForCausalLM.h" 12 | #include "Int4llamaForCausalLM.h" 13 | #include "OPTForCausalLM.h" 14 | #include "OPTTokenizer.h" 15 | #include "operators.h" 16 | #include "utils.h" 17 | 18 | inline std::mt19937 OPT_rng; 19 | 20 | typedef struct OPT_token_data { 21 | int id; // token id 22 | float logit; // log-odds of the token 23 | float p; // probability of the token 24 | } OPT_token_data; 25 | 26 | typedef struct OPT_token_data_array { 27 | OPT_token_data* data; 28 | size_t size; 29 | bool sorted; 30 | } OPT_token_data_array; 31 | 32 | struct opt_params { 33 | int32_t seed = -1; // RNG seed 34 | int32_t n_threads = 1; // TODO: fix this 35 | int32_t n_predict = 128; // new tokens to predict 36 | int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) 37 | 
int32_t n_ctx = 512; // context size 38 | int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) 39 | int32_t n_keep = 0; // number of tokens to keep from initial prompt 40 | int32_t n_vocab = 50272; // vocabulary size 41 | 42 | // sampling parameters 43 | std::unordered_map logit_bias; // logit bias for specific tokens 44 | int32_t top_k = 40; // <= 0 to use vocab size 45 | float top_p = 0.95f; // 1.0 = disabled 46 | float tfs_z = 1.00f; // 1.0 = disabled 47 | float typical_p = 1.00f; // 1.0 = disabled 48 | float temp = 0.80f; // 1.0 = disabled 49 | float repeat_penalty = 1.10f; // 1.0 = disabled 50 | int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) 51 | float frequency_penalty = 0.00f; // 0.0 = disabled 52 | float presence_penalty = 0.00f; // 0.0 = disabled 53 | int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 54 | float mirostat_tau = 5.00f; // target entropy 55 | float mirostat_eta = 0.10f; // learning rate 56 | }; 57 | 58 | void sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size, 59 | float penalty); 60 | 61 | void sample_frequency_and_presence_penalties(OPT_token_data_array* candidates, const int* last_tokens_p, 62 | size_t last_tokens_size, float alpha_frequency, float alpha_presence); 63 | 64 | int sample_token_greedy(OPT_token_data_array* candidates); 65 | 66 | void sample_temperature(OPT_token_data_array* candidates_p, float temp); 67 | 68 | void sample_softmax(OPT_token_data_array* candidates); 69 | 70 | int sample_token(OPT_token_data_array* candidates); 71 | 72 | void sample_top_k(OPT_token_data_array* candidates, int k, size_t min_keep); 73 | 74 | int sample_token_mirostat(const int n_vocab, OPT_token_data_array* candidates, float tau, float eta, int m, float* mu); 75 | 76 | int sample_token_mirostat_v2(OPT_token_data_array* candidates, float tau, float eta, float* mu); 77 | 78 | void sample_tail_free(OPT_token_data_array* candidates, float z, size_t min_keep); 79 | 80 | void sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep); 81 | 82 | void sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep); 83 | 84 | std::vector OPTGenerate(OPTForCausalLM model, std::vector input_ids, 85 | const struct opt_params generation_config, Encoder* encoder = NULL, 86 | bool interactive = false); 87 | 88 | enum { OPT, LLaMA_FP32, LLaMA_INT4 }; 89 | std::vector LLaMAGenerate(void* model, int model_type, std::string text, const struct opt_params generation_config, 90 | std::string voc_path, bool interactive); 91 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_F32T.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | #include "utils.h" 3 | 4 | void load_BMM_F32T(BMM_F32T &op, std::string prefix) { read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); } 5 | 6 | BMM_F32T::BMM_F32T(float _alpha) { this->alpha = _alpha; } 7 | 8 | void BMM_F32T::forward(const Matrix3D &a, const Matrix3D &weight, Matrix3D &c) { 9 | const Matrix3D b = weight; 10 | const int m = a.m_dim_y, n = b.m_dim_y, k = a.m_dim_z, b_size = b.m_dim_x; 11 | const long long ops = (long long)b_size * 2 * (long long)m * (long long)n * (long long)k; 12 | PROFILE_START_FLOPS(profile_name, ops); 13 | 14 | // a: m x k b: n x k c: m x n 15 | assert(a.m_dim_x == b.m_dim_x); // batch dim 16 | assert(a.m_dim_z == b.m_dim_z); 
// k 17 | assert(a.m_dim_y == c.m_dim_y); // m 18 | assert(b.m_dim_y == c.m_dim_z); // n 19 | 20 | struct matmul_params params; 21 | params.A.row = a.m_dim_y; 22 | params.A.column = a.m_dim_z; 23 | params.A.data_ptr = a.m_data; 24 | params.B.row = b.m_dim_y; 25 | params.B.column = b.m_dim_z; 26 | params.B.data_ptr = b.m_data; 27 | params.C.row = c.m_dim_y; 28 | params.C.column = c.m_dim_z; 29 | params.C.data_ptr = c.m_data; 30 | params.opt_params.blk_size = BLK_SIZE; 31 | params.opt_params.num_thread = NUM_THREAD; 32 | params.alpha = alpha; 33 | 34 | matmul::MatmulOperator op = matmul::MatmulOperator(); 35 | 36 | for (int bz = 0; bz < a.m_dim_x; bz++) { 37 | // if (params.A.column % 8 == 0) // TODO: debug this 38 | // op.mat_mul_transposed_fastover_column((const struct matmul_params 39 | // *)¶ms); 40 | // else 41 | op.mat_mul_transposed(¶ms); // TODO: optimize this 42 | // TODO: apply SIMD here 43 | for (int i = 0; i < m * n; i++) { 44 | params.C.data_ptr[i] *= this->alpha; 45 | } 46 | params.A.data_ptr += m * k; 47 | params.B.data_ptr += k * n; 48 | params.C.data_ptr += m * n; 49 | } 50 | 51 | PROFILE_END(profile_name); 52 | } 53 | 54 | void BMM_F32T::forward_weight_untransposed(const Matrix3D &a, const Matrix3D &weight, 55 | Matrix3D &c) { 56 | const Matrix3D b = weight; 57 | const int m = a.m_dim_y, n = c.m_dim_z, k = a.m_dim_z, b_size = b.m_dim_x; 58 | const long long ops = (long long)b_size * 2 * (long long)m * (long long)n * (long long)k; 59 | PROFILE_START_FLOPS(profile_name, ops); 60 | 61 | // a: m x k b: n x k c: m x n 62 | assert(a.m_dim_x == b.m_dim_x); // batch dim 63 | assert(a.m_dim_z == b.m_dim_y); // k 64 | assert(a.m_dim_y == c.m_dim_y); // m 65 | assert(b.m_dim_z == c.m_dim_z); // n 66 | 67 | struct matmul_params params; 68 | params.A.row = a.m_dim_y; 69 | params.A.column = a.m_dim_z; 70 | params.A.data_ptr = a.m_data; 71 | params.B.row = b.m_dim_y; 72 | params.B.column = b.m_dim_z; 73 | params.B.data_ptr = b.m_data; 74 | params.C.row = c.m_dim_y; 75 | params.C.column = c.m_dim_z; 76 | params.C.data_ptr = c.m_data; 77 | params.opt_params.blk_size = BLK_SIZE; 78 | params.opt_params.num_thread = NUM_THREAD; 79 | params.alpha = alpha; 80 | 81 | matmul::MatmulOperator op = matmul::MatmulOperator(); 82 | 83 | for (int i = 0; i < m * n * a.m_dim_x; i++) { 84 | params.C.data_ptr[i] = 0; 85 | } 86 | 87 | for (int bz = 0; bz < a.m_dim_x; bz++) { 88 | float *data_A = params.A.data_ptr + bz * m * k, *data_B = params.B.data_ptr + bz * k * n, 89 | *data_C = params.C.data_ptr + bz * m * n; 90 | for (int i = 0; i < m; i++) 91 | for (int kk = 0; kk < k; kk++) { 92 | float Aikk0 = data_A[i * k + kk]; 93 | for (int j = 0; j < n; j++) { 94 | float Bjk0 = data_B[kk * n + j]; 95 | data_C[i * n + j] += Aikk0 * Bjk0; 96 | } 97 | } 98 | } 99 | 100 | PROFILE_END(profile_name); 101 | } 102 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Fp32llamaDecoder.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | #include "utils_memalloc.h" 7 | 8 | void test_Decoder() { 9 | const struct model_config llama7B = llama_7B; 10 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 11 | head_dim = embed_dim / num_heads, num_layers = llama7B.num_layers; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | 
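// Annotation (not in the original source): prefill ("reasoning") pass loads a 9-token prompt;
// the 2nd ("generating") pass below feeds a single new token together with the cached
// keys/values returned by this first forward call.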
input_ids.load("assets/llama/tests/decoder/1st_input_ids.bin"); 16 | struct Fp32llamaDecoder_input input_1st = {input_ids}; 17 | 18 | Fp32llamaDecoder decoder = Fp32llamaDecoder("models/LLaMA_7B/decoder/", llama7B); 19 | 20 | struct Fp32llamaDecoder_output output_1st = decoder.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | last_hidden_state1_GT.load("assets/llama/tests/decoder/1st_last_hidden_state.bin"); 25 | 26 | // print_first_k_elelment("output_1st.last_hidden_state", output_1st.last_hidden_state.m_data, 20); 27 | // print_first_k_elelment("last_hidden_state1_GT", last_hidden_state1_GT.m_data, 20); 28 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 29 | last_hidden_state1_GT.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | for (int i = 0; i < num_layers; i++) { 34 | std::string path = "assets/llama/tests/decoder/1st/past_key_value/key" + std::to_string(i) + ".bin"; 35 | temp_key_value.load(path.c_str()); 36 | success &= 37 | check_two_equal(output_1st.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 38 | 39 | path = "assets/llama/tests/decoder/1st/past_key_value/value" + std::to_string(i) + ".bin"; 40 | temp_key_value.load(path.c_str()); 41 | success &= 42 | check_two_equal(output_1st.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 43 | } 44 | 45 | // generating phase: 2nd run 46 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 47 | input_ids_2nd.load("assets/llama/tests/decoder/2nd/input_ids.bin"); 48 | struct Fp32llamaDecoder_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 49 | 50 | struct Fp32llamaDecoder_output output_2nd = decoder.forward(input_2nd); 51 | 52 | Matrix3D last_hidden_state2_GT(mem_buf.get_fpbuffer(b * 1 * embed_dim), b, 1, embed_dim); 53 | last_hidden_state2_GT.load("assets/llama/tests/decoder/2nd/last_hidden_state.bin"); 54 | success &= check_two_equal(output_2nd.last_hidden_state.m_data, last_hidden_state2_GT.m_data, 55 | last_hidden_state2_GT.length(), 1e-8); 56 | 57 | temp_key_value = Matrix3D(mem_buf.get_fpbuffer(b * (sqlen + 1) * embed_dim), num_heads, (sqlen + 1), 58 | embed_dim / num_heads); 59 | for (int i = 0; i < num_layers; i++) { 60 | std::string path = "assets/llama/tests/decoder/2nd/past_key_value/key" + std::to_string(i) + ".bin"; 61 | temp_key_value.load(path.c_str()); 62 | success &= 63 | check_two_equal(output_2nd.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 64 | 65 | path = "assets/llama/tests/decoder/2nd/past_key_value/value" + std::to_string(i) + ".bin"; 66 | temp_key_value.load(path.c_str()); 67 | success &= 68 | check_two_equal(output_2nd.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 69 | } 70 | 71 | if (!success) 72 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 73 | else 74 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 75 | } 76 | 77 | int main() { test_Decoder(); } 78 | -------------------------------------------------------------------------------- /kernels/cuda/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | 15 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 16 | { 17 | uint4 result; 18 | 19 | uint32_t* h = reinterpret_cast(&result); 20 | uint32_t const i4s = reinterpret_cast(source); 21 | 22 | // First, we extract the i4s and construct an intermediate fp16 number. 23 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 24 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 25 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 26 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 27 | 28 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 29 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 30 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 31 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 32 | 33 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 34 | // immediately before required. 35 | const uint32_t top_i4s = i4s >> 8; 36 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 37 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 38 | : "=r"(h[0]) 39 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 40 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 41 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 42 | : "=r"(h[1]) 43 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 44 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 45 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 46 | : "=r"(h[2]) 47 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 48 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 49 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 50 | : "=r"(h[3]) 51 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 52 | 53 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 54 | // half2 ctor. In this case, I chose performance reliability over code readability. 55 | 56 | // This is the half2 {1032, 1032} represented as an integer. 57 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 58 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 59 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 60 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 61 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 62 | // This is the half2 {-72, -72} represented as an integer. 63 | // static constexpr uint32_t NEG_72 = 0xd480d480; 64 | // Haotian: Let's use {-64, -64}. 
65 | static constexpr uint32_t NEG_64 = 0xd400d400; 66 | 67 | // Finally, we construct the output numbers. 68 | // Convert elt_01 69 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 70 | // Convert elt_23 71 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 72 | // Convert elt_45 73 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 74 | // Convert elt_67 75 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 76 | 77 | return result; 78 | } 79 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Int4llamaDecoder.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | #include "utils_memalloc.h" 7 | 8 | void test_Decoder() { 9 | const struct model_config llama7B = llama_7B; 10 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 11 | head_dim = embed_dim / num_heads, num_layers = llama7B.num_layers; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | input_ids.load("assets/llama/tests/decoder/1st_input_ids.bin"); 16 | struct Int4llamaDecoder_input input_1st = {input_ids}; 17 | 18 | Int4llamaDecoder decoder = Int4llamaDecoder("models/LLaMA_7B/decoder/", llama7B); 19 | 20 | struct Int4llamaDecoder_output output_1st = decoder.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | last_hidden_state1_GT.load("assets/llama/tests/decoder/1st_last_hidden_state.bin"); 25 | 26 | // print_first_k_elelment("output_1st.last_hidden_state", output_1st.last_hidden_state.m_data, 20); 27 | // print_first_k_elelment("last_hidden_state1_GT", last_hidden_state1_GT.m_data, 20); 28 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 29 | last_hidden_state1_GT.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | for (int i = 0; i < num_layers; i++) { 34 | std::string path = "assets/llama/tests/decoder/1st/past_key_value/key" + std::to_string(i) + ".bin"; 35 | temp_key_value.load(path.c_str()); 36 | success &= 37 | check_two_equal(output_1st.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 38 | 39 | path = "assets/llama/tests/decoder/1st/past_key_value/value" + std::to_string(i) + ".bin"; 40 | temp_key_value.load(path.c_str()); 41 | success &= 42 | check_two_equal(output_1st.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 43 | } 44 | 45 | // generating phase: 2nd run 46 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 47 | input_ids_2nd.load("assets/llama/tests/decoder/2nd/input_ids.bin"); 48 | struct Int4llamaDecoder_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 49 | 50 | struct Int4llamaDecoder_output output_2nd = decoder.forward(input_2nd); 51 | 52 | Matrix3D last_hidden_state2_GT(mem_buf.get_fpbuffer(b * 1 * embed_dim), b, 1, embed_dim); 53 | last_hidden_state2_GT.load("assets/llama/tests/decoder/2nd/last_hidden_state.bin"); 54 | success &= 
check_two_equal(output_2nd.last_hidden_state.m_data, last_hidden_state2_GT.m_data, 55 | last_hidden_state2_GT.length(), 1e-8); 56 | 57 | temp_key_value = Matrix3D(mem_buf.get_fpbuffer(b * (sqlen + 1) * embed_dim), num_heads, (sqlen + 1), 58 | embed_dim / num_heads); 59 | for (int i = 0; i < num_layers; i++) { 60 | std::string path = "assets/llama/tests/decoder/2nd/past_key_value/key" + std::to_string(i) + ".bin"; 61 | temp_key_value.load(path.c_str()); 62 | success &= 63 | check_two_equal(output_2nd.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 64 | 65 | path = "assets/llama/tests/decoder/2nd/past_key_value/value" + std::to_string(i) + ".bin"; 66 | temp_key_value.load(path.c_str()); 67 | success &= 68 | check_two_equal(output_2nd.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 69 | } 70 | 71 | if (!success) 72 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 73 | else 74 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 75 | } 76 | 77 | int main() { 78 | // This tests are directly from fp32 and are not completed yet! 79 | // test_Decoder(); 80 | } 81 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int8OPTDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Int8OPTDecoderLayer.h" 2 | 3 | #include "utils.h" 4 | 5 | // Shared memory space across all layers 6 | static float *hidden_states_float_arr; 7 | static int8_t *final_layer_norm_arr; 8 | static int8_t *fc_1_arr; 9 | static float *fc_2_arr; 10 | static float *temp; 11 | static int8_t *hidden_states_int8_arr; 12 | 13 | template 14 | void add(Matrix3D a, Matrix3D b, Matrix3D c) { 15 | PROFILE_START("Int8OPTDecoderLayer::add"); 16 | assert(c.length() == a.length() && a.length() == b.length()); 17 | 18 | for (int i = 0; i < a.length(); i++) { 19 | c.m_data[i] = a.m_data[i] + b.m_data[i]; 20 | } 21 | PROFILE_END("Int8OPTDecoderLayer::add"); 22 | } 23 | 24 | struct Int8OPTDecoderLayer_output Int8OPTDecoderLayer::forward(const struct Int8OPTDecoderLayer_input &input) { 25 | PROFILE_START(profile_name); 26 | // Layernorm 27 | Matrix3D hidden_states_int8(hidden_states_int8_arr, input.hidden_states.m_dim_x, 28 | input.hidden_states.m_dim_y, input.hidden_states.m_dim_z); 29 | this->self_attn_layer_norm.forward(input.hidden_states, hidden_states_int8); 30 | 31 | // Attention 32 | struct Int8OPTAttention_input attn_param(hidden_states_int8, input.attention_mask, input.past_key, input.past_value, 33 | input.has_past_key_value, this->layer_idx); 34 | struct Int8OPTAttention_output attn_output = this->attn.forward(attn_param); 35 | 36 | // Residual add 37 | Matrix3D residual_add(hidden_states_float_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 38 | input.hidden_states.m_dim_z); 39 | add(input.hidden_states, attn_output.attn_output, residual_add); 40 | 41 | // Layernorm 42 | Matrix3D final_layer_norm(final_layer_norm_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 43 | input.hidden_states.m_dim_z); 44 | this->final_layer_norm.forward(residual_add, final_layer_norm); 45 | 46 | // FC 47 | Matrix3D fc1_out(fc_1_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, this->hidden_dim); 48 | this->fc1.forward(final_layer_norm, fc1_out); 49 | Matrix3D fc2_out(fc_2_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 50 | input.hidden_states.m_dim_z); 51 | 
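// Annotation (not in the original source): fc1 is a W8A8B8O8LinearReLU (int8 in/out with fused
// ReLU); fc2 is a W8A8BFP32OFP32Linear, which consumes the int8 activations and produces fp32
// output so it can be added back onto the fp32 residual below.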
this->fc2.forward(fc1_out, fc2_out); 52 | 53 | // Reidual add 54 | add(residual_add, fc2_out, residual_add); 55 | 56 | struct Int8OPTDecoderLayer_output output(residual_add, attn_output.attn_probs_reshaped, attn_output.past_key_value); 57 | PROFILE_END(profile_name); 58 | return output; 59 | } 60 | 61 | Int8OPTDecoderLayer::Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx, 62 | LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, 63 | W8A8B8O8LinearReLU fc1, W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, 64 | BMM_S8T_S8N_S8T pv_bmm, W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, 65 | W8A8B8O8Linear q_proj, W8A8BFP32OFP32Linear out_proj) { 66 | if (layer_idx == 0) { 67 | allocate_aligned_memory(hidden_states_float_arr, config.max_sqlen * config.embed_dim * sizeof(float)); 68 | allocate_aligned_memory(final_layer_norm_arr, config.max_sqlen * config.embed_dim * sizeof(int8_t)); 69 | allocate_aligned_memory(fc_1_arr, config.max_sqlen * config.hidden_dim * sizeof(int8_t)); 70 | allocate_aligned_memory(fc_2_arr, config.max_sqlen * config.embed_dim * sizeof(float)); 71 | allocate_aligned_memory(hidden_states_int8_arr, config.max_sqlen * config.embed_dim * sizeof(int8_t)); 72 | Int8OPTAttention::initialized_memory(config); 73 | } 74 | 75 | load_LayerNormQ(self_attn_layer_norm, param_path + "/self_attn_layer_norm"); 76 | load_W8A8B8O8LinearReLU_params(fc1, param_path + "/fc1"); 77 | load_W8A8BFP32OFP32Linear_params(fc2, param_path + "/fc2"); 78 | load_LayerNormQ(final_layer_norm, param_path + "/final_layer_norm"); 79 | 80 | this->embed_dim = config.embed_dim; 81 | this->num_attention_heads = config.num_heads; 82 | this->hidden_dim = config.hidden_dim; 83 | this->layer_idx = layer_idx; 84 | this->self_attn_layer_norm = self_attn_layer_norm; 85 | this->fc1 = fc1; 86 | this->fc2 = fc2; 87 | this->final_layer_norm = final_layer_norm; 88 | 89 | this->attn = Int8OPTAttention(param_path + "/self_attn", config, qk_bmm, pv_bmm, k_proj, v_proj, q_proj, out_proj); 90 | } 91 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Fp32llamaDecoderLayer() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | hidden_states.load("assets/llama/tests/layer0/sqlen9/hidden_states.bin"); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/llama/tests/layer0/sqlen9/attention_mask.bin"); 19 | 20 | struct Fp32llamaDecoderLayer_input input(hidden_states, attention_mask); 21 | 22 | struct Fp32llamaDecoderLayer_output output = layer.forward(input); 23 | 24 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | outputGT.load("assets/llama/tests/layer0/sqlen9/output_hidden_states.bin"); 26 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, 
embed_dim / num_heads); 27 | key_statesGT.load("assets/llama/tests/layer0/sqlen9/present_key.bin"); 28 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 29 | value_statesGT.load("assets/llama/tests/layer0/sqlen9/present_value.bin"); 30 | 31 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 32 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 33 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 34 | if (!success) 35 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 36 | else 37 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 38 | } 39 | 40 | void test_Fp32llamaDecoderLayer_gen() { 41 | const struct model_config llama7B = llama_7B; 42 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 43 | head_dim = embed_dim / num_heads; 44 | const int tgz = (sqlen + past_sqlen); 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 49 | 50 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/layer0/sqlen1/hidden_states.bin"); 52 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * tgz), 1, sqlen, tgz); 53 | attention_mask.load("assets/llama/tests/layer0/sqlen1/attention_mask.bin"); 54 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/layer0/sqlen1/past_key.bin"); 56 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/layer0/sqlen1/past_value.bin"); 58 | 59 | struct Fp32llamaDecoderLayer_input input(hidden_states, attention_mask, past_key, past_value); 60 | 61 | struct Fp32llamaDecoderLayer_output output = layer.forward(input); 62 | 63 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 64 | outputGT.load("assets/llama/tests/layer0/sqlen1/output_hidden_states.bin"); 65 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, sqlen, embed_dim / num_heads); 66 | key_statesGT.load("assets/llama/tests/layer0/sqlen1/present_key.bin"); 67 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, tgz, embed_dim / num_heads); 68 | value_statesGT.load("assets/llama/tests/layer0/sqlen1/present_value.bin"); 69 | 70 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 71 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 72 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 73 | if (!success) 74 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 75 | else 76 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 77 | } 78 | 79 | int main() { 80 | test_Fp32llamaDecoderLayer(); 81 | test_Fp32llamaDecoderLayer_gen(); 82 | } 83 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Int4llamaDecoderLayer() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | hidden_states.load("assets/llama/tests/layer0/sqlen9/hidden_states.bin"); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/llama/tests/layer0/sqlen9/attention_mask.bin"); 19 | 20 | struct Int4llamaDecoderLayer_input input(hidden_states, attention_mask); 21 | 22 | struct Int4llamaDecoderLayer_output output = layer.forward(input); 23 | 24 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | outputGT.load("assets/llama/tests/layer0/sqlen9/output_hidden_states.bin"); 26 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 27 | key_statesGT.load("assets/llama/tests/layer0/sqlen9/present_key.bin"); 28 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 29 | value_statesGT.load("assets/llama/tests/layer0/sqlen9/present_value.bin"); 30 | 31 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 32 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 33 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 34 | if (!success) 35 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 36 | else 37 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 38 | } 39 | 40 | void test_Int4llamaDecoderLayer_gen() { 41 | const struct model_config llama7B = llama_7B; 42 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 43 | head_dim = embed_dim / num_heads; 44 | const int tgz = (sqlen + past_sqlen); 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 49 | 50 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/layer0/sqlen1/hidden_states.bin"); 52 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * tgz), 1, sqlen, tgz); 53 | attention_mask.load("assets/llama/tests/layer0/sqlen1/attention_mask.bin"); 54 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/layer0/sqlen1/past_key.bin"); 56 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/layer0/sqlen1/past_value.bin"); 58 | 59 | struct Int4llamaDecoderLayer_input input(hidden_states, attention_mask, past_key, past_value); 60 | 61 | struct Int4llamaDecoderLayer_output output = layer.forward(input); 62 | 63 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 64 | outputGT.load("assets/llama/tests/layer0/sqlen1/output_hidden_states.bin"); 65 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, sqlen, embed_dim / num_heads); 66 | key_statesGT.load("assets/llama/tests/layer0/sqlen1/present_key.bin"); 67 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, tgz, embed_dim / num_heads); 68 | value_statesGT.load("assets/llama/tests/layer0/sqlen1/present_value.bin"); 69 | 70 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 71 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 72 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 73 | if (!success) 74 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 75 | else 76 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 77 | } 78 | 79 | int main() { 80 | // This tests are directly from fp32 and are not completed yet! 
81 | test_Int4llamaDecoderLayer(); 82 | test_Int4llamaDecoderLayer_gen(); 83 | } 84 | -------------------------------------------------------------------------------- /kernels/starter_code/reference.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #include "common.h" 10 | 11 | namespace matmul { 12 | void MatmulOperator::mat_mul_reference(struct matmul_params *params) { 13 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 14 | const int block_size = params->block_size; // block_size = 32 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | quantize_fp32_to_int8(A->data_ptr, A->int8_data_ptr, params->A_scales, A->row * A->column, block_size); 18 | 19 | int m = C->row, n = C->column, k = A->column; 20 | // A: m x k; B: n x k; C: m x n 21 | for (int row = 0; row < m; row++) { 22 | for (int col = 0; col < n; col++) { 23 | float acc = 0; 24 | // Compute each block 25 | for (int ch = 0; ch < k;) { 26 | // pointer of the int4 weights 27 | uint8_t *w_int4 = &B->int4_data_ptr[(col * k + ch) / 2]; 28 | // pointer of the int8 activation 29 | const signed char *a_int8 = &A->int8_data_ptr[row * k + ch]; 30 | // scale of weight 31 | float s_w = params->scales[(col * k + ch) / block_size]; 32 | // scale of activation 33 | float s_a = params->A_scales[(row * k + ch) / block_size]; 34 | #ifdef QM_ARM 35 | // order of weights with QM_ARM: 36 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w30,w31) 37 | // QM_ARM order: (w0,w16),(w1,w17),(w2,w18),(w3,w19),(w4, w20),... (w15,w31) 38 | // |--| 39 | // 4 bits 40 | // |------| 41 | // 8 bits (byte) 42 | // low|----------------------------------------------------------|high 43 | // 0 128 bit 127 44 | // process 16 bytes of weigths (128 bit) = 1 block 45 | // intermediate variable to store sum of integer multiplication and accumulation 46 | int intermediate_sum = 0; 47 | for (int qj = 0; qj < 16; qj++) { 48 | // decode a packed byte into two int8 in the range of (-8, 7) 49 | uint8_t packed_int4_0 = w_int4[qj]; 50 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 51 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 52 | // int8 multiply and accumulate operation 53 | intermediate_sum += a_int8[qj] * w_de_0; 54 | intermediate_sum += a_int8[qj + 16] * w_de_16; 55 | } 56 | // dequantize the sum into floating point 57 | acc += (float)intermediate_sum * s_a * s_w; 58 | ch += block_size; 59 | #endif 60 | #ifdef QM_x86 61 | // scales of the second block 62 | float s_w_2nd = params->scales[(col * k + ch) / block_size + 1]; 63 | float s_a_2nd = params->A_scales[(row * k + ch) / block_size + 1]; 64 | // order of weights with QM_x86: 65 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w62,w63) 66 | // QM_ARM order: (w0,w32),(w1,w33),(w2,w34),(w3,w35),(w4, w36),... 
(w31,w63) 67 | // |--| 68 | // 4 bits 69 | // |------| 70 | // 8 bits (byte) 71 | // low|----------------------------------------------------------|high 72 | // 0 256 bit 73 | // process 32 bytes of weigths (256 bit) = 2 blocks 74 | // intermediate variable to store sum of integer multiplication and accumulation 75 | int intermediate_sum = 0, intermediate_sum_2nd = 0; 76 | for (int qj = 0; qj < 32; qj++) { 77 | // decode a packed byte into two int8 in the range of (-8, 7) 78 | uint8_t packed_int4_0 = w_int4[qj]; 79 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 80 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 81 | // int8 multiply and accumulate operation 82 | intermediate_sum += a_int8[qj] * w_de_0; 83 | intermediate_sum_2nd += a_int8[qj + 32] * w_de_16; 84 | } 85 | // dequantize the sum into floating point 86 | acc += (float)intermediate_sum * s_a * s_w; 87 | acc += (float)intermediate_sum_2nd * s_a_2nd * s_w_2nd; 88 | ch += block_size * 2; 89 | #endif 90 | } 91 | C->data_ptr[row * n + col] = acc; 92 | } 93 | } 94 | }; 95 | } // namespace matmul 96 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Fp32llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoder.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "utils.h" 7 | 8 | Matrix3D Fp32llamaDecoder::prepare_decoder_attention_mask(int length, int past_length) { 9 | PROFILE_START("Fp32llamaDecoder::prepare_decoder_attention_mask"); 10 | assert(length - past_length > 0); 11 | Matrix3D causal_attention_mask(attention_mask_buf, 1, length - past_length, length); 12 | float min = std::numeric_limits::lowest(); 13 | for (int i = 0; i < length - past_length; i++) { 14 | for (int j = 0; j < length; j++) { 15 | if (i + past_length < j) { 16 | causal_attention_mask(0, i, j) = min; 17 | } else { 18 | causal_attention_mask(0, i, j) = 0.0; 19 | } 20 | } 21 | } 22 | 23 | PROFILE_END("Fp32llamaDecoder::prepare_decoder_attention_mask"); 24 | return causal_attention_mask; 25 | } 26 | 27 | Fp32llamaDecoder::Fp32llamaDecoder(std::string param_path, const struct model_config config) { 28 | allocate_aligned_memory(attention_mask_buf, config.max_sqlen * config.max_sqlen * sizeof(float)); 29 | allocate_aligned_memory(pos_embeds_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 30 | allocate_aligned_memory(last_hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 31 | allocate_aligned_memory(hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 32 | 33 | this->voc_size = config.vocsize; 34 | this->embed_dim = config.embed_dim; 35 | this->hidden_dim = config.hidden_dim; 36 | this->num_heads = config.num_heads; 37 | this->padding_idx = config.padding_idx; 38 | 39 | int max_sqlen = config.max_sqlen; 40 | 41 | // Embedding 42 | Matrix3D embweight(new float[voc_size * embed_dim], 1, voc_size, embed_dim); 43 | this->embed_tokens = Embedding(embed_dim, voc_size, padding_idx, embweight); 44 | load_Embedding_params(this->embed_tokens, param_path + "/embed_tokens"); 45 | 46 | // Norm 47 | Matrix3D norm_weight(new float[embed_dim], 1, 1, embed_dim); 48 | norm_weight.load((param_path + "/norm/weight.bin").c_str()); 49 | this->norm = LlamaRMSNorm(norm_weight); 50 | 51 | // Load all the decoder layers 52 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 53 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." 
<< std::endl;) 54 | 55 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 56 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer(path, config, layer_idx); 57 | 58 | this->layers.push_back(layer); 59 | } 60 | }; 61 | 62 | // Fp32llamaDecoder: 63 | struct Fp32llamaDecoder_output Fp32llamaDecoder::forward(const struct Fp32llamaDecoder_input &input) { 64 | PROFILE_START(profile_name); 65 | int sqlen = input.input_ids.m_dim_z, batch_size = input.input_ids.m_dim_x, past_key_values_length = 0; 66 | 67 | // Input token -> Embedding 68 | float inputs_embeds_buf[sqlen * this->embed_dim]; 69 | Matrix3D inputs_embeds(inputs_embeds_buf, 1, sqlen, this->embed_dim); 70 | this->embed_tokens.forward(input.input_ids, inputs_embeds); 71 | 72 | if (input.has_past_keys_values) { 73 | past_key_values_length = input.past_keys[0].m_dim_y; 74 | } 75 | 76 | // Attention mask 77 | Matrix3D causal_attention_mask = 78 | this->prepare_decoder_attention_mask(sqlen + past_key_values_length, past_key_values_length); 79 | 80 | // Go through each layer 81 | Matrix3D hidden_states = inputs_embeds; 82 | std::vector> past_keys, past_values; 83 | for (int i = 0; i < this->layers.size(); i++) { 84 | if (!input.has_past_keys_values) { 85 | struct Fp32llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask}; 86 | struct Fp32llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 87 | hidden_states = l_o.hidden_states; 88 | past_keys.push_back(l_o.past_key_value.first); 89 | past_values.push_back(l_o.past_key_value.second); 90 | } else { 91 | struct Fp32llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask, input.past_keys[i], 92 | input.past_values[i]}; 93 | struct Fp32llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 94 | hidden_states = l_o.hidden_states; 95 | past_keys.push_back(l_o.past_key_value.first); 96 | past_values.push_back(l_o.past_key_value.second); 97 | } 98 | } 99 | 100 | // Layernorm 101 | Matrix3D last_hidden_states(last_hidden_states_buf, 1, sqlen, this->embed_dim); 102 | this->norm.forward(hidden_states, last_hidden_states); 103 | 104 | struct Fp32llamaDecoder_output output = {last_hidden_states, past_keys, past_values}; 105 | PROFILE_END(profile_name); 106 | return output; 107 | } 108 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int4llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoder.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "utils.h" 7 | 8 | Matrix3D Int4llamaDecoder::prepare_decoder_attention_mask(int length, int past_length) { 9 | PROFILE_START("Int4llamaDecoder::prepare_decoder_attention_mask"); 10 | assert(length - past_length > 0); 11 | Matrix3D causal_attention_mask(attention_mask_buf, 1, length - past_length, length); 12 | float min = std::numeric_limits::lowest(); 13 | for (int i = 0; i < length - past_length; i++) { 14 | for (int j = 0; j < length; j++) { 15 | if (i + past_length < j) { 16 | causal_attention_mask(0, i, j) = min; 17 | } else { 18 | causal_attention_mask(0, i, j) = 0.0; 19 | } 20 | } 21 | } 22 | 23 | PROFILE_END("Int4llamaDecoder::prepare_decoder_attention_mask"); 24 | return causal_attention_mask; 25 | } 26 | 27 | Int4llamaDecoder::Int4llamaDecoder(std::string param_path, const struct model_config config) { 28 | allocate_aligned_memory(attention_mask_buf, config.max_sqlen * config.max_sqlen * sizeof(float)); 29 | allocate_aligned_memory(pos_embeds_buf, config.max_sqlen * 
config.embed_dim * sizeof(float)); 30 | allocate_aligned_memory(last_hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 31 | allocate_aligned_memory(hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 32 | 33 | this->voc_size = config.vocsize; 34 | this->embed_dim = config.embed_dim; 35 | this->hidden_dim = config.hidden_dim; 36 | this->num_heads = config.num_heads; 37 | this->padding_idx = config.padding_idx; 38 | 39 | int max_sqlen = config.max_sqlen; 40 | 41 | // Embedding 42 | Matrix3D embweight(new float[voc_size * embed_dim], 1, voc_size, embed_dim); 43 | this->embed_tokens = Embedding(embed_dim, voc_size, padding_idx, embweight); 44 | load_Embedding_params(this->embed_tokens, param_path + "/embed_tokens"); 45 | 46 | // Norm 47 | Matrix3D norm_weight(new float[embed_dim], 1, 1, embed_dim); 48 | norm_weight.load((param_path + "/norm/weight.bin").c_str()); 49 | this->norm = LlamaRMSNorm(norm_weight); 50 | 51 | // Load all the decoder layers 52 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 53 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." << std::endl;) 54 | 55 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 56 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer(path, config, layer_idx); 57 | 58 | this->layers.push_back(layer); 59 | } 60 | }; 61 | 62 | // Int4llamaDecoder: 63 | struct Int4llamaDecoder_output Int4llamaDecoder::forward(const struct Int4llamaDecoder_input &input) { 64 | PROFILE_START(profile_name); 65 | int sqlen = input.input_ids.m_dim_z, batch_size = input.input_ids.m_dim_x, past_key_values_length = 0; 66 | 67 | // Input token -> Embedding 68 | float inputs_embeds_buf[sqlen * this->embed_dim]; 69 | Matrix3D inputs_embeds(inputs_embeds_buf, 1, sqlen, this->embed_dim); 70 | this->embed_tokens.forward(input.input_ids, inputs_embeds); 71 | 72 | if (input.has_past_keys_values) { 73 | past_key_values_length = input.past_keys[0].m_dim_y; 74 | } 75 | 76 | // Attention mask 77 | Matrix3D causal_attention_mask = 78 | this->prepare_decoder_attention_mask(sqlen + past_key_values_length, past_key_values_length); 79 | 80 | // Go through each layer 81 | Matrix3D hidden_states = inputs_embeds; 82 | std::vector> past_keys, past_values; 83 | for (int i = 0; i < this->layers.size(); i++) { 84 | if (!input.has_past_keys_values) { 85 | struct Int4llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask}; 86 | struct Int4llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 87 | hidden_states = l_o.hidden_states; 88 | past_keys.push_back(l_o.past_key_value.first); 89 | past_values.push_back(l_o.past_key_value.second); 90 | } else { 91 | struct Int4llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask, input.past_keys[i], 92 | input.past_values[i]}; 93 | struct Int4llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 94 | hidden_states = l_o.hidden_states; 95 | past_keys.push_back(l_o.past_key_value.first); 96 | past_values.push_back(l_o.past_key_value.second); 97 | } 98 | } 99 | 100 | // Layernorm 101 | Matrix3D last_hidden_states(last_hidden_states_buf, 1, sqlen, this->embed_dim); 102 | this->norm.forward(hidden_states, last_hidden_states); 103 | 104 | struct Int4llamaDecoder_output output = {last_hidden_states, past_keys, past_values}; 105 | PROFILE_END(profile_name); 106 | return output; 107 | } 108 | -------------------------------------------------------------------------------- /kernels/quantizer.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifdef QM_ARM 4 | #include 5 | void quantize_fp32_to_int8(float* A, int8_t* qA, float* sA, int size, int block_size) { 6 | assert(size % block_size == 0); 7 | assert(block_size == 32); 8 | int num_block = size / 32; 9 | 10 | for (int i = 0; i < num_block; i++) { 11 | float32x4_t srcv[8]; 12 | float32x4_t asrcv[8]; 13 | float32x4_t amaxv[8]; 14 | 15 | int8_t* start_qA = &qA[i * 32]; 16 | 17 | for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(A + i * 32 + 4 * l); 18 | for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); 19 | 20 | for (int l = 0; l < 4; l++) amaxv[2 * l] = vmaxq_f32(asrcv[2 * l], asrcv[2 * l + 1]); 21 | for (int l = 0; l < 2; l++) amaxv[4 * l] = vmaxq_f32(amaxv[4 * l], amaxv[4 * l + 2]); 22 | for (int l = 0; l < 1; l++) amaxv[8 * l] = vmaxq_f32(amaxv[8 * l], amaxv[8 * l + 4]); 23 | 24 | const float amax = vmaxvq_f32(amaxv[0]); 25 | 26 | const float d = amax / ((1 << 7) - 1); 27 | const float id = d ? 1.0f / d : 0.0f; 28 | 29 | sA[i] = d; 30 | 31 | // low half 32 | for (int l = 0; l < 4; l++) { 33 | const float32x4_t v = vmulq_n_f32(srcv[l], id); 34 | const int32x4_t vi = vcvtnq_s32_f32(v); 35 | 36 | start_qA[4 * l + 0] = vgetq_lane_s32(vi, 0); 37 | start_qA[4 * l + 1] = vgetq_lane_s32(vi, 1); 38 | start_qA[4 * l + 2] = vgetq_lane_s32(vi, 2); 39 | start_qA[4 * l + 3] = vgetq_lane_s32(vi, 3); 40 | } 41 | 42 | // high half 43 | for (int l = 4; l < 8; l++) { 44 | const float32x4_t v = vmulq_n_f32(srcv[l], id); 45 | const int32x4_t vi = vcvtnq_s32_f32(v); 46 | 47 | start_qA[4 * l + 0] = vgetq_lane_s32(vi, 0); 48 | start_qA[4 * l + 1] = vgetq_lane_s32(vi, 1); 49 | start_qA[4 * l + 2] = vgetq_lane_s32(vi, 2); 50 | start_qA[4 * l + 3] = vgetq_lane_s32(vi, 3); 51 | } 52 | } 53 | } 54 | #endif 55 | #ifdef QM_x86 56 | #include 57 | void quantize_fp32_to_int8(float* A, int8_t* qA, float* sA, int size, int block_size) { 58 | int nb = size / 32; 59 | for (int i = 0; i < nb; i++) { 60 | // Load elements into 4 AVX vectors 61 | __m256 v0 = _mm256_loadu_ps(A); 62 | __m256 v1 = _mm256_loadu_ps(A + 8); 63 | __m256 v2 = _mm256_loadu_ps(A + 16); 64 | __m256 v3 = _mm256_loadu_ps(A + 24); 65 | A += 32; 66 | 67 | // Compute max(abs(e)) for the block 68 | const __m256 signBit = _mm256_set1_ps(-0.0f); 69 | __m256 maxAbs = _mm256_andnot_ps(signBit, v0); 70 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v1)); 71 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v2)); 72 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v3)); 73 | 74 | __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(maxAbs, 1), _mm256_castps256_ps128(maxAbs)); 75 | max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4)); 76 | max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4)); 77 | const float maxScalar = _mm_cvtss_f32(max4); 78 | 79 | // Quantize these floats 80 | const float d = maxScalar / 127.f; 81 | *sA++ = d; 82 | const float id = (maxScalar != 0.0f) ? 
127.f / maxScalar : 0.0f; 83 | const __m256 mul = _mm256_set1_ps(id); 84 | 85 | // Apply the multiplier 86 | v0 = _mm256_mul_ps(v0, mul); 87 | v1 = _mm256_mul_ps(v1, mul); 88 | v2 = _mm256_mul_ps(v2, mul); 89 | v3 = _mm256_mul_ps(v3, mul); 90 | 91 | // Round to nearest integer 92 | v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST); 93 | v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST); 94 | v2 = _mm256_round_ps(v2, _MM_ROUND_NEAREST); 95 | v3 = _mm256_round_ps(v3, _MM_ROUND_NEAREST); 96 | 97 | // Convert floats to integers 98 | __m256i i0 = _mm256_cvtps_epi32(v0); 99 | __m256i i1 = _mm256_cvtps_epi32(v1); 100 | __m256i i2 = _mm256_cvtps_epi32(v2); 101 | __m256i i3 = _mm256_cvtps_epi32(v3); 102 | 103 | // Convert int32 to int16 104 | i0 = _mm256_packs_epi32(i0, i1); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 105 | i2 = _mm256_packs_epi32(i2, i3); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 106 | // Convert int16 to int8 107 | i0 = _mm256_packs_epi16(i0, i2); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 108 | // 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 109 | 110 | // We got our precious signed bytes, but the order is now wrong 111 | // These AVX2 pack instructions process 16-byte pieces independently 112 | // The following instruction is fixing the order 113 | const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); 114 | i0 = _mm256_permutevar8x32_epi32(i0, perm); 115 | 116 | _mm256_storeu_si256((__m256i*)qA, i0); 117 | qA += 32; 118 | } 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /transformer/llama_exporter.py: -------------------------------------------------------------------------------- 1 | """Implementation of exporting LLaMA PyTorch model to TinyLLMEngine format. 
2 | 3 | Usage: 4 | python llama_exporter.py 5 | 6 | Example commandline: 7 | python llama_exporter.py ~/llama2-chat/hf7B models/LLaMA_7B_2_chat 8 | """ 9 | import argparse 10 | import math 11 | import os 12 | import struct 13 | 14 | import torch 15 | from transformers import LlamaForCausalLM 16 | 17 | 18 | @torch.no_grad() 19 | def _export_model(model, prefix): 20 | 21 | outpath = prefix 22 | os.makedirs(outpath, exist_ok=True) 23 | with open(os.path.join(f"{outpath}", "lm_head.bin"), "wb") as f: 24 | f.write(model.lm_head._parameters["weight"].cpu().float().numpy().tobytes()) 25 | _export_llama_model(model.model, os.path.join(f"{outpath}", "decoder")) 26 | 27 | 28 | def _export_embed_tokens(embed_tokens, prefix): 29 | outpath = prefix 30 | os.makedirs(outpath, exist_ok=True) 31 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 32 | f.write(embed_tokens.weight.cpu().float().numpy().tobytes()) 33 | 34 | 35 | def _export_llama_model(model, prefix): 36 | outpath = prefix 37 | os.makedirs(outpath, exist_ok=True) 38 | 39 | _export_embed_tokens(model.embed_tokens, os.path.join(outpath, "embed_tokens")) 40 | _export_LlamaRMSNorm(model.norm, os.path.join(outpath, "norm")) 41 | for idx, layer in enumerate(model.layers): 42 | _export_llama_layer(layer, os.path.join(outpath, f"layer{idx}")) 43 | 44 | 45 | def _export_LlamaRMSNorm(op, prefix): 46 | outpath = prefix 47 | os.makedirs(outpath, exist_ok=True) 48 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 49 | f.write(op.weight.cpu().float().numpy().tobytes()) 50 | 51 | 52 | def _export_llama_layer(layer, prefix): 53 | outpath = prefix 54 | os.makedirs(outpath, exist_ok=True) 55 | _export_attention_params(layer.self_attn, os.path.join(outpath, "self_attn")) 56 | _export_LlamaRMSNorm(layer.input_layernorm, os.path.join(outpath, "input_layernorm")) 57 | _export_LlamaRMSNorm( 58 | layer.post_attention_layernorm, 59 | os.path.join(outpath, "post_attention_layernorm"), 60 | ) 61 | _export_linearfp(layer.mlp.gate_proj, os.path.join(outpath, "gate_proj")) 62 | _export_linearfp(layer.mlp.down_proj, os.path.join(outpath, "down_proj")) 63 | _export_linearfp(layer.mlp.up_proj, os.path.join(outpath, "up_proj")) 64 | 65 | 66 | def _export_linearfp(op, prefix): 67 | outpath = prefix 68 | os.makedirs(outpath, exist_ok=True) 69 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 70 | f.write(op._parameters["weight"].cpu().float().numpy().tobytes()) 71 | 72 | 73 | def _export_rotaryEmbedding(op, prefix): 74 | outpath = prefix 75 | os.makedirs(outpath, exist_ok=True) 76 | with open(os.path.join(f"{outpath}", "cos_cached.bin"), "wb") as f: 77 | f.write(op.cos_cached.cpu().float().numpy().tobytes()) 78 | with open(os.path.join(f"{outpath}", "sin_cached.bin"), "wb") as f: 79 | f.write(op.sin_cached.cpu().float().numpy().tobytes()) 80 | 81 | 82 | def _export_BMM_F32T(alpha, prefix): 83 | outpath = prefix 84 | os.makedirs(outpath, exist_ok=True) 85 | with open(os.path.join(f"{outpath}", "alpha.bin"), "wb") as f: 86 | f.write(struct.pack("f", alpha)) 87 | 88 | 89 | def _export_attention_params(attn, prefix: str): 90 | outpath = prefix 91 | os.makedirs(outpath, exist_ok=True) 92 | _export_linearfp(attn.k_proj, os.path.join(outpath, "k_proj")) 93 | _export_linearfp(attn.v_proj, os.path.join(outpath, "v_proj")) 94 | _export_linearfp(attn.q_proj, os.path.join(outpath, "q_proj")) 95 | _export_linearfp(attn.o_proj, os.path.join(outpath, "o_proj")) 96 | qk_bmm_alpha = 1 / math.sqrt(attn.head_dim) 97 | _export_BMM_F32T(qk_bmm_alpha, 
os.path.join(outpath, "qk_bmm")) 98 | _export_rotaryEmbedding(attn.rotary_emb, os.path.join(outpath, "rotary_emb")) 99 | 100 | 101 | def main(): 102 | """Export a LLaMA model to TinyLLMEngine format.""" 103 | parser = argparse.ArgumentParser(description="export LLaMA pytorch model to TinyLLMEngine format.") 104 | parser.add_argument("model", type=str, help="Path of the LLaMA torch model") 105 | parser.add_argument("output", type=str, help="Output directory of the exported model") 106 | 107 | args = parser.parse_args() 108 | 109 | if not os.path.exists(args.model): 110 | print(f"The model path '{args.model}' does not exist.") 111 | return 112 | 113 | if not os.path.exists(args.output): 114 | print(f"The output path '{args.output}' does not exist.") 115 | return 116 | 117 | print("Loading model...") 118 | if args.model.endswith(".pt"): 119 | model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", torch_dtype=torch.float16) 120 | model.load_state_dict(torch.load(args.model)) 121 | else: 122 | model = LlamaForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16) 123 | 124 | print("Start exporting the model...") 125 | _export_model(model, args.output) 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaAttention.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Int4llamaAttention() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Int4llamaAttention attn = Int4llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 13 | 14 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | read_to_array("assets/llama/tests/atten/sqlen9/hidden_states.bin", hidden_states.m_data, b * sqlen * embed_dim); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | read_to_array("assets/llama/tests/atten/sqlen9/attention_mask.bin", attention_mask.m_data, attention_mask.length()); 19 | 20 | attn.initialized_memory(llama7B); 21 | struct Int4llamaAttention_input input(hidden_states, attention_mask, 0); 22 | 23 | struct Int4llamaAttention_output output = attn.forward(input); 24 | 25 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 26 | read_to_array("assets/llama/tests/atten/sqlen9/attn_output.bin", attn_outputGT.m_data, b * sqlen * embed_dim); 27 | Matrix3D<float> key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 28 | read_to_array("assets/llama/tests/atten/sqlen9/past_key.bin", key_statesGT.m_data, b * sqlen * embed_dim); 29 | Matrix3D<float> value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 30 | read_to_array("assets/llama/tests/atten/sqlen9/past_value.bin", value_statesGT.m_data, b * sqlen * embed_dim); 31 | 32 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 33 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length()); 34 | success &=
check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 35 | if (!success) 36 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 37 | else 38 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 39 | } 40 | 41 | void test_Int4llamaAttention_gen() { 42 | const struct model_config llama7B = llama_7B; 43 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 44 | head_dim = embed_dim / num_heads; 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Int4llamaAttention attn = Int4llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 49 | 50 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/atten/sqlen1/hidden_states.bin"); 52 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * (sqlen + past_sqlen)), b, sqlen, sqlen + past_sqlen); 53 | attention_mask.load("assets/llama/tests/atten/sqlen1/attention_mask.bin"); 54 | Matrix3D<float> past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/atten/sqlen9/past_key.bin"); 56 | Matrix3D<float> past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/atten/sqlen9/past_value.bin"); 58 | 59 | attn.initialized_memory(llama7B); 60 | struct Int4llamaAttention_input input(hidden_states, attention_mask, past_key, past_value, true, 0); 61 | 62 | struct Int4llamaAttention_output output = attn.forward(input); 63 | 64 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 65 | attn_outputGT.load("assets/llama/tests/atten/sqlen1/attn_output.bin"); 66 | Matrix3D<float> key_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, sqlen + past_sqlen, 67 | embed_dim / num_heads); 68 | key_statesGT.load("assets/llama/tests/atten/sqlen1/past_key.bin"); 69 | Matrix3D<float> value_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, 70 | sqlen + past_sqlen, embed_dim / num_heads); 71 | value_statesGT.load("assets/llama/tests/atten/sqlen1/past_value.bin"); 72 | 73 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 74 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length()); 75 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 76 | if (!success) 77 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 78 | else 79 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 80 | } 81 | 82 | int main() { 83 | // These tests are ported directly from the fp32 version and are not complete yet!
84 | // test_Int4llamaAttention(); 85 | // test_Int4llamaAttention_gen(); 86 | } 87 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaAttention.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Fp32llamaAttention() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32llamaAttention attn = Fp32llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | read_to_array("assets/llama/tests/atten/sqlen9/hidden_states.bin", hidden_states.m_data, b * sqlen * embed_dim); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | read_to_array("assets/llama/tests/atten/sqlen9/attention_mask.bin", attention_mask.m_data, attention_mask.length()); 19 | 20 | attn.initialized_memory(llama7B); 21 | struct Fp32llamaAttention_input input(hidden_states, attention_mask, 0); 22 | 23 | struct Fp32llamaAttention_output output = attn.forward(input); 24 | 25 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 26 | read_to_array("assets/llama/tests/atten/sqlen9/attn_output.bin", attn_outputGT.m_data, b * sqlen * embed_dim); 27 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 28 | read_to_array("assets/llama/tests/atten/sqlen9/past_key.bin", key_statesGT.m_data, b * sqlen * embed_dim); 29 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 30 | read_to_array("assets/llama/tests/atten/sqlen9/past_value.bin", value_statesGT.m_data, b * sqlen * embed_dim); 31 | 32 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 33 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 34 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 35 | // print_first_k_elelment("output.attn_output", output.attn_output.m_data, 20); 36 | // print_first_k_elelment("attn_outputGT", attn_outputGT.m_data, 20); 37 | if (!success) 38 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 39 | else 40 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 41 | } 42 | 43 | void test_Fp32llamaAttention_gen() { 44 | const struct model_config llama7B = llama_7B; 45 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 46 | head_dim = embed_dim / num_heads; 47 | 48 | MemoryAllocator mem_buf; 49 | 50 | Fp32llamaAttention attn = Fp32llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 51 | 52 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 53 | hidden_states.load("assets/llama/tests/atten/sqlen1/hidden_states.bin"); 54 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * (sqlen + past_sqlen)), b, sqlen, sqlen + past_sqlen); 55 | attention_mask.load("assets/llama/tests/atten/sqlen1/attention_mask.bin"); 56 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_key.load("assets/llama/tests/atten/sqlen9/past_key.bin"); 58 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 59 | past_value.load("assets/llama/tests/atten/sqlen9/past_value.bin"); 60 | 61 | attn.initialized_memory(llama7B); 62 | struct Fp32llamaAttention_input input(hidden_states, attention_mask, past_key, past_value, true, 0); 63 | 64 | struct Fp32llamaAttention_output output = attn.forward(input); 65 | 66 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 67 | attn_outputGT.load("assets/llama/tests/atten/sqlen1/attn_output.bin"); 68 | Matrix3D key_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, sqlen + past_sqlen, 69 | embed_dim / num_heads); 70 | key_statesGT.load("assets/llama/tests/atten/sqlen1/past_key.bin"); 71 | Matrix3D value_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, 72 | sqlen + past_sqlen, embed_dim / num_heads); 73 | value_statesGT.load("assets/llama/tests/atten/sqlen1/past_value.bin"); 74 | 75 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 76 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 77 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 78 | // print_first_k_elelment("output.attn_output", output.attn_output.m_data, 20); 79 | // print_first_k_elelment("attn_outputGT", attn_outputGT.m_data, 20); 80 | if (!success) 81 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 82 | else 83 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 84 | } 85 | 86 | int main() { 87 | test_Fp32llamaAttention(); 88 | test_Fp32llamaAttention_gen(); 89 | } 90 | -------------------------------------------------------------------------------- /kernels/starter_code/multithreading.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #include "common.h" 10 | struct multithreading_thread_args { 11 | int start, end; 12 | const struct matmul_params* params; 13 | }; 14 | static void* multithreading_worker_func(void* args) { 15 | struct multithreading_thread_args* mat_args = (struct multithreading_thread_args*)args; 16 | const struct matmul_params* params = mat_args->params; 17 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 18 | const int block_size = params->block_size; 19 | 20 | int m = C->row, n = C->column, k = A->column; 21 | // A: m x k; B: n x k; C: m x n 22 | for (int row = 0; row < m; row++) { 23 | for (int col = mat_args->start; col < mat_args->end; col++) { 24 | float acc = 0; 25 | // Compute each block 26 | for (int ch = 0; ch < k;) { 27 | // pointer of the int4 weights 28 | uint8_t* w_int4 = &B->int4_data_ptr[(col * k + ch) / 2]; 29 | // pointer of the int8 activation 30 | const signed char* a_int8 = &A->int8_data_ptr[row * k + ch]; 31 | // scale of weight 32 | float s_w = params->scales[(col * k + ch) / block_size]; 33 | // scale of activation 34 | float s_a = params->A_scales[(row * k + ch) / block_size]; 35 | #ifdef QM_ARM 36 | // order of weights with QM_ARM: 37 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w30,w31) 38 | // QM_ARM order: (w0,w16),(w1,w17),(w2,w18),(w3,w19),(w4, w20),... (w15,w31) 39 | // |--| 40 | // 4 bits 41 | // |------| 42 | // 8 bits (byte) 43 | // low|----------------------------------------------------------|high 44 | // 0 128 bit 127 45 | // process 16 bytes of weigths (128 bit) = 1 block 46 | // intermediate variable to store sum of integer multiplication and accumulation 47 | int intermediate_sum = 0; 48 | // process 16 bytes of weigths (128 bit) 49 | for (int qj = 0; qj < 16; qj++) { 50 | // decode a packed byte into two int8 in the range of (-8, 7) 51 | uint8_t packed_int4_0 = w_int4[qj]; 52 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 53 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 54 | // int8 multiply and accumulate operation 55 | intermediate_sum += a_int8[qj] * w_de_0; 56 | intermediate_sum += a_int8[qj + 16] * w_de_16; 57 | } 58 | // dequantize the sum into floating point 59 | acc += (float)intermediate_sum * s_a * s_w; 60 | ch += block_size; 61 | #endif 62 | #ifdef QM_x86 63 | // scales of the second block 64 | float s_w_2nd = params->scales[(col * k + ch) / block_size + 1]; 65 | float s_a_2nd = params->A_scales[(row * k + ch) / block_size + 1]; 66 | // order of weights with QM_x86: 67 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w62,w63) 68 | // QM_ARM order: (w0,w32),(w1,w33),(w2,w34),(w3,w35),(w4, w36),... 
(w31,w63) 69 | // |--| 70 | // 4 bits 71 | // |------| 72 | // 8 bits (byte) 73 | // low|----------------------------------------------------------|high 74 | // 0 256 bit 75 | // process 32 bytes of weigths (256 bit) = 2 blocks 76 | // intermediate variable to store sum of integer multiplication and accumulation 77 | int intermediate_sum = 0, intermediate_sum_2nd = 0; 78 | for (int qj = 0; qj < 32; qj++) { 79 | // decode a packed byte into two int8 in the range of (-8, 7) 80 | uint8_t packed_int4_0 = w_int4[qj]; 81 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 82 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 83 | // int8 multiply and accumulate operation 84 | intermediate_sum += a_int8[qj] * w_de_0; 85 | intermediate_sum_2nd += a_int8[qj + 32] * w_de_16; 86 | } 87 | // dequantize the sum into floating point 88 | acc += (float)intermediate_sum * s_a * s_w; 89 | acc += (float)intermediate_sum_2nd * s_a_2nd * s_w_2nd; 90 | ch += block_size * 2; 91 | #endif 92 | } 93 | C->data_ptr[row * n + col] = acc; 94 | } 95 | } 96 | return NULL; 97 | } 98 | 99 | namespace matmul { 100 | void MatmulOperator::mat_mul_multithreading(struct matmul_params* params) { 101 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 102 | const int block_size = params->block_size; 103 | 104 | quantize_fp32_to_int8(A->data_ptr, A->int8_data_ptr, params->A_scales, A->row * A->column, block_size); 105 | 106 | int m = C->row, n = C->column, k = A->column; 107 | 108 | const int num_thread = 4; 109 | pthread_t thread_pool[num_thread]; 110 | struct multithreading_thread_args threads_args[num_thread]; 111 | 112 | // TODO: Thread creation 113 | 114 | // TODO: Join threads 115 | }; 116 | } // namespace matmul 117 | --------------------------------------------------------------------------------
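A note on the two TODOs at the end of kernels/starter_code/multithreading.cc: one way to fill them in — a minimal sketch that assumes the n output columns are simply split into contiguous ranges across the num_thread workers, using pthread_create/pthread_join from <pthread.h>; it is not the reference solution — is to hand each worker its column range plus the shared params, launch the workers, and then join them:

        // Sketch only: contiguous column split; the last thread takes any remainder columns.
        int cols_per_thread = n / num_thread;
        for (int t = 0; t < num_thread; t++) {
            threads_args[t].start = t * cols_per_thread;
            threads_args[t].end = (t == num_thread - 1) ? n : (t + 1) * cols_per_thread;
            threads_args[t].params = params;
            pthread_create(&thread_pool[t], NULL, multithreading_worker_func, &threads_args[t]);
        }
        for (int t = 0; t < num_thread; t++) pthread_join(thread_pool[t], NULL);

Because each worker writes a disjoint set of columns of C, no synchronization beyond the final join is needed.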