├── .flake8 ├── transformer ├── quantize_constants.py ├── zip_assets.sh ├── models │ └── llama_vocab.bin ├── tests │ ├── assets │ │ ├── input.bin │ │ └── output.bin │ ├── utils_memalloc.h │ ├── test_OPTGenerate.cc │ ├── test_linear.cc │ ├── test_OPTTokenizer.cc │ ├── test_Fp32llamaForCausalLM.cc │ ├── test_Int4llamaForCausalLM.cc │ ├── test_LLaMATokenizer.cc │ ├── test_Fp32llamaDecoder.cc │ ├── test_Int4llamaDecoder.cc │ ├── test_Fp32llamaDecoderLayer.cc │ ├── test_Int4llamaDecoderLayer.cc │ ├── test_Int4llamaAttention.cc │ └── test_Fp32llamaAttention.cc ├── include │ ├── ops │ │ ├── arg_max.h │ │ ├── LlamaRMSNorm.h │ │ ├── LayerNorm.h │ │ ├── BMM_F32T.h │ │ ├── LayerNormQ.h │ │ ├── BMM_S8T_S8N_S8T.h │ │ ├── BMM_S8T_S8N_F32T.h │ │ ├── W8A8B8O8Linear.h │ │ ├── W8A8BFP32OFP32Linear.h │ │ ├── W8A8B8O8LinearReLU.h │ │ ├── Embedding.h │ │ ├── RotaryPosEmb.h │ │ └── linear.h │ ├── operators.h │ ├── nn_modules │ │ ├── OPTForCausalLM.h │ │ ├── Fp32llamaForCausalLM.h │ │ ├── Int4llamaForCausalLM.h │ │ ├── Fp32llamaDecoder.h │ │ ├── Int4llamaDecoder.h │ │ ├── Int8OPTDecoder.h │ │ ├── Fp32llamaAttention.h │ │ ├── Int4llamaAttention.h │ │ ├── Fp32llamaDecoderLayer.h │ │ ├── Int4llamaDecoderLayer.h │ │ ├── Int8OPTAttention.h │ │ └── Int8OPTDecoderLayer.h │ ├── model.h │ ├── LLaMATokenizer.h │ ├── OPTTokenizer.h │ ├── utils.h │ ├── profiler.h │ ├── common.h │ └── Generate.h ├── test.sh ├── profile.sh ├── src │ ├── ops │ │ ├── arg_max.cc │ │ ├── batch_add.cc │ │ ├── embedding.cc │ │ ├── LlamaRMSNorm.cc │ │ ├── softmax.cc │ │ ├── RotaryPosEmb.cc │ │ ├── LayerNorm.cc │ │ ├── LayerNormQ.cc │ │ ├── BMM_S8T_S8N_S8T.cc │ │ ├── BMM_S8T_S8N_F32T.cc │ │ ├── W8A8BFP32OFP32Linear.cc │ │ ├── W8A8B8O8LinearReLU.cc │ │ ├── W8A8B8O8Linear.cc │ │ └── BMM_F32T.cc │ └── nn_modules │ │ ├── OPTForCausalLM.cc │ │ ├── Fp32llamaForCausalLM.cc │ │ ├── Int4llamaForCausalLM.cc │ │ ├── Int8OPTDecoderLayer.cc │ │ ├── Fp32llamaDecoder.cc │ │ └── Int4llamaDecoder.cc ├── evaluate.sh ├── .pre-commit-config.yaml ├── download_assets.sh ├── upload.py ├── quantize_and_upload.py ├── Makefile └── llama_exporter.py ├── assets └── figures │ └── chat.gif ├── .clang-format ├── kernels ├── metal │ ├── download_metal-cpp.sh │ ├── include │ │ ├── opParams.h │ │ └── MetalMatmulInt4.hpp │ ├── Makefile │ ├── matmul_ref_fp32.cc │ ├── matmul_metal_int4_imp.h │ └── matmul_metal_int4.cc ├── cuda │ ├── gemm_cuda.h │ ├── matmul.cu │ └── dequantize.cuh ├── ref │ ├── matmul_ref_fp32.cc │ └── matmul_ref_int4.cc ├── neon │ └── matmul_ref_fp32.cc ├── matmul_int8.cc ├── matmul_imp.cc ├── matmul.h ├── avx │ └── matmul_avx_fp32.cc ├── starter_code │ ├── reference.cc │ └── multithreading.cc └── quantizer.cc ├── .gitmodules ├── .gitignore ├── .pre-commit-config.yaml ├── pyproject.toml └── README.md /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /transformer/quantize_constants.py: -------------------------------------------------------------------------------- 1 | STORE_FP16 = False 2 | -------------------------------------------------------------------------------- /transformer/zip_assets.sh: -------------------------------------------------------------------------------- 1 | zip -r assets.zip assets 2 | zip -r models.zip models 3 | -------------------------------------------------------------------------------- /assets/figures/chat.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/assets/figures/chat.gif -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | ColumnLimit: 120 3 | ContinuationIndentWidth: 4 4 | IndentWidth: 4 5 | TabWidth: 4 6 | -------------------------------------------------------------------------------- /transformer/models/llama_vocab.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/models/llama_vocab.bin -------------------------------------------------------------------------------- /transformer/tests/assets/input.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/tests/assets/input.bin -------------------------------------------------------------------------------- /transformer/tests/assets/output.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/tinychat-tutorial/HEAD/transformer/tests/assets/output.bin -------------------------------------------------------------------------------- /kernels/metal/download_metal-cpp.sh: -------------------------------------------------------------------------------- 1 | wget https://developer.apple.com/metal/cpp/files/metal-cpp_macOS13_iOS16.zip 2 | unzip metal-cpp_macOS13_iOS16.zip 3 | -------------------------------------------------------------------------------- /transformer/include/ops/arg_max.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #define FLOAT_MIN -1000000.0 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output); 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "json"] 2 | path = json 3 | url = https://github.com/nlohmann/json 4 | [submodule "transformer/json"] 5 | path = transformer/json 6 | url = https://github.com/nlohmann/json 7 | -------------------------------------------------------------------------------- /kernels/metal/include/opParams.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef struct { 4 | unsigned int m; 5 | unsigned int n; 6 | unsigned int k; 7 | unsigned int group_size; 8 | } MetalMatMulParams; 9 | -------------------------------------------------------------------------------- /kernels/cuda/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scaling_factors, 4 | torch::Tensor _zeros, int split_k_iters); 5 | -------------------------------------------------------------------------------- /transformer/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'test_' 6 | for file in test_*; do 7 | # Check if the file is executable 8 | if [ -x "$file" ]; then 9 | echo "Running 
'$file'..." 10 | ./"$file" 11 | exit_code=$? 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /transformer/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make clean && make -j 4 | 5 | # Find all executable files in the current directory starting with 'profile_' 6 | for file in profile_*; do 7 | # Check if the file is executable 8 | if [ -x "$file" ]; then 9 | echo "Running '$file'..." 10 | ./"$file" 11 | exit_code=$? 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /transformer/include/ops/LlamaRMSNorm.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class LlamaRMSNorm { 4 | public: 5 | LlamaRMSNorm(Matrix3D _weight) : weight(_weight){}; 6 | LlamaRMSNorm(){}; 7 | void forward(const Matrix3D &x, Matrix3D &output); 8 | Matrix3D weight; 9 | float eps = 1e-6; 10 | 11 | private: 12 | std::string profile_name = "LlamaRMSNorm"; 13 | }; 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | !main_int8.o 3 | !main.o 4 | !matmul_avx_int8.o 5 | !matmul_imp.o 6 | !matmul_int8.o 7 | !matmul_int4.o 8 | !matmul_onednn.o 9 | !utils.o 10 | *.a 11 | .DS_Store 12 | .build/ 13 | .cache/ 14 | .direnv/ 15 | .envrc 16 | .swiftpm 17 | .venv 18 | .vs/ 19 | .vscode/ 20 | 21 | assets/ 22 | *.bin 23 | !ggml-vocab.bin 24 | *.zip 25 | *.txt 26 | *.json 27 | test_* 28 | !test_*.cc 29 | demo 30 | profile_* 31 | !profile_*.cc 32 | libtorch/ 33 | -------------------------------------------------------------------------------- /transformer/include/ops/LayerNorm.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct LayerNorm_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | }; 7 | 8 | class LayerNorm { 9 | public: 10 | LayerNorm(LayerNorm_params ¶ms_): params(params_) {}; 11 | LayerNorm(){}; 12 | void forward(const Matrix3D &x, Matrix3D &output); 13 | struct LayerNorm_params params; 14 | private: 15 | std::string profile_name = "LayerNorm"; 16 | }; 17 | 18 | void load_LayerNorm(LayerNorm &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | class BMM_F32T { 4 | public: 5 | BMM_F32T(float _alpha); 6 | BMM_F32T(){}; 7 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 8 | void forward_weight_untransposed(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 9 | float alpha; 10 | 11 | private: 12 | std::string profile_name = "BMM_F32T"; 13 | }; 14 | 15 | void load_BMM_F32T(BMM_F32T &op, std::string prefix); 16 | -------------------------------------------------------------------------------- /transformer/include/ops/LayerNormQ.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct LayerNormQ_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | }; 7 | 8 | class LayerNormQ { 9 | public: 10 | LayerNormQ(LayerNormQ_params ¶ms_): params(params_) {}; 11 | LayerNormQ(){}; 12 | void forward(const Matrix3D &x, Matrix3D &output); 13 | struct LayerNormQ_params params; 14 | private: 15 | 
std::string profile_name = "LayerNormQ"; 16 | }; 17 | 18 | void load_LayerNormQ(LayerNormQ &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_S8T_S8N_S8T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_S8T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_S8T{ 8 | public: 9 | BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T_params ¶ms_); 10 | BMM_S8T_S8N_S8T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | private: 15 | std::string profile_name = "BMM_S8T_S8N_S8T"; 16 | }; 17 | 18 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/BMM_S8T_S8N_F32T.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct BMM_S8T_S8N_F32T_params { 4 | float alpha; 5 | }; 6 | 7 | class BMM_S8T_S8N_F32T{ 8 | public: 9 | BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T_params ¶ms_); 10 | BMM_S8T_S8N_F32T(){}; 11 | void forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output); 12 | struct matmul_params params; 13 | float alpha; 14 | private: 15 | std::string profile_name = "BMM_S8T_S8N_F32T"; 16 | }; 17 | 18 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/W8A8B8O8Linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8B8O8Linear_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | float alpha; 7 | float beta; 8 | }; 9 | 10 | class W8A8B8O8Linear { 11 | public: 12 | W8A8B8O8Linear(W8A8B8O8Linear_params ¶ms_); 13 | W8A8B8O8Linear(){}; 14 | void forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | float beta; 18 | 19 | private: 20 | std::string profile_name = "W8A8B8O8Linear"; 21 | }; 22 | 23 | void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix); 24 | -------------------------------------------------------------------------------- /transformer/include/ops/W8A8BFP32OFP32Linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8BFP32OFP32Linear_params { 4 | Matrix3D weight; 5 | Matrix3D bias; 6 | float alpha; 7 | }; 8 | 9 | 10 | class W8A8BFP32OFP32Linear{ 11 | public: 12 | W8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear_params ¶ms_); 13 | W8A8BFP32OFP32Linear(){}; 14 | void forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | private: 18 | std::string profile_name = "W8A8BFP32OFP32Linear"; 19 | }; 20 | 21 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/W8A8B8O8LinearReLU.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | struct W8A8B8O8LinearReLU_params { 4 | Matrix3D weight; 5 | Matrix3D bias_int8; 6 | float alpha; 7 | float beta; 8 | }; 9 | 10 | class W8A8B8O8LinearReLU { 11 | public: 12 | W8A8B8O8LinearReLU(W8A8B8O8LinearReLU_params ¶ms_); 13 | W8A8B8O8LinearReLU(){}; 14 | void 
forward(const Matrix3D &x, Matrix3D &output); 15 | struct matmul_params params; 16 | float alpha; 17 | float beta; 18 | 19 | private: 20 | std::string profile_name = "W8A8B8O8LinearReLU"; 21 | }; 22 | 23 | void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix); 24 | -------------------------------------------------------------------------------- /transformer/tests/utils_memalloc.h: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | class MemoryAllocator { 3 | // TODO: use allocate_aligned_memory instead! 4 | public: 5 | MemoryAllocator() { this->counter = 0; } 6 | float* get_fpbuffer(int size) { 7 | float* ptr; 8 | allocate_aligned_memory(ptr, size * sizeof(float)); 9 | return ptr; 10 | } 11 | int8_t* get_int8buffer(int size) { 12 | int8_t* ptr; 13 | allocate_aligned_memory(ptr, size * sizeof(int8_t)); 14 | return ptr; 15 | } 16 | int* get_intbuffer(int size) { 17 | int* ptr; 18 | allocate_aligned_memory(ptr, size * sizeof(int)); 19 | return ptr; 20 | } 21 | 22 | private: 23 | int counter; 24 | }; 25 | -------------------------------------------------------------------------------- /transformer/include/ops/Embedding.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | 4 | 5 | 6 | class Embedding { 7 | public: 8 | Embedding(int embed_dim_, int voc_size_, int padding_idx_, Matrix3D lookup_) 9 | : embed_dim(embed_dim_), voc_size(voc_size_), padding_idx(padding_idx_), lookup(lookup_) { 10 | assert(lookup_.m_dim_y == voc_size_); 11 | assert(lookup_.m_dim_z == embed_dim_); 12 | } 13 | Embedding(){}; 14 | void forward(Matrix3D input_id, Matrix3D output); 15 | int embed_dim, voc_size, padding_idx; 16 | Matrix3D lookup; 17 | private: 18 | std::string profile_name = "Embedding"; 19 | }; 20 | 21 | 22 | void load_Embedding_params(Embedding &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/include/ops/RotaryPosEmb.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include "utils.h" 4 | 5 | class RotaryPosEmb 6 | { 7 | public: 8 | RotaryPosEmb(Matrix3D _cos, Matrix3D _sin, std::string path) 9 | { 10 | sin = _sin; 11 | cos = _cos; 12 | read_to_array((path + "/cos_cached.bin").c_str(), cos.m_data, cos.length()); 13 | read_to_array((path + "/sin_cached.bin").c_str(), sin.m_data, sin.length()); 14 | }; 15 | RotaryPosEmb(){}; 16 | void forward(Matrix3D &key, Matrix3D &value, int start_idx, int len); 17 | Matrix3D cos, sin; 18 | 19 | private: 20 | std::string profile_name = "RotaryPosEmb"; 21 | }; 22 | 23 | void load_RotaryPosEmb(RotaryPosEmb &op, std::string prefix); -------------------------------------------------------------------------------- /transformer/src/ops/arg_max.cc: -------------------------------------------------------------------------------- 1 | #include "ops/arg_max.h" 2 | 3 | #include 4 | 5 | void arg_max_dim2(Matrix3D &input, Matrix3D &output) { 6 | int bz = input.m_dim_x; 7 | int sqlen = input.m_dim_y; 8 | int voc_size = input.m_dim_z; 9 | 10 | assert(sqlen == output.m_dim_z); 11 | assert(bz == output.m_dim_x); 12 | 13 | for (int b = 0; b < bz; b++) { 14 | for (int i = 0; i < sqlen; i++) { 15 | float max = FLOAT_MIN; 16 | int max_idx = -1; 17 | for (int j = 0; j < voc_size; j++) { 18 | float v = input(b, i, j); 19 | if (max < v) { 20 | max = v; 21 | max_idx = j; 22 | } 23 | } 24 | output(b, 0, 
i) = max_idx; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /kernels/metal/Makefile: -------------------------------------------------------------------------------- 1 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 2 | CXXFLAGS = -std=c++17 -stdlib=libc++ -O3 3 | 4 | # Executable and source files 5 | TEST_TARGET = benchmark 6 | TARGET = $(TEST_TARGET) 7 | KERNEL_SRC = $(wildcard ./src/*.cpp) 8 | 9 | SRC = $(KERNEL_SRC) 10 | INCLUDE_DIRS = -I./metal-cpp -I./include 11 | LIB = -framework Metal -framework Foundation -framework MetalKit 12 | 13 | 14 | # Default target 15 | all: $(TARGET) 16 | 17 | # Linking 18 | benchmark: build_metallib 19 | $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o benchmark.x app/main.cpp $(SRC) $(LIB) $(LDFLAGS) 20 | 21 | build_air: 22 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(INCLUDE_DIRS) -c kernel/op.metal -o library.air 23 | 24 | build_metallib: build_air 25 | xcrun -sdk macosx metallib library.air -o default.metallib 26 | 27 | # Clean up 28 | clean: 29 | rm -f benchmark.x library.air library.metallib default.metallib 30 | -------------------------------------------------------------------------------- /transformer/include/operators.h: -------------------------------------------------------------------------------- 1 | #ifndef OPERATORS_H 2 | #define OPERATORS_H 3 | #include 4 | 5 | #include "common.h" 6 | #include "matmul.h" 7 | 8 | #define BLK_SIZE 16 9 | #define NUM_THREAD 4 10 | 11 | // include all ops 12 | #include "ops/BMM_F32T.h" 13 | #include "ops/BMM_S8T_S8N_F32T.h" 14 | #include "ops/BMM_S8T_S8N_S8T.h" 15 | #include "ops/Embedding.h" 16 | #include "ops/LayerNorm.h" 17 | #include "ops/LayerNormQ.h" 18 | #include "ops/LlamaRMSNorm.h" 19 | #include "ops/RotaryPosEmb.h" 20 | #include "ops/W8A8B8O8Linear.h" 21 | #include "ops/W8A8B8O8LinearReLU.h" 22 | #include "ops/W8A8BFP32OFP32Linear.h" 23 | #include "ops/arg_max.h" 24 | #include "ops/linear.h" 25 | 26 | void softmax(const Matrix3D &input, Matrix3D &output, int dim); 27 | void batch_Add(const Matrix3D &input, const Matrix3D &input2, Matrix3D &output); 28 | template 29 | void linear(Matrix3D &a, Matrix3D &b, Matrix3D &c); 30 | 31 | #endif // OPERATORS_H 32 | -------------------------------------------------------------------------------- /transformer/src/ops/batch_add.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | 3 | void batch_Add(const Matrix3D &input, const Matrix3D &input2,Matrix3D &output) { 4 | PROFILE_START("batch_Add"); 5 | assert(input.m_dim_y == input2.m_dim_y); 6 | assert(input.m_dim_z == input2.m_dim_z); 7 | assert(input.m_dim_x == output.m_dim_x); 8 | assert(input.m_dim_y == output.m_dim_y); 9 | assert(input.m_dim_z == output.m_dim_z); 10 | 11 | if (input.m_dim_x != input2.m_dim_x && input2.m_dim_x == 1) { 12 | // Find the maximum value in the input array 13 | for (int i = 0; i < input.m_dim_x; i++) { 14 | for (int j = 0; j < input.m_dim_y; j++) { 15 | for (int k = 0; k < input.m_dim_z; k++){ 16 | output(i, j, k) = input(i, j, k) + input2(0, j, k); 17 | } 18 | } 19 | } 20 | } else { 21 | throw("Unsupported dimension for softmax"); 22 | } 23 | PROFILE_END("batch_Add"); 24 | } -------------------------------------------------------------------------------- /transformer/src/ops/embedding.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void 
load_Embedding_params(Embedding& op, std::string prefix) { 7 | op.lookup.load((prefix + "/weight.bin").c_str()); 8 | // read_to_array((prefix + "/weight.bin").c_str(), op.lookup.m_data, op.lookup.length()); 9 | } 10 | 11 | void Embedding::forward(Matrix3D input_id, Matrix3D output) { 12 | PROFILE_START(profile_name); 13 | assert(input_id.m_dim_x == 1); 14 | assert(input_id.m_dim_y == 1); 15 | assert(input_id.m_dim_z == output.m_dim_y); 16 | assert(output.m_dim_z == this->embed_dim); 17 | 18 | for (int i = 0; i < input_id.m_dim_z; i++) { 19 | int token_id = input_id(0, 0, i); 20 | float* output_sample_ptr = &output.m_data[i * this->embed_dim]; 21 | float* target_embed = &this->lookup.m_data[token_id * this->embed_dim]; 22 | memcpy(output_sample_ptr, target_embed, sizeof(float) * this->embed_dim); 23 | } 24 | PROFILE_END(profile_name); 25 | } 26 | -------------------------------------------------------------------------------- /transformer/tests/test_OPTGenerate.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Generate.h" 4 | 5 | int main() { 6 | // std::vector input_ids = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 7 | // 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 8 | std::string vocab_file = "./models/OPT_125m/vocab.json"; 9 | std::string bpe_file = "./models/OPT_125m/merges.txt"; 10 | 11 | Encoder encoder = get_encoder(vocab_file, bpe_file); 12 | std::vector input_ids = encoder.encode("John went to MIT and study Computer Science."); 13 | 14 | std::string decoded = encoder.decode(input_ids); 15 | std::cout << "input:" << decoded << std::endl; 16 | 17 | OPTForCausalLM model = OPTForCausalLM("models/OPT_125m", get_opt_model_config(OPT_125M)); 18 | const struct opt_params generation_config; 19 | std::vector generated_ids = OPTGenerate(model, input_ids, generation_config); 20 | 21 | decoded = encoder.decode(generated_ids); 22 | std::cout << "generated:" << decoded << std::endl; 23 | }; 24 | -------------------------------------------------------------------------------- /kernels/ref/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/metal/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params 
*params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/neon/matmul_ref_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void fp32_ref_matmul(const struct matmul_params *params) { 12 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 13 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 14 | 15 | assert(A->column == B->row); 16 | assert(C->row == A->row); 17 | assert(C->column == B->column); 18 | int m = A->row, n = B->column, k = A->column; 19 | 20 | for (int i = 0; i < m; i++) { 21 | for (int j = 0; j < n; j++) { 22 | float acc = 0; 23 | for (int kk = 0; kk < k; kk++) { 24 | acc += data_A[i * k + kk] * data_B[j * k + kk]; 25 | } 26 | acc = acc; 27 | data_C[i * n + j] = acc; 28 | } 29 | } 30 | } 31 | 32 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 33 | fp32_ref_matmul(params); 34 | } 35 | 36 | } // namespace matmul 37 | -------------------------------------------------------------------------------- /kernels/metal/include/MetalMatmulInt4.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Foundation/Foundation.hpp" 4 | #include "Metal/Metal.hpp" 5 | #include "opParams.h" 6 | 7 | class MetalMatmulInt4 { 8 | public: 9 | MTL::Device *_mDevice; 10 | 11 | // The compute pipeline generated from the compute kernel in the .metal shader file. 12 | MTL::ComputePipelineState *_mMatmulFunctionPSO; 13 | 14 | // The command queue used to pass commands to the device. 15 | MTL::CommandQueue *_mCommandQueue; 16 | 17 | // Buffers to hold data. 
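// (Inferred roles, not stated in the original header: judging from MetalMatMulParams
// {m, n, k, group_size} and the fp32/int4 buffer types used by the Metal kernels,
// _mBufferA presumably holds the fp32 activations, _mBufferB the packed 4-bit weights,
// _mBufferScales the per-group dequantization scales, and _mBufferResult the fp32 output.)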
18 | MTL::Buffer *_mBufferA; 19 | MTL::Buffer *_mBufferB; 20 | MTL::Buffer *_mBufferScales; 21 | MTL::Buffer *_mBufferResult; 22 | MTL::Buffer *_mParams; 23 | 24 | // Matmul params 25 | MetalMatMulParams *_mParamsPtr; 26 | 27 | MetalMatmulInt4(MTL::Device *device, MetalMatMulParams param); 28 | ~MetalMatmulInt4(); 29 | 30 | void prepareData(); 31 | void sendComputeCommand(); 32 | void verifyResults(); 33 | 34 | private: 35 | void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 36 | void generateRandomFloatData(MTL::Buffer *buffer, int length); 37 | void generateRandomIn4Data(MTL::Buffer *buffer, int length); 38 | }; 39 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/OPTForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Int8OPTDecoder.h" 2 | 3 | struct OPTForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct OPTForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | OPTForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 13 | OPTForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 14 | std::vector> past_values_) 15 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 16 | has_past_keys_values = true; 17 | } 18 | }; 19 | 20 | class OPTForCausalLM { 21 | public: 22 | OPTForCausalLM(std::string param_path, const struct model_config config); 23 | struct OPTForCausalLM_output forward(const struct OPTForCausalLM_input& input); 24 | 25 | private: 26 | Int8OPTDecoder decoder; 27 | Linear_FP lm_head; 28 | std::string profile_name = "OPTForCausalLM"; 29 | float* logits_output; 30 | float* lm_head_weight; 31 | }; 32 | -------------------------------------------------------------------------------- /kernels/matmul_int8.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "matmul.h" 5 | 6 | namespace matmul { 7 | 8 | void MatmulOperator::naive_mat_mul_int8(const struct matmul_params *params) { 9 | int i, j, k; 10 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 11 | int32_t A_zp = A->qparams.zero_point, C_zp = C->qparams.zero_point; 12 | float A_sc = A->qparams.scale, B_sc = B->qparams.scale, C_sc = C->qparams.scale; 13 | float effective_scale = A_sc * B_sc / C_sc; 14 | int8_t *data_A = A->int8_data_ptr, *data_B = B->int8_data_ptr, *data_C = C->int8_data_ptr; 15 | const int8_t q_min = C->qparams.q_min, q_max = C->qparams.q_max; 16 | CHECK_MATRICES(A, B, C); 17 | 18 | for (i = 0; i < C->row; i++) 19 | for (j = 0; j < C->column; j++) { 20 | int acc = 0; 21 | for (k = 0; k < A->column; k++) 22 | acc += ((int32_t)data_A[i * A->column + k] - A_zp) * data_B[k * B->column + j]; 23 | 24 | acc = (int32_t)((float)acc * effective_scale); 25 | acc -= C_zp; 26 | acc = MAX(acc, q_min); 27 | acc = MIN(acc, q_max); 28 | data_C[i * C->column + j] = (int8_t)acc; 29 | } 30 | } 31 | } // namespace matmul 32 | -------------------------------------------------------------------------------- /transformer/src/ops/LlamaRMSNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void LlamaRMSNorm::forward(const Matrix3D &x, Matrix3D &output) { 8 | PROFILE_START(profile_name); 9 | const int 
last_dims = 2; 10 | 11 | assert(last_dims == 2); // support the last dim for now 12 | assert(output.m_dim_x == x.m_dim_x); 13 | assert(output.m_dim_y == x.m_dim_y); 14 | assert(output.m_dim_z == x.m_dim_z); 15 | assert(x.m_dim_z == weight.m_dim_z); 16 | 17 | for (int i = 0; i < x.m_dim_x; i++) { // batches 18 | for (int j = 0; j < x.m_dim_y; j++) { // samples 19 | float var = 0; 20 | 21 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 22 | var += x(i, j, k) * x(i, j, k); 23 | } 24 | var /= static_cast(x.m_dim_z); 25 | float variance = 1.0 / sqrt(var + eps); 26 | 27 | for (int k = 0; k < x.m_dim_z; k++) { 28 | float value = static_cast(x(i, j, k)); 29 | float fp_out = (value * variance) * weight(0, 0, k); 30 | output(i, j, k) = fp_out; 31 | } 32 | } 33 | } 34 | 35 | PROFILE_END(profile_name); 36 | } 37 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoder.h" 2 | 3 | struct Fp32LlamaForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct Fp32LlamaForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | Fp32LlamaForCausalLM_input() {} 13 | Fp32LlamaForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 14 | Fp32LlamaForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 15 | std::vector> past_values_) 16 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 17 | has_past_keys_values = true; 18 | } 19 | }; 20 | 21 | class Fp32LlamaForCausalLM { 22 | public: 23 | Fp32LlamaForCausalLM(std::string param_path, const struct model_config config); 24 | 25 | struct Fp32LlamaForCausalLM_output forward(const struct Fp32LlamaForCausalLM_input& input); 26 | 27 | private: 28 | Fp32llamaDecoder decoder; 29 | Linear_FP lm_head; 30 | std::string profile_name = "Fp32LlamaForCausalLM"; 31 | float* logits_output; 32 | float* lm_head_weight; 33 | }; 34 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaForCausalLM.h: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoder.h" 2 | 3 | struct Int4LlamaForCausalLM_output { 4 | Matrix3D logits; 5 | std::vector> past_keys, past_values; 6 | }; 7 | struct Int4LlamaForCausalLM_input { 8 | Matrix3D input_ids; 9 | std::vector> past_keys, past_values; 10 | bool has_past_keys_values; 11 | 12 | Int4LlamaForCausalLM_input() {} 13 | Int4LlamaForCausalLM_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 14 | Int4LlamaForCausalLM_input(Matrix3D input_ids_, std::vector> past_keys_, 15 | std::vector> past_values_) 16 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 17 | has_past_keys_values = true; 18 | } 19 | }; 20 | 21 | class Int4LlamaForCausalLM { 22 | public: 23 | Int4LlamaForCausalLM(std::string param_path, const struct model_config config); 24 | struct Int4LlamaForCausalLM_output forward(const struct Int4LlamaForCausalLM_input& input); 25 | 26 | private: 27 | Int4llamaDecoder decoder; 28 | Linear_FP_int4 lm_head; 29 | std::string profile_name = "Int4LlamaForCausalLM"; 30 | float* logits_output; 31 | uint8_t* lm_head_weight; 32 | }; 33 | 
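The three *ForCausalLM headers share the same forward() contract: pass the token ids (plus any cached keys/values from the previous step) and receive logits together with the updated cache. The sketch below illustrates how that contract can be driven in a greedy decoding loop against the int4 LLaMA interface declared above. It is illustrative only — the repository's actual generation loop lives in Generate.h — and the weight path "models/LLaMA_7B", the helper name greedy_decode_sketch, and the literal eos id 2 are assumptions, not code from this tree.

#include <vector>
#include "Int4llamaForCausalLM.h"
#include "model.h"

// Illustrative sketch only: greedy decoding through the declared forward() interface.
std::vector<int> greedy_decode_sketch(std::vector<int> token_ids, int n_new_tokens) {
    Int4LlamaForCausalLM model("models/LLaMA_7B", get_opt_model_config(LLaMA_7B));  // assumed weight path
    std::vector<Matrix3D<float>> past_keys, past_values;

    for (int step = 0; step < n_new_tokens; step++) {
        // First pass feeds the whole prompt; later passes feed only the newest token
        // together with the key/value cache returned by the previous forward() call.
        int n_feed = past_keys.empty() ? (int)token_ids.size() : 1;
        Matrix3D<int> input_ids(token_ids.data() + token_ids.size() - n_feed, 1, 1, n_feed);
        Int4LlamaForCausalLM_input input =
            past_keys.empty() ? Int4LlamaForCausalLM_input(input_ids)
                              : Int4LlamaForCausalLM_input(input_ids, past_keys, past_values);

        Int4LlamaForCausalLM_output out = model.forward(input);
        past_keys = out.past_keys;
        past_values = out.past_values;

        // Greedy pick: argmax over the vocabulary at the last position of the logits.
        int last = out.logits.m_dim_y - 1, best = 0;
        for (int v = 1; v < out.logits.m_dim_z; v++)
            if (out.logits(0, last, v) > out.logits(0, last, best)) best = v;
        token_ids.push_back(best);
        if (best == 2) break;  // assumed eos id, cf. llama_token_eos() in LLaMATokenizer.h
    }
    return token_ids;
}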
-------------------------------------------------------------------------------- /transformer/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of implementation 4 | # 0: reference 5 | # 1: loop_unrolling 6 | # 2: multithreading 7 | # 3: simd_programming 8 | # 4: multithreading_loop_unrolling 9 | # 5: all_techniques 10 | keys=("reference" "loop_unrolling" "multithreading" "simd_programming" "multithreading_loop_unrolling" "all_techniques") 11 | values=("0" "1" "2" "3" "4" "5") 12 | 13 | # If a implementation is provided to the script, map it to the corresponding argument 14 | if [ "$#" -eq 1 ]; then 15 | found=0 16 | for i in "${!keys[@]}"; do 17 | if [ "${keys[$i]}" = "$1" ]; then 18 | test_args=("${values[$i]}") 19 | found=1 20 | break 21 | fi 22 | done 23 | if [ "$found" -eq 0 ]; then 24 | echo "Invalid implementation. Please provide a valid key from the mapping." 25 | exit 1 26 | fi 27 | else 28 | # If no argument is provided, use all values 29 | test_args=("${values[@]}") 30 | fi 31 | 32 | 33 | # Run the program with different arguments 34 | for arg in "${test_args[@]}"; do 35 | make clean 36 | make chat test_linear -j IMP="$arg" 37 | # Check if make was successful 38 | if [ $? -ne 0 ]; then 39 | echo "Compilation failed!" 40 | exit 1 41 | fi 42 | ./test_linear 43 | echo "" 44 | done 45 | 46 | echo "All tests completed!" 47 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4_imp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "Foundation/Foundation.hpp" 6 | #include "Metal/Metal.hpp" 7 | #include "include/opParams.h" 8 | 9 | typedef struct { 10 | float *A, *C, *scales, *offset; 11 | unsigned char *B; 12 | } MetalMatmulBuffers; 13 | 14 | class MetalMatmulInt4IMP { 15 | public: 16 | static MTL::Device *_mDevice; 17 | 18 | // The compute pipeline generated from the compute kernel in the .metal shader file. 19 | static MTL::ComputePipelineState *_mMatmulFunctionPSO; 20 | 21 | // The command queue used to pass commands to the device. 22 | static MTL::CommandQueue *_mCommandQueue; 23 | 24 | // Buffers to hold data. 
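// (Note, inferred from the declarations that follow: in contrast to the MetalMatmulInt4
// benchmark class, all state here is static; the device, pipeline state and command queue
// are presumably set up once in init(), guarded by has_init, and then reused by every
// run() call rather than rebuilt per matmul.)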
25 | static MTL::Buffer *_mBufferA; 26 | static MTL::Buffer *_mBufferB; 27 | static MTL::Buffer *_mBufferScales; 28 | static MTL::Buffer *_mBufferResult; 29 | static MTL::Buffer *_mParams; 30 | 31 | static std::unordered_map _mumap; 32 | 33 | static bool has_init; 34 | static void init(); 35 | static void run(MetalMatMulParams param, MetalMatmulBuffers *bufferParams); 36 | static void *allocateSharedMem(size_t size); 37 | 38 | static MetalMatMulParams *_mParamsPtr; 39 | static void sendComputeCommand(); 40 | static void encodeCommand(MTL::ComputeCommandEncoder *computeEncoder); 41 | static MTL::Buffer *getBufferfromPtr(void *ptr); 42 | }; 43 | -------------------------------------------------------------------------------- /kernels/matmul_imp.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "matmul.h" 10 | 11 | namespace matmul { 12 | 13 | void MatmulOperator::CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 14 | assert(A->column == B->row); 15 | assert(C->column == B->column); 16 | assert(C->row == A->row); 17 | } 18 | 19 | void MatmulOperator::CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C) { 20 | assert(B->row * B->column == A->column * C->column / 2); 21 | assert(C->row == A->row); 22 | } 23 | 24 | void MatmulOperator::mat_mul_transposed(const struct matmul_params *params) { 25 | int i, j, k; 26 | 27 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 28 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 29 | 30 | for (i = 0; i < C->row; i++) 31 | for (j = 0; j < C->column; j++) { 32 | float acc = 0; 33 | for (k = 0; k < A->column; k++) acc += data_A[i * A->column + k] * data_B[j * B->column + k]; 34 | data_C[i * C->column + j] = acc; 35 | } 36 | } 37 | 38 | float interval_to_ms(struct timeval *start, struct timeval *end) { 39 | float us_seconds = (end->tv_sec - start->tv_sec) * 1000000 + (end->tv_usec - start->tv_usec); 40 | return us_seconds / 1000; 41 | } 42 | 43 | } // namespace matmul 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", "pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | 
- id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /transformer/src/ops/softmax.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "operators.h" 3 | 4 | void softmax(const Matrix3D &input, Matrix3D &output, const int dim) { 5 | PROFILE_START("softmax"); 6 | int len = input.length(); 7 | 8 | if (dim == 2) { 9 | // Find the maximum value in the input array 10 | for (int i = 0; i < input.m_dim_x; i++) { 11 | for (int j = 0; j < input.m_dim_y; j++) { 12 | float max_value = input.m_data[0]; 13 | float sum = 0; 14 | // Find the maximum value in the input array 15 | for (int k = 0; k < input.m_dim_z; k++) { 16 | float value = input(i, j, k); 17 | if (value > max_value) { 18 | max_value = value; 19 | } 20 | } 21 | 22 | // Compute the softmax values 23 | for (int k = 0; k < input.m_dim_z; k++) { 24 | float value = input(i, j, k); 25 | sum += std::exp(value - max_value); 26 | } 27 | 28 | // Normalize the softmax values and store them in the output array 29 | for (int k = 0; k < input.m_dim_z; k++) { 30 | float value = input(i, j, k); 31 | output(i, j, k) = (std::exp(value - max_value) / sum); 32 | } 33 | } 34 | } 35 | } else { 36 | throw("Unsupported dimension for softmax"); 37 | } 38 | PROFILE_END("softmax"); 39 | } -------------------------------------------------------------------------------- /transformer/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: "code_generator/tflite/.*" 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.0.1 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: mixed-line-ending 8 | args: ["--fix=lf"] 9 | - id: end-of-file-fixer 10 | - id: check-merge-conflict 11 | - id: requirements-txt-fixer 12 | - id: fix-encoding-pragma 13 | args: ["--remove"] 14 | - id: debug-statements 15 | - id: check-toml 16 | - repo: https://github.com/executablebooks/mdformat 17 | rev: 0.7.10 18 | hooks: 19 | - id: mdformat 20 | - repo: https://github.com/psf/black 21 | rev: 22.3.0 22 | hooks: 23 | - id: black 24 | - repo: https://github.com/pycqa/isort 25 | rev: 5.12.0 26 | hooks: 27 | - id: isort 28 | args: ["--sp", "pyproject.toml"] 29 | - repo: https://github.com/pycqa/flake8 30 | rev: 4.0.1 31 | hooks: 32 | - id: flake8 33 | additional_dependencies: 34 | - flake8-comprehensions==3.7.0 35 | - flake8-docstrings==1.6.0 36 | - repo: local 37 | hooks: 38 | - id: pylint 39 | name: pylint 40 | entry: pylint 41 | language: system 42 | types: [python] 43 | require_serial: true 44 | - repo: https://github.com/pre-commit/mirrors-mypy 45 | rev: v0.910-1 46 | hooks: 47 | - id: mypy 48 | - repo: https://github.com/pre-commit/mirrors-clang-format 49 | rev: v13.0.0 50 | hooks: 51 | - id: clang-format 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | include = '\.pyi?$' 4 | extend-exclude = "codegen/.*" 5 | 6 | [tool.isort] 7 | profile = "black" 8 | known_first_party = ["code_generator"] 9 | extend_skip = ["codegen"] 10 | multi_line_output = 3 11 | include_trailing_comma = true 12 | force_grid_wrap = 0 13 | use_parentheses = true 14 | ensure_newline_before_comments = true 15 | line_length = 120 16 | 17 | 
[tool.pylint] 18 | [tool.pylint.master] 19 | ignore-paths = ["codegen"] 20 | [tool.pylint.messages_control] 21 | disable = [ 22 | "C0103", 23 | "C0114", 24 | "C0115", 25 | "C0116", 26 | "C0123", 27 | "C0209", 28 | "C0330", 29 | "C0301", 30 | "C0302", 31 | "C0411", 32 | "C0415", 33 | "E0401", 34 | "E1121", 35 | "E1123", 36 | "E1101", 37 | "R", 38 | "W" 39 | ] 40 | [tool.pylint.basic] 41 | good-names-rgxs = "^[_a-z][_a-z0-9]?$" # allow 1 or 2 character names 42 | [tool.pylint.format] 43 | max-line-length = 120 44 | max-module-lines = 5000 45 | [tool.pylint.design] 46 | max-args = 10 47 | max-attributes = 15 48 | max-parents = 10 49 | 50 | [tool.mypy] 51 | files = "." 52 | exclude ="codegen/.*" 53 | install_types = true 54 | non_interactive = true 55 | show_error_codes = true 56 | disable_error_code = [ 57 | "import", 58 | "assignment", 59 | "operator", 60 | "has-type", 61 | "var-annotated", 62 | "operator", 63 | "call-arg", 64 | ] 65 | explicit_package_bases = true 66 | namespace_packages = true 67 | -------------------------------------------------------------------------------- /kernels/ref/matmul_ref_int4.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 12 | int i, j, k; 13 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 14 | const int block_size = params->block_size; 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | assert(params->block_size == 32); // support block size 32 for now 18 | 19 | for (i = 0; i < C->row; i++) { 20 | for (j = 0; j < C->column; j++) { 21 | float acc = 0; 22 | for (k = 0; k < B->row; k += block_size) { 23 | float s = scale[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 24 | float o = offset[j * (B->row / 16) + k / 32]; 25 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 26 | float *x_ptr = &A->data_ptr[i * A->column + k]; 27 | for (int qi = 0; qi < block_size / 2; qi++) { 28 | uint8_t packed_int4 = weight_32_int4[qi]; 29 | float deq_0 = (float)(packed_int4 & 0x0F) * s + o; 30 | float deq_1 = (float)(packed_int4 >> 4) * s + o; 31 | acc += *x_ptr++ * deq_0; 32 | acc += *x_ptr++ * deq_1; 33 | } 34 | } 35 | C->data_ptr[i * C->column + j] = acc; 36 | } 37 | } 38 | }; 39 | 40 | } // namespace matmul 41 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/OPTForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "OPTForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | OPTForCausalLM::OPTForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Int8OPTDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct OPTForCausalLM_output OPTForCausalLM::forward(const struct OPTForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int8OPTDecoder_output decoder_output; 22 | 23 | if 
(input.has_past_keys_values) { 24 | struct Int8OPTDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 25 | decoder_output = this->decoder.forward(decoder_input); 26 | 27 | } else { 28 | struct Int8OPTDecoder_input decoder_input = {input.input_ids}; 29 | decoder_output = this->decoder.forward(decoder_input); 30 | } 31 | 32 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 33 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 34 | 35 | struct OPTForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 36 | PROFILE_END(profile_name); 37 | return LMoutput; 38 | } 39 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Fp32llamaDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Fp32llamaDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Fp32llamaDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Fp32llamaDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Fp32llamaDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Fp32llamaDecoder { 27 | public: 28 | Fp32llamaDecoder(std::string param_path, const struct model_config config); 29 | Fp32llamaDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | struct Fp32llamaDecoder_output forward(const struct Fp32llamaDecoder_input& input); 32 | Embedding embed_tokens; 33 | LlamaRMSNorm norm; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | std::string profile_name = "Fp32llamaDecoder"; 37 | 38 | private: 39 | float* attention_mask_buf; 40 | float* pos_embeds_buf; 41 | float* last_hidden_states_buf; 42 | float* hidden_states_buf; 43 | }; 44 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Int4llamaDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Int4llamaDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Int4llamaDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Int4llamaDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Int4llamaDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Int4llamaDecoder { 27 | public: 28 | Int4llamaDecoder(std::string param_path, const struct model_config config); 29 | Int4llamaDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | struct Int4llamaDecoder_output forward(const struct 
Int4llamaDecoder_input& input); 32 | Embedding embed_tokens; 33 | LlamaRMSNorm norm; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | std::string profile_name = "Int4llamaDecoder"; 37 | 38 | private: 39 | float* attention_mask_buf; 40 | float* pos_embeds_buf; 41 | float* last_hidden_states_buf; 42 | float* hidden_states_buf; 43 | }; 44 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Fp32LlamaForCausalLM::Fp32LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, config.embed_dim * config.vocsize * sizeof(float)); 11 | 12 | this->decoder = Fp32llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = 14 | Linear_FP(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim), param_path + "/lm_head.bin"); 15 | } 16 | 17 | struct Fp32LlamaForCausalLM_output Fp32LlamaForCausalLM::forward(const struct Fp32LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Fp32llamaDecoder_output decoder_output; 22 | 23 | // Call decoder 24 | if (input.has_past_keys_values) { 25 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | 28 | } else { 29 | struct Fp32llamaDecoder_input decoder_input = {input.input_ids}; 30 | decoder_output = this->decoder.forward(decoder_input); 31 | } 32 | 33 | // Get logits 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Fp32LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTDecoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Int8OPTDecoderLayer.h" 6 | #include "common.h" 7 | #include "operators.h" 8 | 9 | struct Int8OPTDecoder_output { 10 | Matrix3D last_hidden_state; 11 | std::vector> past_keys, past_values; 12 | }; 13 | struct Int8OPTDecoder_input { 14 | Matrix3D input_ids; 15 | std::vector> past_keys, past_values; 16 | bool has_past_keys_values; 17 | 18 | Int8OPTDecoder_input(Matrix3D input_ids_) : input_ids(input_ids_) { has_past_keys_values = false; } 19 | Int8OPTDecoder_input(Matrix3D input_ids_, std::vector> past_keys_, 20 | std::vector> past_values_) 21 | : input_ids(input_ids_), past_keys(past_keys_), past_values(past_values_) { 22 | has_past_keys_values = true; 23 | } 24 | }; 25 | 26 | class Int8OPTDecoder { 27 | public: 28 | Int8OPTDecoder(std::string param_path, const struct model_config config); 29 | Int8OPTDecoder(){}; 30 | Matrix3D prepare_decoder_attention_mask(int length, int past_length); 31 | Matrix3D get_position_embed(int sql_length, int past_length); 32 | struct Int8OPTDecoder_output forward(const struct Int8OPTDecoder_input& input); 33 | 
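    // Note the contrast with the LLaMA decoders above: OPT carries a second, learned
    // absolute-position embedding table (embed_positions, fetched per step via
    // get_position_embed), whereas the LLaMA path applies RotaryPosEmb inside attention.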
Embedding embed_tokens, embed_positions; 34 | int voc_size, embed_dim, padding_idx, hidden_dim, num_heads; 35 | std::vector layers; 36 | LayerNorm final_layer_norm; 37 | std::string profile_name = "Int8OPTDecoder"; 38 | 39 | private: 40 | float* attention_mask_buf; 41 | float* pos_embeds_buf; 42 | float* last_hidden_states_buf; 43 | float* hidden_states_buf; 44 | }; 45 | -------------------------------------------------------------------------------- /transformer/download_assets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # List of files to download, their corresponding MD5 checksums, and target local paths 4 | files_and_checksums=( 5 | "https://www.dropbox.com/s/8q5cupqw00twvoa/assets.zip 6014d43716e6516a4f7b7161088d3e74 assets.zip" 6 | ) 7 | 8 | OS=`uname` 9 | 10 | # Function to download a file if it doesn't exist or if its MD5 checksum is incorrect 11 | download_if_needed() { 12 | url="$1" 13 | expected_md5="$2" 14 | target_path="$3" 15 | 16 | # Ensure the target directory exists 17 | target_dir=$(dirname "$target_path") 18 | mkdir -p "$target_dir" 19 | 20 | # Download the file if it does not exist 21 | if [ ! -e "$target_path" ]; then 22 | echo "File '$target_path' does not exist. Downloading..." 23 | wget -q -O "$target_path" "$url" 24 | fi 25 | 26 | # Use md5 on MacOS 27 | if [ $OS = "Darwin" ] 28 | then 29 | actual_md5=$(md5 -q "$target_path") 30 | # Use md5sum on Ubuntu 31 | elif [ $OS = "Linux" ] 32 | then 33 | actual_md5=$(md5sum "$target_path" | cut -d ' ' -f1) 34 | fi 35 | 36 | if [ "$actual_md5" != "$expected_md5" ]; then 37 | echo "MD5 checksum for '$target_path' is incorrect. Downloading again..." 38 | wget -q -O "$target_path" "$url" 39 | else 40 | echo "File '$target_path' exists and its MD5 checksum is correct." 
41 | fi 42 | } 43 | 44 | # Process each file, its corresponding MD5 checksum, and target local path 45 | for file_and_checksum in "${files_and_checksums[@]}"; do 46 | url=$(echo "$file_and_checksum" | awk '{ print $1 }') 47 | expected_md5=$(echo "$file_and_checksum" | awk '{ print $2 }') 48 | target_path=$(echo "$file_and_checksum" | awk '{ print $3 }') 49 | 50 | download_if_needed "$url" "$expected_md5" "$target_path" 51 | unzip "$target_path" 52 | done 53 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaForCausalLM.h" 2 | 3 | #include 4 | 5 | #include "operators.h" 6 | #include "utils.h" 7 | 8 | Int4LlamaForCausalLM::Int4LlamaForCausalLM(std::string param_path, const struct model_config config) { 9 | allocate_aligned_memory(logits_output, config.max_sqlen * config.vocsize * sizeof(float)); 10 | allocate_aligned_memory(lm_head_weight, (config.embed_dim * config.vocsize * sizeof(uint8_t)) / 2); 11 | 12 | this->decoder = Int4llamaDecoder(param_path + "/decoder", config); 13 | this->lm_head = Linear_FP_int4(Matrix3D(lm_head_weight, 1, config.vocsize, config.embed_dim / 2), 14 | param_path + "/lm_head"); 15 | } 16 | 17 | struct Int4LlamaForCausalLM_output Int4LlamaForCausalLM::forward(const struct Int4LlamaForCausalLM_input &input) { 18 | PROFILE_START(profile_name); 19 | int sqlen = input.input_ids.m_dim_z; 20 | 21 | struct Int4llamaDecoder_output decoder_output; 22 | 23 | // Call decoder 24 | if (input.has_past_keys_values) { 25 | struct Int4llamaDecoder_input decoder_input = {input.input_ids, input.past_keys, input.past_values}; 26 | decoder_output = this->decoder.forward(decoder_input); 27 | 28 | } else { 29 | struct Int4llamaDecoder_input decoder_input = {input.input_ids}; 30 | decoder_output = this->decoder.forward(decoder_input); 31 | } 32 | 33 | // Get logits 34 | Matrix3D logits(logits_output, 1, sqlen, this->decoder.voc_size); 35 | this->lm_head.forward(decoder_output.last_hidden_state, logits); 36 | 37 | struct Int4LlamaForCausalLM_output LMoutput = {logits, decoder_output.past_keys, decoder_output.past_values}; 38 | PROFILE_END(profile_name); 39 | return LMoutput; 40 | } 41 | -------------------------------------------------------------------------------- /transformer/include/model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_H 2 | #define MODEL_H 3 | #include 4 | 5 | struct model_config { 6 | int batch; 7 | int num_heads; 8 | int num_layers; 9 | int max_sqlen; 10 | int embed_dim; 11 | int hidden_dim; 12 | int vocsize; 13 | int padding_idx; 14 | int qk; // group size 15 | 16 | model_config() : model_config(1, 12, 12, 512, 768, 3072, 50272, 1) {} 17 | model_config(int batch, int num_heads, int num_layers, int max_sqlen, int embed_dim, int hidden_dim, int vocsize, 18 | int padding_idx) 19 | : batch(batch), 20 | num_heads(num_heads), 21 | num_layers(num_layers), 22 | max_sqlen(max_sqlen), 23 | embed_dim(embed_dim), 24 | hidden_dim(hidden_dim), 25 | vocsize(vocsize), 26 | padding_idx(padding_idx) {} 27 | }; 28 | 29 | enum { OPT_125M, OPT_1_3B, OPT_6_7B, LLaMA_7B }; 30 | enum { FP32, INT8, INT4 }; 31 | 32 | const struct model_config opt_6_7B(1, 32, 32, 2048, 4096, 16384, 50272, 1); 33 | const struct model_config opt_1_3B(1, 32, 24, 2048, 2048, 8192, 50272, 1); 34 | const struct model_config opt_125m(1, 12, 12, 2048, 768, 3072, 50272, 1); 35 | 
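// Positional arguments follow the model_config constructor above:
// (batch, num_heads, num_layers, max_sqlen, embed_dim, hidden_dim, vocsize, padding_idx);
// e.g. llama_7B below: 32 heads, 32 layers, 2048 max sequence length, 4096 embedding dim,
// 11008 FFN hidden dim, 32000-token vocabulary.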
const struct model_config llama_7B(1, 32, 32, 2048, 4096, 11008, 32000, 1); 36 | static struct model_config get_opt_model_config(int choise) { 37 | struct model_config ret; 38 | switch (choise) { 39 | case OPT_125M: 40 | ret = opt_125m; 41 | break; 42 | case OPT_1_3B: 43 | ret = opt_1_3B; 44 | break; 45 | case OPT_6_7B: 46 | ret = opt_6_7B; 47 | break; 48 | case LLaMA_7B:; 49 | ret = llama_7B; 50 | break; 51 | default: 52 | throw("Unsupported model choise."); 53 | break; 54 | } 55 | return ret; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /transformer/include/LLaMATokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef LLaMA_TOKENIZER_H 2 | #define LLaMA_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | static int llama_token_bos() { return 1; } 14 | 15 | static int llama_token_eos() { return 2; } 16 | 17 | static int llama_token_nl() { return 13; } 18 | 19 | struct llama_vocab { 20 | struct token_score { 21 | std::string tok; 22 | float score; 23 | }; 24 | 25 | std::unordered_map token_to_id; 26 | std::vector id_to_token; 27 | }; 28 | 29 | /* 30 | * Tokenizer 31 | */ 32 | static size_t utf8_len(char src) { 33 | const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; 34 | uint8_t highbits = static_cast(src) >> 4; 35 | 36 | return lookup[highbits]; 37 | } 38 | 39 | struct llama_sp_symbol { 40 | using index = int; 41 | index prev; 42 | index next; 43 | const char* text; 44 | size_t n; 45 | }; 46 | 47 | struct llama_sp_bigram { 48 | struct comparator { 49 | bool operator()(llama_sp_bigram& l, llama_sp_bigram& r) { 50 | return (l.score < r.score) || (l.score == r.score && l.left > r.left); 51 | } 52 | }; 53 | using queue_storage = std::vector; 54 | using queue = std::priority_queue; 55 | llama_sp_symbol::index left; 56 | llama_sp_symbol::index right; 57 | float score; 58 | size_t size; 59 | }; 60 | 61 | llama_vocab llama_init_vocab(const char* vocab_file); 62 | 63 | const char* llama_id_to_token(const llama_vocab& vocab, int id); 64 | 65 | int llama_tokenize(const llama_vocab& vocab, const char* text, int* tokens, int n_max_tokens, bool add_bos); 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /transformer/include/OPTTokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef OPT_TOKENIZER_H 2 | #define OPT_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | // #include // Tricky to support this in windows 22 | #include 23 | 24 | // std::vector OPT_tokenize(const OPT_vocab & vocab, const std::string & text, bool add_bos); 25 | 26 | struct pair_hash { 27 | template 28 | std::size_t operator()(const std::pair &p) const { 29 | auto h1 = std::hash{}(p.first); 30 | auto h2 = std::hash{}(p.second); 31 | return h1 ^ h2; 32 | } 33 | }; 34 | 35 | class Encoder { 36 | public: 37 | Encoder(std::map encoder, std::vector> bpe_merges); 38 | std::unordered_map bytes_to_unicode(); 39 | std::set> get_pairs(std::vector word); 40 | std::string bpe(std::string token); 41 | std::vector encode(std::string text); 42 | std::string decode(std::vector tokens); 43 | 44 | private: 45 | std::map encoder; 46 | 
std::map decoder; 47 | std::unordered_map byte_encoder; 48 | std::unordered_map byte_decoder; 49 | std::unordered_map, int, pair_hash> bpe_ranks; 50 | std::unordered_map cache; 51 | }; 52 | 53 | Encoder get_encoder(std::string vocab_file, std::string bpe_file); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /transformer/upload.py: -------------------------------------------------------------------------------- 1 | """Uploading models and asset to the dropbox storage. 2 | 3 | Example commandline: 4 | python upload.py 5 | """ 6 | import argparse 7 | import os 8 | 9 | import dropbox 10 | 11 | files_to_upload = [ 12 | "assets.zip", 13 | "models.zip", 14 | ] 15 | 16 | 17 | def subebackups(file_path, target_path, token): 18 | """Upload a file to the dropbox storage.""" 19 | dbx = dropbox.Dropbox(token, timeout=36000) 20 | file_size = os.path.getsize(file_path) 21 | CHUNK_SIZE = 50 * 1024 * 1024 22 | dest_path = target_path 23 | 24 | with open(file_path, "rb") as f: 25 | if file_size <= CHUNK_SIZE: 26 | dbx.files_upload(f.read(), dest_path) 27 | 28 | else: 29 | upload_session_start_result = dbx.files_upload_session_start(f.read(CHUNK_SIZE)) 30 | cursor = dropbox.files.UploadSessionCursor( 31 | session_id=upload_session_start_result.session_id, offset=f.tell() 32 | ) 33 | commit = dropbox.files.CommitInfo(path=dest_path, mode=dropbox.files.WriteMode("overwrite")) 34 | 35 | while f.tell() < file_size: 36 | if (file_size - f.tell()) <= CHUNK_SIZE: 37 | print(dbx.files_upload_session_finish(f.read(CHUNK_SIZE), cursor, commit)) 38 | else: 39 | dbx.files_upload_session_append(f.read(CHUNK_SIZE), cursor.session_id, cursor.offset) 40 | cursor.offset = f.tell() 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser(description="Upload a file to Dropbox.") 45 | parser.add_argument("token", help="Your Dropbox OAuth2 token.") 46 | args = parser.parse_args() 47 | 48 | db_prefix = "/MIT/transformer_assets/" 49 | local_prefix = "uploads" 50 | 51 | for file in files_to_upload: 52 | subebackups(file, db_prefix + file, args.token) 53 | -------------------------------------------------------------------------------- /transformer/include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "profiler.h" 11 | 12 | #define STATS_START(x) Profiler::getInstance().start(x) 13 | #define STATS_FLOPS(x, y) Profiler::getInstance().start(x, y) 14 | #define STATS_END(x) Profiler::getInstance().stop(x) 15 | 16 | #ifdef PROFILER 17 | #define PROFILE_START(x) Profiler::getInstance().start(x) 18 | #define PROFILE_START_FLOPS(x, y) Profiler::getInstance().start(x, y) 19 | #define PROFILE_END(x) Profiler::getInstance().stop(x) 20 | #else 21 | #define PROFILE_START(x) 22 | #define PROFILE_START_FLOPS(x, y) 23 | #define PROFILE_END(x) 24 | #endif 25 | 26 | #define MAX_SQ_ERROR_MAX 5e-6 27 | #define ERROR_MAX 1e-9 28 | #define INT_ERROR_MAX 1e-5 29 | 30 | template 31 | void read_to_array(const char* path, T* array, int size); 32 | 33 | template 34 | bool check_two_equal(T* array, T* array2, int size); 35 | 36 | template <> 37 | bool check_two_equal(int8_t* array, int8_t* array2, int size); 38 | 39 | bool check_two_equal(int8_t* array, int8_t* array2, int size, float error); 40 | 41 | bool check_two_equal(float* array, float* array2, int size, float error); 42 | bool check_two_exact_equal(int8_t* array, 
int8_t* array2, int size); 43 | void print_MSE_max_diff(float* a, float* a2, int size); 44 | 45 | void print_first_k_elelment(std::string name, const int8_t* arr, int k, int start_idx = 0); 46 | void print_first_k_elelment(std::string name, const int32_t* arr, int k, int start_idx = 0); 47 | void print_first_k_elelment(std::string name, const float* arr, int k, int start_idx = 0); 48 | 49 | #ifdef QM_METAL 50 | template 51 | void allocate_aligned_memory(T*& ptr, size_t size); 52 | #else 53 | template 54 | void allocate_aligned_memory(T*& ptr, size_t size); 55 | #endif 56 | 57 | void deallocate_memory(void* ptr); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Fp32llamaAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Fp32llamaAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Fp32llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Fp32llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Fp32llamaAttention { 32 | public: 33 | Fp32llamaAttention(std::string param_path, const struct model_config config); 34 | Fp32llamaAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Fp32llamaAttention_output forward(const struct Fp32llamaAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape(Matrix3D unshape, Matrix3D shaped, int sqlen); 41 | int embed_dim, num_heads, head_dim; 42 | Linear_FP k_proj, v_proj, q_proj, o_proj; 43 | RotaryPosEmb rotary_pos_emb; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | std::string profile_name = "Fp32llamaAttention"; 46 | }; 47 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Int4llamaAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Int4llamaAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Int4llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Int4llamaAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 
| : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Int4llamaAttention { 32 | public: 33 | Int4llamaAttention(std::string param_path, const struct model_config config); 34 | Int4llamaAttention() {} 35 | static void initialized_memory(const struct model_config config); 36 | struct Int4llamaAttention_output forward(const struct Int4llamaAttention_input &input); 37 | 38 | private: 39 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 40 | void shape(Matrix3D unshape, Matrix3D shaped, int sqlen); 41 | int embed_dim, num_heads, head_dim; 42 | Linear_FP_int4 k_proj, v_proj, q_proj, o_proj; 43 | RotaryPosEmb rotary_pos_emb; 44 | BMM_F32T qk_bmm, pv_bmm; 45 | std::string profile_name = "Int4llamaAttention"; 46 | }; 47 | -------------------------------------------------------------------------------- /transformer/src/ops/RotaryPosEmb.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | #include 3 | 4 | float q_buf[4096], k_buf[4096]; 5 | // TODO: optimize this with multithreading 6 | void RotaryPosEmb::forward(Matrix3D &query, Matrix3D &key, 7 | int start_idx, int len) { 8 | PROFILE_START(profile_name); 9 | int num_heads = query.m_dim_x; 10 | int head_embed = cos.m_dim_z; 11 | int max_sqlen = cos.m_dim_y; 12 | 13 | assert(query.m_dim_z == cos.m_dim_z); 14 | assert(key.m_dim_z == cos.m_dim_z); 15 | assert(max_sqlen > len + start_idx); 16 | 17 | // cos, sin = self.rotary_emb(key_states, seq_len=kv_seq_len) 18 | // query_states, key_states = apply_rotary_pos_emb(query_states, key_states, 19 | // cos, sin, position_ids) cos = cos[position_ids].unsqueeze(1) # [bs, 1, 20 | // seq_len, dim] sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] 21 | // q_embed = (q * cos) + (rotate_half(q) * sin) 22 | // k_embed = (k * cos) + (rotate_half(k) * sin) 23 | // x1 = x[..., : x.shape[-1] // 2] 24 | // x2 = x[..., x.shape[-1] // 2 :] 25 | // rotate_half: torch.cat((-x2, x1), dim=-1) 26 | 27 | int half = head_embed / 2; 28 | for (int b = 0; b < num_heads; b++) { 29 | for (int i = 0; i < len; i++) { 30 | // first half 31 | for (int j = 0; j < half; j++) { 32 | q_buf[j] = -1 * query(b, i, j + half); 33 | k_buf[j] = -1 * key(b, i, j + half); 34 | } 35 | // second half 36 | for (int j = half; j < head_embed; j++) { 37 | q_buf[j] = query(b, i, j - half); 38 | k_buf[j] = key(b, i, j - half); 39 | } 40 | 41 | for (int j = 0; j < head_embed; j++) { 42 | query(b, i, j) = ((query(b, i, j) * cos(0, i + start_idx, j)) + 43 | (q_buf[j] * sin(0, i + start_idx, j))); 44 | key(b, i, j) = ((key(b, i, j) * cos(0, i + start_idx, j)) + 45 | (k_buf[j] * sin(0, i + start_idx, j))); 46 | } 47 | } 48 | } 49 | 50 | PROFILE_END(profile_name); 51 | } 52 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Fp32llamaDecoderLayer.h: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Fp32llamaDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Fp32llamaDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 
14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Fp32llamaDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Fp32llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Fp32llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Fp32llamaDecoderLayer { 40 | public: 41 | Fp32llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Fp32llamaDecoderLayer_output forward(const struct Fp32llamaDecoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LlamaRMSNorm input_layernorm, post_attention_layernorm; 46 | Linear_FP gate_proj, down_proj, up_proj; 47 | Fp32llamaAttention attn; 48 | std::string profile_name = "Fp32llamaDecoderLayer"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int4llamaDecoderLayer.h: -------------------------------------------------------------------------------- 1 | #include "Int4llamaAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Int4llamaDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Int4llamaDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Int4llamaDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Int4llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Int4llamaDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Int4llamaDecoderLayer { 40 | public: 41 | Int4llamaDecoderLayer(std::string param_path, const struct model_config config, int layer_idx); 42 | struct Int4llamaDecoderLayer_output forward(const struct Int4llamaDecoderLayer_input &input); 43 | 44 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 45 | LlamaRMSNorm input_layernorm, post_attention_layernorm; // from torch_int.nn 46 | Linear_FP_int4 gate_proj, down_proj, up_proj; 47 | Int4llamaAttention attn; 48 | std::string profile_name = "Int4llamaDecoderLayer"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/src/ops/LayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void 
load_LayerNorm(LayerNorm &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNorm::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 1e-5; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | float std_dev = sqrtl(squared_diff_sum / static_cast(x.m_dim_z) + eps); 39 | 40 | for (int k = 0; k < x.m_dim_z; k++) { 41 | float value = static_cast(x(i, j, k)); 42 | float fp_out = (((value - mean) / (std_dev)) * static_cast(weight(0, 0, k))) + 43 | static_cast(bias(0, 0, k)); 44 | output(i, j, k) = static_cast(fp_out); 45 | } 46 | } 47 | } 48 | PROFILE_END(profile_name); 49 | } 50 | -------------------------------------------------------------------------------- /transformer/src/ops/LayerNormQ.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "operators.h" 5 | #include "utils.h" 6 | 7 | void load_LayerNormQ(LayerNormQ &op, std::string prefix) { 8 | read_to_array((prefix + "/weight.bin").c_str(), op.params.weight.m_data, op.params.weight.length()); 9 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.m_data, op.params.bias.length()); 10 | } 11 | 12 | void LayerNormQ::forward(const Matrix3D &x, Matrix3D &output) { 13 | PROFILE_START(profile_name); 14 | Matrix3D weight = params.weight; 15 | Matrix3D bias = params.bias; 16 | const int last_dims = 2; 17 | const float eps = 0.00001; 18 | 19 | assert(last_dims == 2); // support the last dim for now 20 | assert(output.m_dim_x == x.m_dim_x); 21 | assert(output.m_dim_y == x.m_dim_y); 22 | assert(output.m_dim_z == x.m_dim_z); 23 | assert(x.m_dim_z == weight.m_dim_z); 24 | assert(x.m_dim_z == bias.m_dim_z); 25 | 26 | for (int i = 0; i < x.m_dim_x; i++) { // batches 27 | for (int j = 0; j < x.m_dim_y; j++) { // samples 28 | float mean = 0; 29 | for (int k = 0; k < x.m_dim_z; k++) { // hideden states 30 | mean += x(i, j, k); 31 | } 32 | mean /= static_cast(x.m_dim_z); 33 | float squared_diff_sum = 0; 34 | for (int k = 0; k < x.m_dim_z; k++) { 35 | float value = static_cast(x(i, j, k)); 36 | squared_diff_sum += (value - mean) * (value - mean); 37 | } 38 | 39 | float var = squared_diff_sum / static_cast(x.m_dim_z); 40 | float std_dev = sqrt(var + eps); 41 | 42 | for (int k = 0; k < x.m_dim_z; k++) { 43 | float value = static_cast(x(i, j, k)); 44 | float fp_out = ((value - mean) / (std_dev) * static_cast(weight(0, 0, k))) + 45 | static_cast(bias(0, 0, k)); 46 | output(i, j, k) = static_cast(std::round(fp_out)); 47 | } 48 | } 49 | } 50 | 51 | 
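// At this point every (batch, token) row has been normalized and quantized:
// out = round((x - mean) / sqrt(var + eps) * weight + bias), with mean and variance
// taken over the hidden dimension and the result cast to the quantized output type.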
PROFILE_END(profile_name); 52 | } 53 | -------------------------------------------------------------------------------- /transformer/tests/test_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | #include "utils_memalloc.h" 6 | 7 | void test_FPLinear_int4() { 8 | const int m = 1, n = 32000, k = 4096; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Matrix3D hidden_states(mem_buf.get_fpbuffer(m * k), 1, m, k); 13 | Matrix3D weight(mem_buf.get_fpbuffer(n * k), 1, n, k); 14 | Matrix3D outputGT(mem_buf.get_fpbuffer(m * n), 1, m, n); 15 | Matrix3D output(mem_buf.get_fpbuffer(m * n), 1, m, n); 16 | 17 | hidden_states.load("tests/assets/input.bin"); 18 | outputGT.load("tests/assets/output.bin"); 19 | 20 | // quantize the weight to int4 21 | Matrix3D int4_weight((uint8_t *)mem_buf.get_int8buffer(n * k / 2), 1, n, k / 2); 22 | // Linear_FP_int4 int4_op; 23 | Linear_FP_int4 int4_op = Linear_FP_int4(int4_weight, "INT4/models/LLaMA_7B_2_chat/lm_head/"); 24 | 25 | Matrix3D outputQ(mem_buf.get_fpbuffer(m * n), 1, m, n); 26 | Matrix3D outputQ_simd(mem_buf.get_fpbuffer(m * n), 1, m, n); 27 | Matrix3D outputQ_fast(mem_buf.get_fpbuffer(m * n), 1, m, n); 28 | 29 | // warm up 30 | for (int i = 0; i < 1; i++) { 31 | int4_op.forward(hidden_states, outputQ_fast); 32 | } 33 | 34 | const int flops = k * m * n * 2; 35 | int4_op.forward_ref(hidden_states, outputQ); 36 | 37 | for (int i = 0; i < 10; i++) { 38 | STATS_FLOPS(int4_op.profile_name, flops); 39 | int4_op.forward(hidden_states, outputQ_fast); 40 | STATS_END(int4_op.profile_name); 41 | } 42 | bool success = check_two_equal(outputQ.m_data, outputQ_fast.m_data, outputQ_fast.length(), 1e-3); 43 | 44 | if (!success) { 45 | std::cout << "-------- Sanity check of " << int4_op.profile_name << " implementation: Fail! -------- " 46 | << std::endl; 47 | exit(-1); 48 | } else 49 | std::cout << "-------- Sanity check of " << int4_op.profile_name << " implementation: Passed! 
-------- " 50 | << std::endl; 51 | } 52 | 53 | int main() { 54 | test_FPLinear_int4(); 55 | Profiler::getInstance().report_internal(); 56 | } 57 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTAttention.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.h" 4 | #include "operators.h" 5 | 6 | struct Int8OPTAttention_output { 7 | Matrix3D attn_output; 8 | Matrix3D attn_probs_reshaped; 9 | std::pair, Matrix3D> past_key_value; 10 | }; 11 | struct Int8OPTAttention_input { 12 | Matrix3D hidden_states; 13 | Matrix3D attention_mask; 14 | Matrix3D past_key, past_value; 15 | bool has_past_key_value = false; 16 | int layer_idx; 17 | 18 | Int8OPTAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, int layer_idx_) 19 | : hidden_states(hidden_states_), attention_mask(attention_mask_), layer_idx(layer_idx_) {} 20 | 21 | Int8OPTAttention_input(Matrix3D hidden_states_, Matrix3D attention_mask_, Matrix3D past_key_, 22 | Matrix3D past_value_, bool has_past_key_value_, int layer_idx_) 23 | : hidden_states(hidden_states_), 24 | attention_mask(attention_mask_), 25 | past_key(past_key_), 26 | past_value(past_value_), 27 | has_past_key_value(has_past_key_value_), 28 | layer_idx(layer_idx_) {} 29 | }; 30 | 31 | class Int8OPTAttention { 32 | public: 33 | Int8OPTAttention(std::string param_path, const struct model_config config, BMM_S8T_S8N_F32T &qk_bmm, 34 | BMM_S8T_S8N_S8T &pv_bmm, W8A8B8O8Linear &k_proj, W8A8B8O8Linear &v_proj, W8A8B8O8Linear &q_proj, 35 | W8A8BFP32OFP32Linear &out_proj); 36 | Int8OPTAttention() {} 37 | static void initialized_memory(const struct model_config config); 38 | struct Int8OPTAttention_output forward(const struct Int8OPTAttention_input &input); 39 | 40 | private: 41 | void unshape(Matrix3D shaped, Matrix3D unshape, int sqlen); 42 | void shpae(Matrix3D unshape, Matrix3D shaped, int sqlen); 43 | int embed_dim, num_heads, head_dim; 44 | BMM_S8T_S8N_F32T qk_bmm; 45 | BMM_S8T_S8N_S8T pv_bmm; 46 | W8A8B8O8Linear k_proj, v_proj, q_proj; 47 | W8A8BFP32OFP32Linear out_proj; 48 | std::string profile_name = "Int8OPTAttention"; 49 | }; 50 | -------------------------------------------------------------------------------- /transformer/tests/test_OPTTokenizer.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "OPTTokenizer.h" 4 | 5 | void test_OPTEncode() { 6 | std::string bpe_file = "models/opt_merges.txt"; 7 | std::string vocab_file = "models/opt_vocab.json"; 8 | 9 | Encoder encoder = get_encoder(vocab_file, bpe_file); 10 | std::vector encoded = encoder.encode( 11 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 12 | "don't have basic concepts."); 13 | std::vector encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 14 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 15 | bool is_equal = true; 16 | for (int i = 0; i < encoded.size(); i++) { 17 | if (encoded[i] != encoded_answer[i]) { 18 | is_equal = false; 19 | break; 20 | } 21 | } 22 | if (!is_equal) 23 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 24 | else 25 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 26 | } 27 | 28 | void test_OPTDecode() { 29 | std::string bpe_file = "models/opt_merges.txt"; 30 | std::string vocab_file = "models/opt_vocab.json"; 31 | ; 32 | 33 | Encoder encoder = get_encoder(vocab_file, bpe_file); 34 | std::vector encoded_answer = {37500, 10, 998, 64, 28, 626, 11, 158, 2007, 2402, 4, 152, 1579, 16, 35 | 13, 937, 82, 6, 98, 52, 6876, 51, 218, 75, 33, 3280, 14198, 4}; 36 | std::string decoded = encoder.decode(encoded_answer); 37 | std::string decoded_answer = 38 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume they " 39 | "don't have basic concepts."; 40 | bool is_equal = true; 41 | if (decoded != decoded_answer) is_equal = false; 42 | if (!is_equal) 43 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 44 | else 45 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 46 | } 47 | 48 | int main() { 49 | test_OPTEncode(); 50 | test_OPTDecode(); 51 | }; 52 | -------------------------------------------------------------------------------- /transformer/quantize_and_upload.py: -------------------------------------------------------------------------------- 1 | """A script to quantize supported models and updload to model zoo. 2 | 3 | Example usage: 4 | python quantize_and_upload.py --method --token 5 | 6 | Note: This script is for developers. 7 | """ 8 | import argparse 9 | import hashlib 10 | import os 11 | 12 | from upload import subebackups 13 | 14 | model_paths = ["models/LLaMA_7B", "models/LLaMA_7B_2_chat", "models/LLaMA_7B_AWQ"] 15 | 16 | quantized_dir = "INT4" 17 | db_prefix = "/MIT/transformer_assets/" 18 | 19 | 20 | def _get_md5sum(file_path): 21 | hash_md5 = hashlib.md5() 22 | with open(file_path, "rb") as f: 23 | for chunk in iter(lambda: f.read(4096), b""): 24 | hash_md5.update(chunk) 25 | return hash_md5.hexdigest() 26 | 27 | 28 | def main(): 29 | """Take arguments and quantize all models and upload to dropbox.""" 30 | 31 | def _get_parser(): 32 | parser = argparse.ArgumentParser(description="Quantize model") 33 | parser.add_argument("--method", type=str, help="Quantization method") 34 | parser.add_argument("--token", help="Your Dropbox OAuth2 token.") 35 | return parser 36 | 37 | parser = _get_parser() 38 | args = parser.parse_args() 39 | 40 | if args.method not in ["QM_x86", "QM_ARM"]: 41 | raise ValueError("expect method to be one of ['QM_x86', 'QM_ARM']") 42 | QM_method = args.method 43 | 44 | for model_path in model_paths: 45 | # quantize 46 | quantize_cmd = ( 47 | f"python model_quantizer.py --model_path {model_path} --method {QM_method} --output_path {quantized_dir}" 48 | ) 49 | os.system(quantize_cmd) 50 | # zip 51 | print("zipping...") 52 | model_name_size = model_path.rsplit("/", maxsplit=1)[-1] 53 | zip_path = model_name_size + ".zip" 54 | zip_cmd = f"zip -qq -r {zip_path} {os.path.join(quantized_dir, model_path)}" 55 | os.system(zip_cmd) 56 | # md5sum 57 | print(f"md5sum is {_get_md5sum(zip_path)}.") 58 | print("uploading...") 59 | # upload 60 | upload_path = os.path.join(db_prefix, QM_method, zip_path) 61 | subebackups(zip_path, upload_path, args.token) 62 | print("removing temporary zip file...") 63 | # rm zip 64 | os.system(f"rm {zip_path}") 65 | 66 | 67 | if __name__ == "__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /transformer/include/nn_modules/Int8OPTDecoderLayer.h: 
-------------------------------------------------------------------------------- 1 | #include "Int8OPTAttention.h" 2 | #include "common.h" 3 | #include "operators.h" 4 | 5 | struct Int8OPTDecoderLayer_output { 6 | Matrix3D hidden_states; 7 | Matrix3D attentions; 8 | std::pair, Matrix3D> past_key_value; 9 | 10 | Int8OPTDecoderLayer_output(Matrix3D hidden_states_, Matrix3D attentions_, 11 | std::pair, Matrix3D> past_key_value_) { 12 | hidden_states = hidden_states_; 13 | attentions = attentions_; 14 | past_key_value = past_key_value_; 15 | }; 16 | }; 17 | struct Int8OPTDecoderLayer_input { 18 | Matrix3D hidden_states; 19 | Matrix3D attention_mask; 20 | Matrix3D past_key, past_value; 21 | bool has_past_key_value = false; 22 | 23 | Int8OPTDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_) { 24 | hidden_states = hidden_states_; 25 | attention_mask = attention_mask_; 26 | has_past_key_value = false; 27 | } 28 | 29 | Int8OPTDecoderLayer_input(Matrix3D &hidden_states_, Matrix3D &attention_mask_, 30 | Matrix3D past_key_, Matrix3D past_value_) { 31 | hidden_states = hidden_states_; 32 | attention_mask = attention_mask_; 33 | past_key = past_key_; 34 | past_value = past_value_; 35 | has_past_key_value = true; 36 | } 37 | }; 38 | 39 | class Int8OPTDecoderLayer { 40 | public: 41 | Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx, 42 | LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, W8A8B8O8LinearReLU fc1, 43 | W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, BMM_S8T_S8N_S8T pv_bmm, 44 | W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, W8A8B8O8Linear q_proj, 45 | W8A8BFP32OFP32Linear out_proj); 46 | struct Int8OPTDecoderLayer_output forward(const struct Int8OPTDecoderLayer_input &input); 47 | 48 | int embed_dim, num_attention_heads, hidden_dim, layer_idx; 49 | LayerNormQ self_attn_layer_norm, final_layer_norm; // from torch_int.nn 50 | W8A8B8O8LinearReLU fc1; 51 | W8A8BFP32OFP32Linear fc2; 52 | Int8OPTAttention attn; 53 | std::string profile_name = "Int8OPTDecoderLayer"; 54 | }; 55 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_S8T_S8N_S8T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_S8T(BMM_S8T_S8N_S8T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_S8T::BMM_S8T_S8N_S8T(struct BMM_S8T_S8N_S8T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_S8T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | 
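// Note: the quantization zero-points in this block all stay at 0 (symmetric int8),
// so the only runtime scaling the kernel has to apply is the single effective factor
// alpha = s_A * s_B / s_C that was folded into params.A.qparams.scale above.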
params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.int8_data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | 46 | // process each batch 47 | if (m == 1 && x.m_dim_x > 1) { 48 | // merge each batch 49 | params.A.row = x.m_dim_x; 50 | params.C.row = x.m_dim_x; 51 | // B is batched, need a new op for this! 52 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(¶ms); 53 | } else { 54 | for (int bz = 0; bz < x.m_dim_x; bz++) { 55 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(¶ms); 56 | params.A.int8_data_ptr += m * k; 57 | params.B.int8_data_ptr += k * n; 58 | params.C.int8_data_ptr += m * n; 59 | } 60 | } 61 | 62 | PROFILE_END(profile_name); 63 | } 64 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_S8T_S8N_F32T.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_BMM_S8T_S8N_F32T(BMM_S8T_S8N_F32T &op, std::string prefix) { 7 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 8 | } 9 | 10 | BMM_S8T_S8N_F32T::BMM_S8T_S8N_F32T(struct BMM_S8T_S8N_F32T_params &op_params) { alpha = op_params.alpha; } 11 | 12 | void BMM_S8T_S8N_F32T::forward(const Matrix3D &x, const Matrix3D &weight, Matrix3D &output) { 13 | const int m = x.m_dim_y, k = x.m_dim_z, n = weight.m_dim_y, b = x.m_dim_x; 14 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 15 | PROFILE_START_FLOPS(profile_name, ops); 16 | assert(output.m_dim_x == x.m_dim_x); 17 | assert(output.m_dim_y == x.m_dim_y); 18 | assert(output.m_dim_z == weight.m_dim_y); 19 | assert(x.m_dim_z == weight.m_dim_z); 20 | 21 | struct matmul_params params; 22 | 23 | params.A.row = m; 24 | params.A.column = k; 25 | params.A.int8_data_ptr = x.m_data; 26 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 27 | params.B.qparams.scale = 1.0; 28 | params.C.qparams.scale = 1.0; 29 | params.A.qparams.zero_point = 0; 30 | params.B.row = k; 31 | params.B.column = n; 32 | params.B.int8_data_ptr = weight.m_data; 33 | params.B.qparams.zero_point = 0; 34 | params.C.row = m; 35 | params.C.column = n; 36 | params.C.data_ptr = output.m_data; 37 | params.C.qparams.zero_point = 0; 38 | params.opt_params.blk_size = BLK_SIZE; 39 | params.opt_params.num_thread = NUM_THREAD; 40 | params.C.qparams.q_max = 127; 41 | params.C.qparams.q_min = -128; 42 | params.alpha = alpha; 43 | 44 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 45 | if (m == 1 && x.m_dim_x > 1) { 46 | // merge each batch 47 | params.A.row = x.m_dim_x; 48 | params.C.row = x.m_dim_x; 49 | // B is batched, need a new op for this! 
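// In the generation phase each batch contributes a single activation row (m == 1),
// so the rows are packed into one (b x k) activation matrix; because the weight B
// differs per batch, the dedicated *_batch kernel below is used instead of the
// per-batch pointer-advancing loop in the else branch.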
50 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(¶ms); 51 | } else { 52 | // process each batch 53 | for (int bz = 0; bz < x.m_dim_x; bz++) { 54 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(¶ms); 55 | params.A.int8_data_ptr += m * k; 56 | params.B.int8_data_ptr += k * n; 57 | params.C.data_ptr += m * n; 58 | } 59 | } 60 | 61 | PROFILE_END(profile_name); 62 | } 63 | -------------------------------------------------------------------------------- /transformer/include/profiler.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | class Profiler { 7 | public: 8 | static Profiler& getInstance() { 9 | static Profiler instance; 10 | return instance; 11 | } 12 | 13 | void start(const std::string& section) { start_times[section] = std::chrono::high_resolution_clock::now(); } 14 | 15 | void start(const std::string& section, const long long section_flops) { 16 | start_times[section] = std::chrono::high_resolution_clock::now(); 17 | if (flops.count(section) == 0) 18 | flops[section] = section_flops; 19 | else 20 | flops[section] += section_flops; 21 | } 22 | 23 | void reset() { 24 | start_times.clear(); 25 | durations.clear(); 26 | counts.clear(); 27 | flops.clear(); 28 | } 29 | 30 | void stop(const std::string& section) { 31 | auto end_time = std::chrono::high_resolution_clock::now(); 32 | auto duration = std::chrono::duration_cast(end_time - start_times[section]).count(); 33 | durations[section] += duration; 34 | counts[section]++; 35 | } 36 | 37 | void report_internal() const { 38 | std::cout << "Section, Total time(ms), Average time(ms), Count, GOPs" << std::endl; 39 | for (const auto& entry : durations) { 40 | std::string row; 41 | row += entry.first + ", "; 42 | row += std::to_string((float)(entry.second) / 1000) + ", "; 43 | row += std::to_string((float)(entry.second / counts.at(entry.first)) / 1000) + ", "; 44 | if (flops.count(entry.first) == 0) 45 | row += std::to_string(counts.at(entry.first)) + ", N/A"; 46 | else { 47 | row += std::to_string(counts.at(entry.first)) + ", "; 48 | // ops and microsecond 49 | row += std::to_string((((float)flops.at(entry.first)) / (float)(entry.second)) / 1000.0); 50 | } 51 | std::cout << row << std::endl; 52 | } 53 | } 54 | 55 | void report() const { 56 | #ifdef PROFILER 57 | report_internal(); 58 | #endif 59 | } 60 | 61 | private: 62 | Profiler() {} 63 | Profiler(const Profiler&) = delete; 64 | Profiler& operator=(const Profiler&) = delete; 65 | 66 | std::map start_times; 67 | std::map flops; 68 | std::map durations; 69 | std::map counts; 70 | }; 71 | -------------------------------------------------------------------------------- /transformer/Makefile: -------------------------------------------------------------------------------- 1 | # Compiler and flags 2 | CXX = g++ 3 | CXXFLAGS = -std=c++11 -pthread -g -O0 -w 4 | CXXFLAGS += -DIMP=$(IMP) 5 | 6 | # Executable and source files 7 | TEST_TARGET = test_linear 8 | PROFILE_TARGET = 9 | APP_TARGET = chat 10 | TARGET = $(TEST_TARGET) $(PROFILE_TARGET) $(APP_TARGET) 11 | 12 | BUILDDIR := build/transformer 13 | PROFILEDIR := build_profile/transformer 14 | LIB_DIR = ../kernels 15 | LIB_SRC = $(wildcard $(LIB_DIR)/*.cc) 16 | INCLUDE_DIRS = -I$(LIB_DIR) -I./include -I./include/nn_modules -I./json/single_include/ 17 | LIB = 18 | 19 | ifeq ($(shell uname -m),x86_64) 20 | # For Intel machines with AVX 21 | CXXFLAGS += -mavx2 -mfma -ffast-math -fpermissive -DQM_x86 22 | LIB_SRC += 
$(wildcard $(LIB_DIR)/avx/*.cc) 23 | else ifeq ($(shell uname -p),arm) 24 | CXX = /opt/homebrew/opt/llvm/bin/clang++ 25 | LIB += -L/opt/homebrew/opt/boost/lib 26 | CXXFLAGS += -march=native -DQM_ARM -fPIC -march=armv8.2-a+dotprod 27 | INCLUDE_DIRS += -I/opt/homebrew/opt/boost/include 28 | LIB_SRC += $(wildcard $(LIB_DIR)/neon/*.cc) 29 | else 30 | @echo "Device unsupported!. 31 | LIB_REF_SRC = $(wildcard $(LIB_DIR)/ref/*.cc) 32 | LIB_SRC += $(LIB_REF_SRC) 33 | endif 34 | LIB_REF_SRC = $(wildcard $(LIB_DIR)/starter_code/*.cc) 35 | LIB_SRC += $(LIB_REF_SRC) 36 | 37 | SRC_DIR = src 38 | SRC = $(wildcard src/*.cc) 39 | SRC += $(wildcard src/nn_modules/*.cc) 40 | OPS = $(wildcard src/ops/*.cc) 41 | SRC += $(OPS) 42 | SRC += $(LIB_SRC) 43 | 44 | # Default target 45 | all: $(TARGET) 46 | 47 | # Phony targets 48 | .PHONY: all clean 49 | 50 | # Metal lib 51 | library.air: $(LIB_DIR)/metal/kernel/op.metal 52 | xcrun -sdk macosx metal -ffast-math -fno-fast-math $(LIB_ACC_INC) -c $< -o library.air 53 | default.metallib: library.air 54 | xcrun -sdk macosx metallib library.air -o default.metallib 55 | 56 | OBJS = $(addprefix $(BUILDDIR)/,$(SRC:.cc=.o)) 57 | PROFILE_OBJS = $(addprefix $(PROFILEDIR)/,$(SRC:.cc=.o)) 58 | 59 | # Pattern rules 60 | $(BUILDDIR)/%.o: %.cc 61 | @mkdir -p $(dir $@) 62 | @$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -c $< -o $@ 63 | 64 | $(PROFILEDIR)/%.o: %.cc 65 | @mkdir -p $(dir $@) 66 | @$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -DPROFILER -c $< -o $@ 67 | 68 | # Linking 69 | # Rule for TEST_TARGET 70 | $(TEST_TARGET): %: tests/%.cc $(OBJS) 71 | @ $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS) 72 | 73 | # Rule for APP_TARGET 74 | $(APP_TARGET): %: application/%.cc $(OBJS) 75 | @ $(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -o $@ $^ $(LIB) $(LDFLAGS) 76 | 77 | # Clean up 78 | clean: 79 | @ rm -f $(TARGET) 80 | @ rm -rf *.dSYM 81 | @ rm -rf build/kernels 82 | @ rm -rf $(BUILDDIR) 83 | @ rm -rf $(PROFILEDIR) 84 | -------------------------------------------------------------------------------- /transformer/include/ops/linear.h: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "utils.h" 3 | class Linear_FP { 4 | public: 5 | Linear_FP(Matrix3D weight_, std::string weight_path) : weight(weight_) { 6 | read_to_array((weight_path).c_str(), this->weight.m_data, this->weight.length()); 7 | }; 8 | Linear_FP(){}; 9 | void forward(const Matrix3D &x, Matrix3D &output); 10 | Matrix3D weight; 11 | 12 | private: 13 | std::string profile_name = "Linear_FP"; 14 | }; 15 | 16 | class Linear_FP_int4 { 17 | public: 18 | Linear_FP_int4(Matrix3D weight_, std::string weight_path) : weight(weight_) { 19 | float *scale_ptr, *zero_point_ptr; 20 | float *offset_ptr; 21 | // length of int8_t weight = elements / 2 22 | // length of scales/offset = elements / QK = weight / (QK/2) 23 | // length of zero_point = 1 24 | assert((weight.m_dim_z * 2) % (QK) == 0); 25 | allocate_aligned_memory(scale_ptr, (this->weight.length() * 2 * sizeof(float)) / QK); 26 | allocate_aligned_memory(offset_ptr, (this->weight.length() * 2 * sizeof(float)) / QK); 27 | allocate_aligned_memory(zero_point_ptr, 1 * sizeof(float)); 28 | 29 | int x = this->weight.m_dim_x, y = this->weight.m_dim_y, z = (this->weight.m_dim_z * 2) / QK; 30 | scale = Matrix3D(scale_ptr, x, y, z); 31 | offset = Matrix3D(offset_ptr, x, y, z); 32 | zero_point = Matrix3D(zero_point_ptr, 1, 1, 1); 33 | weight.load((weight_path + "/weight_int4.bin").c_str()); 34 | offset.load((weight_path + 
"/offset_int4.bin").c_str()); 35 | scale.load((weight_path + "/scaling_factor_int4.bin").c_str()); 36 | zero_point.load((weight_path + "/zero_point_int4.bin").c_str()); 37 | }; 38 | Linear_FP_int4(){}; 39 | void forward(const Matrix3D &x, Matrix3D &output); 40 | void forward_ref(const Matrix3D &x, Matrix3D &output); 41 | void forward_fast(const Matrix3D &x, Matrix3D &output); 42 | static void initialize_memory(const int block_size); 43 | Matrix3D weight; 44 | Matrix3D scale, zero_point; 45 | Matrix3D offset; 46 | 47 | #if IMP == 0 48 | std::string profile_name = "reference"; 49 | #elif IMP == 1 50 | std::string profile_name = "loop_unrolling"; 51 | #elif IMP == 2 52 | std::string profile_name = "multithreading"; 53 | #elif IMP == 3 54 | std::string profile_name = "simd_programming"; 55 | #elif IMP == 4 56 | std::string profile_name = "multithreading_loop_unrolling"; 57 | #elif IMP == 5 58 | std::string profile_name = "all_techniques"; 59 | #else 60 | std::string profile_name = "Unkown"; 61 | #endif 62 | }; 63 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Fp32llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | void test_Fp32LlamaForCausalLM() { 10 | struct model_config config = get_opt_model_config(LLaMA_7B); 11 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 12 | hidden_dim = config.hidden_dim; 13 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 14 | MemoryAllocator mem_buf; 15 | 16 | // reasoning phase: 1st run 17 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 18 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 19 | struct Fp32LlamaForCausalLM_input input_1st = {input_ids}; 20 | 21 | Fp32LlamaForCausalLM model = Fp32LlamaForCausalLM("models/LLaMA_7B", config); 22 | 23 | struct Fp32LlamaForCausalLM_output output_1st = model.forward(input_1st); 24 | 25 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 26 | logits.load("assets/llama/tests/model/1st_logits.bin"); 27 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 28 | // print_first_k_elelment("G", logits.m_data, 20); 29 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | Profiler::getInstance().report(); 34 | Profiler::getInstance().reset(); 35 | 36 | // generating phase: 2nd run 37 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 38 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 39 | struct Fp32LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 40 | 41 | struct Fp32LlamaForCausalLM_output output_2nd = model.forward(input_2nd); 42 | 43 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 44 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 45 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 46 | // print_first_k_elelment("G", logits.m_data, 20); 47 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 48 | 49 | Profiler::getInstance().report(); 50 | if (!success) 51 | 
std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 52 | else 53 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 54 | } 55 | 56 | int main() { test_Fp32LlamaForCausalLM(); } 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tutorial for TinyChat: Optimizing LLM on Edge Devices 2 | 3 | This is a lab for [efficientml.ai course](https://efficientml.ai/). 4 | 5 | Running large language models (LLMs) on the edge is of great importance. By embedding LLMs directly into real-world systems such as in-car entertainment systems or spaceship control interfaces, users can access instant responses and services without relying on a stable internet connection. Moreover, this approach alleviates the inconvenience of queuing delays often associated with cloud services. As such, running LLMs on the edge not only enhances user experience but also addresses privacy concerns, as sensitive data remains localized and reduces the risk of potential breaches. 6 | 7 | However, despite their impressive capabilities, LLMs have traditionally been quite resource-intensive. They require considerable computational power and memory resources, which makes it challenging to run these models on edge devices with limited capabilities. 8 | 9 | In this lab, you will learn the following: 10 | * How to deploy an LLaMA2-7B-chat with TinyChatEngine on your computer. 11 | * Implement different optimization techniques (loop unrolling, multithreading, and SIMD programming) for the linear kernel. 12 | * Observe the end-to-end latency improvement achieved by each technique. 13 | 14 | 15 | ## TinyChatEngine 16 | 17 | This tutorial is based on [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine), a powerful neural network library specifically designed for the efficient deployment of quantized large language models (LLMs) on edge devices. 18 | 19 | ![demo](assets/figures/chat.gif) 20 | 21 | ## Tutorial document 22 | 23 | Please check this document and follow the instructions which will walk you through the tutorial: https://docs.google.com/document/d/13IaTfPKjp0KiSBEhPdX9IxgXMIAZfiFjor37OWQJhMM/edit?usp=sharing 24 | 25 | ## Submission 26 | 27 | * Report: Please write a report ([form](https://docs.google.com/document/d/17Z_ab8EhDvjcigLXdDqMqd2LTVsZ4CnpOYNkRTrnTmU/edit?usp=sharing)) that includes your code and the performance improvement for each starter code. 28 | * Code: Use `git diff` to generate a patch for your implementation. We will use this patch to test the correctness of your code. Please name your patch as `{studentID}-{ISA}.patch` where {ISA} should be one of x86 and ARM, depending on your computer. 29 | 30 | ## Related Projects 31 | 32 | [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine). 33 | 34 | [TinyEngine](https://github.com/mit-han-lab/tinyengine). 35 | 36 | [Smoothquant](https://github.com/mit-han-lab/smoothquant). 
37 | 38 | [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://github.com/mit-han-lab/llm-awq) 39 | 40 | ## Acknowledgement 41 | 42 | [llama.cpp](https://github.com/ggerganov/llama.cpp) 43 | 44 | [transformers](https://github.com/huggingface/transformers) 45 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaForCausalLM.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Int4llamaForCausalLM.h" 5 | #include "operators.h" 6 | #include "utils.h" 7 | #include "utils_memalloc.h" 8 | 9 | void test_Int4LlamaForCausalLM() { 10 | struct model_config config = get_opt_model_config(LLaMA_7B); 11 | const int num_heads = config.num_heads, embed_dim = config.embed_dim, sqlen = 9, b = 1, 12 | hidden_dim = config.hidden_dim; 13 | const int voc_size = config.vocsize, padding_idx = 1, num_layers = config.num_layers; 14 | MemoryAllocator mem_buf; 15 | 16 | // reasoning phase: 1st run 17 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 18 | input_ids.load("assets/llama/tests/model/1st_input_ids.bin"); 19 | struct Int4LlamaForCausalLM_input input_1st = {input_ids}; 20 | 21 | Int4LlamaForCausalLM model = Int4LlamaForCausalLM("models/LLaMA_7B", config); 22 | 23 | struct Int4LlamaForCausalLM_output output_1st = model.forward(input_1st); 24 | 25 | Matrix3D logits(mem_buf.get_fpbuffer(b * sqlen * voc_size), b, sqlen, voc_size); 26 | logits.load("assets/llama/tests/model/1st_logits.bin"); 27 | // print_first_k_elelment("O", output_1st.logits.m_data, 20); 28 | // print_first_k_elelment("G", logits.m_data, 20); 29 | bool success = check_two_equal(output_1st.logits.m_data, logits.m_data, logits.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | Profiler::getInstance().report(); 34 | Profiler::getInstance().reset(); 35 | 36 | // generating phase: 2nd run 37 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 38 | input_ids_2nd.load("assets/llama/tests/model/2nd_input_ids.bin"); 39 | struct Int4LlamaForCausalLM_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 40 | 41 | struct Int4LlamaForCausalLM_output output_2nd; 42 | for (int i = 0; i < 10; i++) output_2nd = model.forward(input_2nd); 43 | 44 | logits = Matrix3D(mem_buf.get_fpbuffer(b * 1 * voc_size), b, 1, voc_size); 45 | logits.load("assets/llama/tests/model/2nd_logits.bin"); 46 | // print_first_k_elelment("O", output_2nd.logits.m_data, 20); 47 | // print_first_k_elelment("G", logits.m_data, 20); 48 | success &= check_two_equal(output_2nd.logits.m_data, logits.m_data, logits.length(), 1e-8); 49 | 50 | Profiler::getInstance().report(); 51 | if (!success) 52 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 53 | else 54 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 55 | } 56 | 57 | int main() { 58 | // This tests are directly from fp32 and are not completed yet! 
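// (The golden logits loaded above come from the fp32 model under
// assets/llama/tests/model/, so the int4 path may not match them to the 1e-8
// tolerance used in check_two_equal — hence the "not completed yet" note above.)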
59 | test_Int4LlamaForCausalLM(); 60 | } 61 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8BFP32OFP32Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8BFP32OFP32Linear_params(W8A8BFP32OFP32Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias.bin").c_str(), op.params.bias.data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | } 11 | 12 | W8A8BFP32OFP32Linear::W8A8BFP32OFP32Linear(struct W8A8BFP32OFP32Linear_params &op_params) { 13 | Matrix3D weight = op_params.weight; 14 | Matrix3D bias = op_params.bias; 15 | alpha = op_params.alpha; 16 | 17 | int k = weight.m_dim_z, n = weight.m_dim_y; 18 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 19 | params.B.qparams.scale = 1.0; 20 | params.C.qparams.scale = 1.0; 21 | params.A.qparams.zero_point = 0; 22 | params.B.row = k; 23 | params.B.column = n; 24 | params.B.int8_data_ptr = weight.m_data; 25 | params.B.qparams.zero_point = 0; 26 | params.C.column = n; 27 | params.C.qparams.zero_point = 0; 28 | params.opt_params.blk_size = BLK_SIZE; 29 | params.opt_params.num_thread = NUM_THREAD; 30 | params.bias.data_ptr = bias.m_data; 31 | params.bias.row = 1; 32 | params.bias.column = bias.m_dim_z; 33 | } 34 | 35 | void W8A8BFP32OFP32Linear::forward(const Matrix3D &x, Matrix3D &output) { 36 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 37 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 38 | PROFILE_START_FLOPS(profile_name, ops); 39 | assert(output.m_dim_x == x.m_dim_x); 40 | assert(output.m_dim_y == x.m_dim_y); 41 | assert(output.m_dim_z == params.B.column); 42 | assert(x.m_dim_z == params.B.row); 43 | assert(output.m_dim_z == params.bias.column); 44 | 45 | params.A.row = m; 46 | params.A.column = k; 47 | params.A.int8_data_ptr = x.m_data; 48 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 49 | params.C.row = m; 50 | params.C.column = n; 51 | params.C.data_ptr = output.m_data; 52 | params.C.qparams.zero_point = 0; 53 | params.alpha = alpha; 54 | 55 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 56 | 57 | if (m == 1) { 58 | // let's loop over the column dim instead of row 59 | for (int bz = 0; bz < x.m_dim_x; bz++) { 60 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(¶ms); 61 | params.A.int8_data_ptr += m * k; 62 | params.C.data_ptr += m * n; 63 | } 64 | } else { 65 | for (int bz = 0; bz < x.m_dim_x; bz++) { 66 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(¶ms); 67 | params.A.int8_data_ptr += m * k; 68 | params.C.data_ptr += m * n; 69 | } 70 | } 71 | 72 | PROFILE_END(profile_name); 73 | } 74 | -------------------------------------------------------------------------------- /transformer/tests/test_LLaMATokenizer.cc: -------------------------------------------------------------------------------- 1 | #include "LLaMATokenizer.h" 2 | 3 | static const std::map> &test_LLaMATokenizer() { 4 | static std::map> llama_answer = { 5 | /* 1. */ { 6 | "Hello World", 7 | { 8 | 1, 9 | 10994, 10 | 2787, 11 | }, 12 | }, 13 | /* 2. 
*/ 14 | { 15 | " Hello World!", 16 | { 17 | 1, 18 | 15043, 19 | 2787, 20 | 29991, 21 | }, 22 | }, 23 | /* 3. */ 24 | { 25 | "This is Tiny LLM Engine.", 26 | { 27 | 1, 28 | 4013, 29 | 338, 30 | 323, 31 | 4901, 32 | 365, 33 | 26369, 34 | 10863, 35 | 29889, 36 | }, 37 | }, 38 | /* 4. */ 39 | { 40 | "Please introduce Massachusetts Institute of Technology (MIT)", 41 | { 42 | 1, 43 | 12148, 44 | 14944, 45 | 16167, 46 | 8907, 47 | 310, 48 | 17968, 49 | 313, 50 | 26349, 51 | 29897, 52 | }, 53 | }, 54 | /* 5. */ 55 | { 56 | "Building a website can be done in 10 simple steps. This message is for general people, so we assume " 57 | "they don't have basic concepts.", 58 | { 59 | 1, 8893, 292, 263, 4700, 508, 367, 2309, 297, 29871, 29896, 29900, 2560, 6576, 29889, 910, 2643, 60 | 338, 363, 2498, 2305, 29892, 577, 591, 5251, 896, 1016, 29915, 29873, 505, 6996, 22001, 29889, 61 | }, 62 | }, 63 | }; 64 | 65 | return llama_answer; 66 | }; 67 | 68 | int main(int argc, char **argv) { 69 | // load the vocab 70 | const std::string fname = "models/llama_vocab.bin"; 71 | llama_vocab vocab = llama_init_vocab(fname.c_str()); 72 | 73 | bool is_equal; 74 | int test_count = 1; 75 | for (const auto &llama_answer : test_LLaMATokenizer()) { 76 | std::vector input_ids(llama_answer.first.size()); 77 | const int n = llama_tokenize(vocab, llama_answer.first.c_str(), input_ids.data(), input_ids.size(), true); 78 | input_ids.resize(n); 79 | 80 | is_equal = input_ids.size() == llama_answer.second.size(); 81 | 82 | for (int i = 0; i < (int)input_ids.size() && is_equal; ++i) { 83 | if (input_ids[i] != llama_answer.second[i]) { 84 | is_equal = false; 85 | } 86 | } 87 | 88 | test_count++; 89 | } 90 | 91 | if (!is_equal) 92 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 93 | else 94 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8B8O8LinearReLU.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | void load_W8A8B8O8LinearReLU_params(W8A8B8O8LinearReLU &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 13 | } 14 | 15 | W8A8B8O8LinearReLU::W8A8B8O8LinearReLU(struct W8A8B8O8LinearReLU_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias_int8; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = 0; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = bias.m_dim_z; 36 | params.alpha = alpha; 37 | params.beta = op_params.beta; 38 | } 39 | 40 | void W8A8B8O8LinearReLU::forward(const Matrix3D &x, Matrix3D &output) { 41 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 42 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 43 | PROFILE_START_FLOPS(profile_name, ops); 44 | assert(output.m_dim_x == x.m_dim_x); 45 | assert(output.m_dim_y == x.m_dim_y); 46 | assert(output.m_dim_z == params.B.column); 47 | assert(x.m_dim_z == params.B.row); 48 | assert(output.m_dim_z == params.bias.column); 49 | 50 | params.A.row = m; 51 | params.A.column = k; 52 | params.A.int8_data_ptr = x.m_data; 53 | params.C.row = m; 54 | params.C.column = n; 55 | params.C.int8_data_ptr = output.m_data; 56 | params.A.qparams.scale = alpha; 57 | params.alpha = alpha; 58 | params.beta = beta; 59 | 60 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 61 | 62 | if (m == 1) { 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(¶ms); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(¶ms); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /transformer/src/ops/W8A8B8O8Linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "operators.h" 4 | #include "utils.h" 5 | 6 | 
void load_W8A8B8O8Linear_params(W8A8B8O8Linear &op, std::string prefix) { 7 | read_to_array((prefix + "/weight.bin").c_str(), op.params.B.int8_data_ptr, op.params.B.length()); 8 | read_to_array((prefix + "/bias_int8.bin").c_str(), op.params.bias.int8_data_ptr, op.params.bias.length()); 9 | read_to_array((prefix + "/alpha.bin").c_str(), &op.params.alpha, 1); 10 | read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); 11 | read_to_array((prefix + "/beta.bin").c_str(), &op.params.beta, 1); 12 | read_to_array((prefix + "/beta.bin").c_str(), &op.beta, 1); 13 | } 14 | 15 | W8A8B8O8Linear::W8A8B8O8Linear(struct W8A8B8O8Linear_params &op_params) { 16 | Matrix3D weight = op_params.weight; 17 | Matrix3D bias = op_params.bias; 18 | 19 | int k = weight.m_dim_z, n = weight.m_dim_y; 20 | params.A.qparams.scale = alpha; // effective_scale = a * B / C 21 | params.B.qparams.scale = 1.0; 22 | params.C.qparams.scale = 1.0; 23 | params.A.qparams.zero_point = 0; 24 | params.B.row = k; 25 | params.B.column = n; 26 | params.B.int8_data_ptr = weight.m_data; 27 | params.B.qparams.zero_point = 0; 28 | params.C.qparams.zero_point = 0; 29 | params.opt_params.blk_size = BLK_SIZE; 30 | params.opt_params.num_thread = NUM_THREAD; 31 | params.C.qparams.q_max = 127; 32 | params.C.qparams.q_min = -128; 33 | params.bias.int8_data_ptr = bias.m_data; 34 | params.bias.row = 1; 35 | params.bias.column = n; 36 | } 37 | 38 | void W8A8B8O8Linear::forward(const Matrix3D &x, Matrix3D &output) { 39 | const int m = x.m_dim_y, k = x.m_dim_z, n = params.B.column, b = x.m_dim_x; 40 | const long long ops = (long long)b * 2 * (long long)m * (long long)n * (long long)k + (long long)m * (long long)n; 41 | PROFILE_START_FLOPS(profile_name, ops); 42 | assert(output.m_dim_x == x.m_dim_x); 43 | assert(output.m_dim_y == x.m_dim_y); 44 | assert(output.m_dim_z == params.B.column); 45 | assert(x.m_dim_z == params.B.row); 46 | assert(output.m_dim_z == params.bias.column); 47 | 48 | params.A.row = m; 49 | params.A.column = k; 50 | params.A.int8_data_ptr = x.m_data; 51 | params.C.row = m; 52 | params.C.column = n; 53 | params.C.int8_data_ptr = output.m_data; 54 | params.A.qparams.scale = alpha; 55 | params.alpha = alpha; 56 | params.beta = beta; 57 | 58 | matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); 59 | 60 | // printf("W8A8B8O8Linear-m,n,k: %d, %d, %d\n", m,n,k); 61 | if (m == 1) { 62 | // params.opt_params.num_thread = 8; 63 | // let's loop over the column dim instead of row 64 | for (int bz = 0; bz < x.m_dim_x; bz++) { 65 | matmul_op.mat_mul_accelerator_int8_fast_32unroll_over_column(¶ms); 66 | params.A.int8_data_ptr += m * k; 67 | params.C.int8_data_ptr += m * n; 68 | } 69 | } else { 70 | for (int bz = 0; bz < x.m_dim_x; bz++) { 71 | matmul_op.mat_mul_accelerator_int8_fast_2x2_32unroll(¶ms); 72 | params.A.int8_data_ptr += m * k; 73 | params.C.int8_data_ptr += m * n; 74 | } 75 | } 76 | 77 | PROFILE_END(profile_name); 78 | } 79 | -------------------------------------------------------------------------------- /kernels/metal/matmul_metal_int4.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #define NS_PRIVATE_IMPLEMENTATION 10 | #define CA_PRIVATE_IMPLEMENTATION 11 | #define MTL_PRIVATE_IMPLEMENTATION 12 | #include "matmul_metal_int4_imp.h" 13 | 14 | namespace matmul { 15 | void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 16 | int i, j, k; 17 | const 
struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 18 | const int block_size = params->block_size; 19 | float *scale = params->scales, *offset = params->offset; 20 | 21 | assert(params->block_size == 32); // support block size 32 for now 22 | 23 | for (i = 0; i < C->row; i++) { 24 | for (j = 0; j < C->column; j++) { 25 | float acc = 0; 26 | for (k = 0; k < B->row * 2; k += block_size) { 27 | float s = scale[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 28 | float o = offset[j * (B->row / 16) + k / 32]; // /16:B->column is packed 4bits 29 | uint8_t *weight_32_int4 = &B->int4_data_ptr[j * B->row + k / 2]; 30 | float *x_ptr = &A->data_ptr[i * A->column + k]; 31 | for (int qi = 0; qi < block_size / 2; qi += 4) { 32 | uint8_t packed_int4_0 = weight_32_int4[qi]; 33 | uint8_t packed_int4_1 = weight_32_int4[qi + 1]; 34 | uint8_t packed_int4_2 = weight_32_int4[qi + 2]; 35 | uint8_t packed_int4_3 = weight_32_int4[qi + 3]; 36 | float deq_0 = (float)((packed_int4_0 & 0x0F) - 8.0) * s + o; 37 | float deq_1 = (float)((packed_int4_1 & 0x0F) - 8.0) * s + o; 38 | float deq_2 = (float)((packed_int4_2 & 0x0F) - 8.0) * s + o; 39 | float deq_3 = (float)((packed_int4_3 & 0x0F) - 8.0) * s + o; 40 | float deq_4 = (float)((packed_int4_0 >> 4) - 8.0) * s + o; 41 | float deq_5 = (float)((packed_int4_1 >> 4) - 8.0) * s + o; 42 | float deq_6 = (float)((packed_int4_2 >> 4) - 8.0) * s + o; 43 | float deq_7 = (float)((packed_int4_3 >> 4) - 8.0) * s + o; 44 | acc += *x_ptr++ * deq_0; 45 | acc += *x_ptr++ * deq_1; 46 | acc += *x_ptr++ * deq_2; 47 | acc += *x_ptr++ * deq_3; 48 | acc += *x_ptr++ * deq_4; 49 | acc += *x_ptr++ * deq_5; 50 | acc += *x_ptr++ * deq_6; 51 | acc += *x_ptr++ * deq_7; 52 | } 53 | } 54 | C->data_ptr[i * C->column + j] = acc; 55 | } 56 | } 57 | }; 58 | 59 | void MatmulOperator::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { 60 | int i, j, k; 61 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 62 | const int block_size = params->block_size; 63 | float *scale = params->scales, *offset = params->offset; 64 | 65 | assert(params->block_size % 32 == 0); // support block size to be multiply of 32 66 | assert(A->row == C->row); // support block size to be multiply of 32 67 | 68 | MetalMatMulParams matmulparams = {(unsigned int)A->row, (unsigned int)C->column, (unsigned int)A->column, 69 | (unsigned int)block_size}; 70 | MetalMatmulBuffers bufferparams = {A->data_ptr, C->data_ptr, scale, offset, B->int4_data_ptr}; 71 | MetalMatmulInt4IMP::run(matmulparams, &bufferparams); 72 | }; 73 | } // namespace matmul 74 | -------------------------------------------------------------------------------- /transformer/include/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "model.h" 10 | 11 | #define MAX_LINEAR_LENGTH 1024 * 1024 * 16 // 16MB, TO BE REMOVED with better memory allocation! 
12 | #define DEBUG false 13 | 14 | #define DEBUG_INS(x) \ 15 | if (DEBUG) x 16 | 17 | #define QK 32 18 | 19 | struct pack_q4_tensor { 20 | uint8_t qx[QK / 2]; 21 | float scale; 22 | }; 23 | 24 | struct pack_q8_tensor { 25 | int8_t qx[QK]; 26 | float scale; 27 | }; 28 | 29 | template 30 | class Matrix3D { 31 | public: 32 | Matrix3D(T *data, int dim_x, int dim_y, int dim_z) : m_data(data), m_dim_x(dim_x), m_dim_y(dim_y), m_dim_z(dim_z) {} 33 | 34 | T &operator()(int x, int y, int z) { 35 | if (x < 0 || x >= m_dim_x || y < 0 || y >= m_dim_y || z < 0 || z >= m_dim_z) { 36 | printf("%d, %d, %d\n", x, y, z); 37 | printf("%d, %d, %d\n", m_dim_x, m_dim_y, m_dim_z); 38 | throw std::out_of_range("Matrix3D: Indices out of range."); 39 | } 40 | return m_data[x * m_dim_y * m_dim_z + y * m_dim_z + z]; 41 | } 42 | 43 | const T &operator()(int x, int y, int z) const { 44 | if (x < 0 || x >= m_dim_x || y < 0 || y >= m_dim_y || z < 0 || z >= m_dim_z) { 45 | printf("%d, %d, %d\n", x, y, z); 46 | printf("%d, %d, %d\n", m_dim_x, m_dim_y, m_dim_z); 47 | throw std::out_of_range("Matrix3D: Indices out of range."); 48 | } 49 | return m_data[x * m_dim_y * m_dim_z + y * m_dim_z + z]; 50 | } 51 | 52 | bool operator==(const Matrix3D &other) const { 53 | if (m_dim_x != other.m_dim_x || m_dim_y != other.m_dim_y || m_dim_z != other.m_dim_z) { 54 | return false; 55 | } 56 | 57 | for (int x = 0; x < m_dim_x; ++x) { 58 | for (int y = 0; y < m_dim_y; ++y) { 59 | for (int z = 0; z < m_dim_z; ++z) { 60 | if ((*this)(x, y, z) != other(x, y, z)) { 61 | return false; 62 | } 63 | } 64 | } 65 | } 66 | 67 | return true; 68 | } 69 | 70 | int length() const { return m_dim_x * m_dim_y * m_dim_z; } 71 | T sum() const { 72 | T sum = 0; 73 | for (int i = 0; i < this->length(); i++) { 74 | sum += this->m_data[i]; 75 | } 76 | return sum; 77 | } 78 | T sum(int size) const { 79 | T sum = 0; 80 | for (int i = 0; i < size; i++) { 81 | sum += this->m_data[i]; 82 | } 83 | return sum; 84 | } 85 | 86 | T sum(int size, int start_idx) const { 87 | T sum = 0; 88 | for (int i = 0; i < size; i++) { 89 | sum += this->m_data[start_idx + i]; 90 | } 91 | return sum; 92 | } 93 | 94 | void load(const char *path) { 95 | std::ifstream infile(path, std::ios::binary | std::ios::in); 96 | if (infile.fail()) { 97 | std::cout << strerror(errno) << ": " << path << std::endl; 98 | throw("Expected error..."); 99 | } else { 100 | infile.read(reinterpret_cast(this->m_data), this->length() * sizeof(T)); 101 | infile.close(); 102 | } 103 | } 104 | T *m_data; 105 | int m_dim_x, m_dim_y, m_dim_z; 106 | 107 | // Default constructor 108 | Matrix3D() { m_data = NULL; } 109 | }; 110 | 111 | static inline void debug_info(std::string s) { 112 | #ifdef DEBUG 113 | std::cout << s << std::endl; 114 | #endif 115 | } 116 | #endif 117 | -------------------------------------------------------------------------------- /kernels/cuda/matmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../matmul.h" 8 | 9 | #include 10 | #include 11 | #include "gemm_cuda.h" 12 | #include "dequantize.cuh" 13 | #include 14 | #include 15 | 16 | const int threadDim = 32; 17 | const int TILE_SIZE = threadDim; 18 | 19 | __global__ void matrixMul_blockC(float *A, float *B, float *C, int A_row, int A_column, int B_column){ 20 | int i = blockIdx.x * blockDim.x + threadIdx.x; 21 | int j = blockIdx.y * blockDim.y + threadIdx.y; 22 | 23 | float acc = 0; 24 | for (int k = 0; k < A_column; k++) 25 | 
acc += A[j * A_column + k] * B[k * B_column + i]; 26 | C[j * B_column +i] = acc; 27 | } 28 | 29 | __global__ void matrixMultiplyShared(const float *A, const float *B, float *C, int A_row, int A_column, int B_column) { 30 | int row = blockIdx.y * blockDim.y + threadIdx.y; 31 | int col = blockIdx.x * blockDim.x + threadIdx.x; 32 | 33 | __shared__ float As[TILE_SIZE][TILE_SIZE]; 34 | __shared__ float Bs[TILE_SIZE][TILE_SIZE]; 35 | 36 | float value = 0; 37 | 38 | for (int i = 0; i < A_column / TILE_SIZE; i++){ 39 | As[threadIdx.y][threadIdx.x] = A[(blockIdx.y * TILE_SIZE + threadIdx.y) * A_column + TILE_SIZE * i + threadIdx.x]; 40 | Bs[threadIdx.y][threadIdx.x] = B[(i * TILE_SIZE + threadIdx.y) * B_column + blockIdx.x * TILE_SIZE + threadIdx.x]; 41 | 42 | __syncthreads(); 43 | 44 | for (int k = 0; k < TILE_SIZE; k++) 45 | value += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 46 | 47 | __syncthreads(); 48 | } 49 | 50 | 51 | C[row * B_column + col] = value; 52 | } 53 | 54 | namespace matmul{ 55 | 56 | void MatmulOperator::mat_mul_cuda(const struct matmul_params *params){ 57 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 58 | assert(A->column == B->row); 59 | assert(C->column == B->column); 60 | assert(C->row == A->row); 61 | 62 | float *d_A; 63 | float *d_B; 64 | float *d_C; 65 | 66 | // Initailize C 67 | /*for (int i = 0; i < C->row; i++) 68 | for (int j = 0; j < C->column; j++) 69 | C->data_ptr[j + C->column * i] = 0;*/ 70 | 71 | // Allocate memory 72 | cudaMalloc(&d_A, A->column*A->row*sizeof(float)); 73 | cudaMalloc(&d_B, B->column*B->row*sizeof(float)); 74 | cudaMalloc(&d_C, C->column*C->row*sizeof(float)); 75 | 76 | // Copy data to GPU 77 | cudaMemcpy(d_A, A->data_ptr, A->column*A->row*sizeof(float), cudaMemcpyHostToDevice); 78 | cudaMemcpy(d_B, B->data_ptr, B->column*B->row*sizeof(float), cudaMemcpyHostToDevice); 79 | cudaMemcpy(d_C, C->data_ptr, C->column*C->row*sizeof(float), cudaMemcpyHostToDevice); 80 | 81 | // Make sure we can break the input matrix into blocks 82 | assert(A->column % threadDim == 0); 83 | assert(A->row % threadDim == 0); 84 | assert(B->column % threadDim == 0); 85 | const dim3 threadsPerBlock(threadDim, threadDim); 86 | const dim3 numBlocks(C->column / threadsPerBlock.x, C->row / threadsPerBlock.y); 87 | 88 | // Invoke the cuda imp. 
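// Annotation (not in the original source): launch configuration is one thread per element
// of C, grouped into 32x32 (threadDim) blocks, so the grid is (C->column / 32) x (C->row / 32);
// the asserts above guarantee the dimensions divide evenly. matrixMultiplyShared stages
// TILE_SIZE x TILE_SIZE tiles of A and B in shared memory to reduce global-memory traffic
// compared with the naive matrixMul_blockC kernel.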
89 | 90 | // struct timeval start, end; 91 | // gettimeofday(&start, NULL); 92 | //matrixMul_blockC<<< numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, A->row, A->column, B->column); 93 | matrixMultiplyShared<<< numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, A->row, A->column, B->column); 94 | cudaDeviceSynchronize(); 95 | // gettimeofday(&end, NULL); 96 | // int us = interval_to_us(&start, &end); 97 | // std::cout << "cuda kernel: " << us / 1000 << " ms" << std::endl; 98 | 99 | // Get the result back 100 | cudaMemcpy(C->data_ptr, d_C, C->column*C->row*sizeof(float), cudaMemcpyDeviceToHost); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /kernels/matmul.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void quantize_fp32_to_int8(float *A, int8_t *qA, float *sA, int size, int block_size); 5 | 6 | // Data structures 7 | struct quantization_params { 8 | float scale; 9 | bool per_channel = false; 10 | int32_t zero_point; 11 | int8_t q_min = -128, q_max = 127; 12 | }; 13 | 14 | struct matrix { 15 | int row; 16 | int column; 17 | float *data_ptr; 18 | int32_t *int32_data_ptr; 19 | int8_t *int8_data_ptr; 20 | uint8_t *uint8_data_ptr; 21 | uint8_t *int4_data_ptr; 22 | struct quantization_params qparams; 23 | int length() { return row * column; } 24 | }; 25 | 26 | struct optimization_params { 27 | int blk_size; 28 | int num_thread = 8; 29 | }; 30 | 31 | struct matmul_params { 32 | struct matrix A, B, C, bias; 33 | struct optimization_params opt_params; 34 | float alpha, beta; 35 | // for int4 36 | float *scales, *offset, *zero_point; 37 | int block_size; 38 | // for int8 activation 39 | float *A_scales; 40 | int8_t A_zero_point; 41 | }; 42 | 43 | struct thread_args { 44 | const struct matrix *A; 45 | const struct matrix *B; 46 | const struct matrix *C; 47 | const struct matmul_params *params; 48 | int start_i, end_i, blk_size; 49 | }; 50 | 51 | #define MAX(A, B) ((A) > (B) ? (A) : (B)) 52 | #define MIN(A, B) ((A) < (B) ? 
(A) : (B)) 53 | namespace matmul { 54 | class MatmulOperator { 55 | public: 56 | void mat_mul_transposed(const struct matmul_params *params); 57 | void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params); 58 | // int8 59 | void naive_mat_mul_int8(const struct matmul_params *params); 60 | void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params); 61 | void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params); 62 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params); 63 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params); 64 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params); 65 | void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params); 66 | void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params); 67 | void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params *params); 68 | // int4 69 | void mat_mul_accelerator_int4_fast(const struct matmul_params *params); 70 | void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params); 71 | void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params); 72 | void naive_mat_mul_int4(const struct matmul_params *params); 73 | void naive_mat_mul_int4_with_offset(const struct matmul_params *params); 74 | // w8a4 code template functions 75 | void mat_mul_reference(struct matmul_params *params); 76 | void mat_mul_loop_unrolling(struct matmul_params *params); 77 | void mat_mul_multithreading(struct matmul_params *params); 78 | void mat_mul_multithreading_loop_unrolling(struct matmul_params *params); 79 | void mat_mul_simd_programming(struct matmul_params *params); 80 | void mat_mul_all_techniques(struct matmul_params *params); 81 | // cuda 82 | void mat_mul_cuda(const struct matmul_params *params); 83 | 84 | private: 85 | float interval_to_us(struct timeval *start, struct timeval *end); 86 | void CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C); 87 | void CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C); 88 | }; 89 | } // namespace matmul 90 | -------------------------------------------------------------------------------- /kernels/avx/matmul_avx_fp32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include // AVX intrinsic 3 | #include 4 | #include 5 | #include 6 | #include // intel SSE intrinsic 7 | 8 | #include "../matmul.h" 9 | 10 | namespace matmul { 11 | 12 | inline void simd_mul_fp_128(const float *a, const float *b, float *c) { 13 | __m128 val = _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)); 14 | __m128 acc = _mm_add_ps(_mm_load_ps(c), val); 15 | _mm_store_ps(c, acc); 16 | } 17 | 18 | void *mat_mul_transposed_fastover_column_func(void *args) { 19 | int i, j, k; 20 | struct thread_args *mat_args = (struct thread_args *)args; 21 | const struct matrix *A = mat_args->A; 22 | const struct matrix *B = mat_args->B; 23 | const struct matrix *C = mat_args->C; 24 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 25 | int start_i = mat_args->start_i, end_i = mat_args->end_i; 26 | 27 | __m256 zero256 = _mm256_setzero_ps(); 28 | for (i = 0; i < C->row; i++) { 29 | for (j = start_i; j + 1 < end_i; j += 2) { 30 | __m256 acc 
= zero256, acc1 = zero256; 31 | __m256 *A256 = (__m256 *)&data_A[i * A->column]; 32 | __m256 *B256 = (__m256 *)&data_B[j * B->row]; 33 | __m256 *B256_1 = (__m256 *)&data_B[(j + 1) * B->row]; 34 | for (k = 0; k < A->column; k += 8) { 35 | __m256 Aik = _mm256_load_ps((const float *)A256++); 36 | __m256 Bjk = _mm256_load_ps((const float *)B256++); 37 | __m256 Bj1k = _mm256_load_ps((const float *)B256_1++); 38 | acc = _mm256_add_ps(acc, _mm256_mul_ps(Aik, Bjk)); 39 | acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(Aik, Bj1k)); 40 | } 41 | float *ptr = (float *)&acc; 42 | data_C[i * C->column + j] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 43 | ptr = (float *)&acc1; 44 | data_C[i * C->column + j + 1] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 45 | } 46 | // leftover 47 | if (j < end_i) { 48 | __m256 acc = zero256; 49 | for (k = 0; k < A->column; k += 8) { 50 | __m256 Aik = _mm256_load_ps(&data_A[i * A->column + k]); 51 | __m256 Bjk = _mm256_load_ps(&data_B[j * B->row + k]); 52 | acc = _mm256_add_ps(acc, _mm256_mul_ps(Aik, Bjk)); 53 | } 54 | float *ptr = (float *)&acc; 55 | data_C[i * C->column + j] = ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]; 56 | j++; 57 | } 58 | } 59 | 60 | return NULL; 61 | } 62 | 63 | void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { 64 | int i, j, k; 65 | 66 | int num_thread = params->opt_params.num_thread; 67 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 68 | float *data_A = A->data_ptr, *data_B = B->data_ptr, *data_C = C->data_ptr; 69 | 70 | assert(A->column % 8 == 0); 71 | 72 | if (num_thread > C->column) num_thread = C->column; 73 | 74 | pthread_t thread_pool[num_thread]; 75 | struct thread_args threads_args[num_thread]; 76 | 77 | // Thread creation 78 | for (j = 0; j < num_thread; j++) { 79 | threads_args[j].start_i = j * (C->column / num_thread); 80 | threads_args[j].end_i = (j + 1) * (C->column / num_thread); 81 | threads_args[j].blk_size = params->opt_params.blk_size; 82 | threads_args[j].A = A; 83 | threads_args[j].B = B; 84 | threads_args[j].C = C; 85 | pthread_create(&thread_pool[j], NULL, mat_mul_transposed_fastover_column_func, &threads_args[j]); 86 | } 87 | // Join threads 88 | for (j = 0; j < num_thread; j++) { 89 | pthread_join(thread_pool[j], NULL); 90 | } 91 | } 92 | 93 | } // namespace matmul 94 | -------------------------------------------------------------------------------- /transformer/include/Generate.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "Fp32llamaForCausalLM.h" 12 | #include "Int4llamaForCausalLM.h" 13 | #include "OPTForCausalLM.h" 14 | #include "OPTTokenizer.h" 15 | #include "operators.h" 16 | #include "utils.h" 17 | 18 | inline std::mt19937 OPT_rng; 19 | 20 | typedef struct OPT_token_data { 21 | int id; // token id 22 | float logit; // log-odds of the token 23 | float p; // probability of the token 24 | } OPT_token_data; 25 | 26 | typedef struct OPT_token_data_array { 27 | OPT_token_data* data; 28 | size_t size; 29 | bool sorted; 30 | } OPT_token_data_array; 31 | 32 | struct opt_params { 33 | int32_t seed = -1; // RNG seed 34 | int32_t n_threads = 1; // TODO: fix this 35 | int32_t n_predict = 128; // new tokens to predict 36 | int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) 37 | 
int32_t n_ctx = 512; // context size 38 | int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) 39 | int32_t n_keep = 0; // number of tokens to keep from initial prompt 40 | int32_t n_vocab = 50272; // vocabulary size 41 | 42 | // sampling parameters 43 | std::unordered_map logit_bias; // logit bias for specific tokens 44 | int32_t top_k = 40; // <= 0 to use vocab size 45 | float top_p = 0.95f; // 1.0 = disabled 46 | float tfs_z = 1.00f; // 1.0 = disabled 47 | float typical_p = 1.00f; // 1.0 = disabled 48 | float temp = 0.80f; // 1.0 = disabled 49 | float repeat_penalty = 1.10f; // 1.0 = disabled 50 | int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) 51 | float frequency_penalty = 0.00f; // 0.0 = disabled 52 | float presence_penalty = 0.00f; // 0.0 = disabled 53 | int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 54 | float mirostat_tau = 5.00f; // target entropy 55 | float mirostat_eta = 0.10f; // learning rate 56 | }; 57 | 58 | void sample_repetition_penalty(OPT_token_data_array* candidates, const int* last_tokens, size_t last_tokens_size, 59 | float penalty); 60 | 61 | void sample_frequency_and_presence_penalties(OPT_token_data_array* candidates, const int* last_tokens_p, 62 | size_t last_tokens_size, float alpha_frequency, float alpha_presence); 63 | 64 | int sample_token_greedy(OPT_token_data_array* candidates); 65 | 66 | void sample_temperature(OPT_token_data_array* candidates_p, float temp); 67 | 68 | void sample_softmax(OPT_token_data_array* candidates); 69 | 70 | int sample_token(OPT_token_data_array* candidates); 71 | 72 | void sample_top_k(OPT_token_data_array* candidates, int k, size_t min_keep); 73 | 74 | int sample_token_mirostat(const int n_vocab, OPT_token_data_array* candidates, float tau, float eta, int m, float* mu); 75 | 76 | int sample_token_mirostat_v2(OPT_token_data_array* candidates, float tau, float eta, float* mu); 77 | 78 | void sample_tail_free(OPT_token_data_array* candidates, float z, size_t min_keep); 79 | 80 | void sample_typical(OPT_token_data_array* candidates, float p, size_t min_keep); 81 | 82 | void sample_top_p(OPT_token_data_array* candidates, float p, size_t min_keep); 83 | 84 | std::vector OPTGenerate(OPTForCausalLM model, std::vector input_ids, 85 | const struct opt_params generation_config, Encoder* encoder = NULL, 86 | bool interactive = false); 87 | 88 | enum { OPT, LLaMA_FP32, LLaMA_INT4 }; 89 | std::vector LLaMAGenerate(void* model, int model_type, std::string text, const struct opt_params generation_config, 90 | std::string voc_path, bool interactive); 91 | -------------------------------------------------------------------------------- /transformer/src/ops/BMM_F32T.cc: -------------------------------------------------------------------------------- 1 | #include "operators.h" 2 | #include "utils.h" 3 | 4 | void load_BMM_F32T(BMM_F32T &op, std::string prefix) { read_to_array((prefix + "/alpha.bin").c_str(), &op.alpha, 1); } 5 | 6 | BMM_F32T::BMM_F32T(float _alpha) { this->alpha = _alpha; } 7 | 8 | void BMM_F32T::forward(const Matrix3D &a, const Matrix3D &weight, Matrix3D &c) { 9 | const Matrix3D b = weight; 10 | const int m = a.m_dim_y, n = b.m_dim_y, k = a.m_dim_z, b_size = b.m_dim_x; 11 | const long long ops = (long long)b_size * 2 * (long long)m * (long long)n * (long long)k; 12 | PROFILE_START_FLOPS(profile_name, ops); 13 | 14 | // a: m x k b: n x k c: m x n 15 | assert(a.m_dim_x == b.m_dim_x); // batch dim 16 | assert(a.m_dim_z == b.m_dim_z); 
// k 17 | assert(a.m_dim_y == c.m_dim_y); // m 18 | assert(b.m_dim_y == c.m_dim_z); // n 19 | 20 | struct matmul_params params; 21 | params.A.row = a.m_dim_y; 22 | params.A.column = a.m_dim_z; 23 | params.A.data_ptr = a.m_data; 24 | params.B.row = b.m_dim_y; 25 | params.B.column = b.m_dim_z; 26 | params.B.data_ptr = b.m_data; 27 | params.C.row = c.m_dim_y; 28 | params.C.column = c.m_dim_z; 29 | params.C.data_ptr = c.m_data; 30 | params.opt_params.blk_size = BLK_SIZE; 31 | params.opt_params.num_thread = NUM_THREAD; 32 | params.alpha = alpha; 33 | 34 | matmul::MatmulOperator op = matmul::MatmulOperator(); 35 | 36 | for (int bz = 0; bz < a.m_dim_x; bz++) { 37 | // if (params.A.column % 8 == 0) // TODO: debug this 38 | // op.mat_mul_transposed_fastover_column((const struct matmul_params 39 | // *)¶ms); 40 | // else 41 | op.mat_mul_transposed(¶ms); // TODO: optimize this 42 | // TODO: apply SIMD here 43 | for (int i = 0; i < m * n; i++) { 44 | params.C.data_ptr[i] *= this->alpha; 45 | } 46 | params.A.data_ptr += m * k; 47 | params.B.data_ptr += k * n; 48 | params.C.data_ptr += m * n; 49 | } 50 | 51 | PROFILE_END(profile_name); 52 | } 53 | 54 | void BMM_F32T::forward_weight_untransposed(const Matrix3D &a, const Matrix3D &weight, 55 | Matrix3D &c) { 56 | const Matrix3D b = weight; 57 | const int m = a.m_dim_y, n = c.m_dim_z, k = a.m_dim_z, b_size = b.m_dim_x; 58 | const long long ops = (long long)b_size * 2 * (long long)m * (long long)n * (long long)k; 59 | PROFILE_START_FLOPS(profile_name, ops); 60 | 61 | // a: m x k b: n x k c: m x n 62 | assert(a.m_dim_x == b.m_dim_x); // batch dim 63 | assert(a.m_dim_z == b.m_dim_y); // k 64 | assert(a.m_dim_y == c.m_dim_y); // m 65 | assert(b.m_dim_z == c.m_dim_z); // n 66 | 67 | struct matmul_params params; 68 | params.A.row = a.m_dim_y; 69 | params.A.column = a.m_dim_z; 70 | params.A.data_ptr = a.m_data; 71 | params.B.row = b.m_dim_y; 72 | params.B.column = b.m_dim_z; 73 | params.B.data_ptr = b.m_data; 74 | params.C.row = c.m_dim_y; 75 | params.C.column = c.m_dim_z; 76 | params.C.data_ptr = c.m_data; 77 | params.opt_params.blk_size = BLK_SIZE; 78 | params.opt_params.num_thread = NUM_THREAD; 79 | params.alpha = alpha; 80 | 81 | matmul::MatmulOperator op = matmul::MatmulOperator(); 82 | 83 | for (int i = 0; i < m * n * a.m_dim_x; i++) { 84 | params.C.data_ptr[i] = 0; 85 | } 86 | 87 | for (int bz = 0; bz < a.m_dim_x; bz++) { 88 | float *data_A = params.A.data_ptr + bz * m * k, *data_B = params.B.data_ptr + bz * k * n, 89 | *data_C = params.C.data_ptr + bz * m * n; 90 | for (int i = 0; i < m; i++) 91 | for (int kk = 0; kk < k; kk++) { 92 | float Aikk0 = data_A[i * k + kk]; 93 | for (int j = 0; j < n; j++) { 94 | float Bjk0 = data_B[kk * n + j]; 95 | data_C[i * n + j] += Aikk0 * Bjk0; 96 | } 97 | } 98 | } 99 | 100 | PROFILE_END(profile_name); 101 | } 102 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Fp32llamaDecoder.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | #include "utils_memalloc.h" 7 | 8 | void test_Decoder() { 9 | const struct model_config llama7B = llama_7B; 10 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 11 | head_dim = embed_dim / num_heads, num_layers = llama7B.num_layers; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | 
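// Annotation (not in the original source): prefill ("reasoning") pass loads a 9-token prompt;
// the 2nd ("generating") pass below feeds a single new token together with the cached
// keys/values returned by this first forward call.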
input_ids.load("assets/llama/tests/decoder/1st_input_ids.bin"); 16 | struct Fp32llamaDecoder_input input_1st = {input_ids}; 17 | 18 | Fp32llamaDecoder decoder = Fp32llamaDecoder("models/LLaMA_7B/decoder/", llama7B); 19 | 20 | struct Fp32llamaDecoder_output output_1st = decoder.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | last_hidden_state1_GT.load("assets/llama/tests/decoder/1st_last_hidden_state.bin"); 25 | 26 | // print_first_k_elelment("output_1st.last_hidden_state", output_1st.last_hidden_state.m_data, 20); 27 | // print_first_k_elelment("last_hidden_state1_GT", last_hidden_state1_GT.m_data, 20); 28 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 29 | last_hidden_state1_GT.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | for (int i = 0; i < num_layers; i++) { 34 | std::string path = "assets/llama/tests/decoder/1st/past_key_value/key" + std::to_string(i) + ".bin"; 35 | temp_key_value.load(path.c_str()); 36 | success &= 37 | check_two_equal(output_1st.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 38 | 39 | path = "assets/llama/tests/decoder/1st/past_key_value/value" + std::to_string(i) + ".bin"; 40 | temp_key_value.load(path.c_str()); 41 | success &= 42 | check_two_equal(output_1st.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 43 | } 44 | 45 | // generating phase: 2nd run 46 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 47 | input_ids_2nd.load("assets/llama/tests/decoder/2nd/input_ids.bin"); 48 | struct Fp32llamaDecoder_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 49 | 50 | struct Fp32llamaDecoder_output output_2nd = decoder.forward(input_2nd); 51 | 52 | Matrix3D last_hidden_state2_GT(mem_buf.get_fpbuffer(b * 1 * embed_dim), b, 1, embed_dim); 53 | last_hidden_state2_GT.load("assets/llama/tests/decoder/2nd/last_hidden_state.bin"); 54 | success &= check_two_equal(output_2nd.last_hidden_state.m_data, last_hidden_state2_GT.m_data, 55 | last_hidden_state2_GT.length(), 1e-8); 56 | 57 | temp_key_value = Matrix3D(mem_buf.get_fpbuffer(b * (sqlen + 1) * embed_dim), num_heads, (sqlen + 1), 58 | embed_dim / num_heads); 59 | for (int i = 0; i < num_layers; i++) { 60 | std::string path = "assets/llama/tests/decoder/2nd/past_key_value/key" + std::to_string(i) + ".bin"; 61 | temp_key_value.load(path.c_str()); 62 | success &= 63 | check_two_equal(output_2nd.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 64 | 65 | path = "assets/llama/tests/decoder/2nd/past_key_value/value" + std::to_string(i) + ".bin"; 66 | temp_key_value.load(path.c_str()); 67 | success &= 68 | check_two_equal(output_2nd.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 69 | } 70 | 71 | if (!success) 72 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 73 | else 74 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 75 | } 76 | 77 | int main() { test_Decoder(); } 78 | -------------------------------------------------------------------------------- /kernels/cuda/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | 15 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 16 | { 17 | uint4 result; 18 | 19 | uint32_t* h = reinterpret_cast(&result); 20 | uint32_t const i4s = reinterpret_cast(source); 21 | 22 | // First, we extract the i4s and construct an intermediate fp16 number. 23 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 24 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 25 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 26 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 27 | 28 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 29 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 30 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 31 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 32 | 33 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 34 | // immediately before required. 35 | const uint32_t top_i4s = i4s >> 8; 36 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 37 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 38 | : "=r"(h[0]) 39 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 40 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 41 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 42 | : "=r"(h[1]) 43 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 44 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 45 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 46 | : "=r"(h[2]) 47 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 48 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 49 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 50 | : "=r"(h[3]) 51 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 52 | 53 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 54 | // half2 ctor. In this case, I chose performance reliability over code readability. 55 | 56 | // This is the half2 {1032, 1032} represented as an integer. 57 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 58 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 59 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 60 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 61 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 62 | // This is the half2 {-72, -72} represented as an integer. 63 | // static constexpr uint32_t NEG_72 = 0xd480d480; 64 | // Haotian: Let's use {-64, -64}. 
65 | static constexpr uint32_t NEG_64 = 0xd400d400; 66 | 67 | // Finally, we construct the output numbers. 68 | // Convert elt_01 69 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 70 | // Convert elt_23 71 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 72 | // Convert elt_45 73 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 74 | // Convert elt_67 75 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 76 | 77 | return result; 78 | } 79 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "Int4llamaDecoder.h" 4 | #include "operators.h" 5 | #include "utils.h" 6 | #include "utils_memalloc.h" 7 | 8 | void test_Decoder() { 9 | const struct model_config llama7B = llama_7B; 10 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 11 | head_dim = embed_dim / num_heads, num_layers = llama7B.num_layers; 12 | MemoryAllocator mem_buf; 13 | 14 | Matrix3D input_ids(mem_buf.get_intbuffer(sqlen), b, 1, sqlen); 15 | input_ids.load("assets/llama/tests/decoder/1st_input_ids.bin"); 16 | struct Int4llamaDecoder_input input_1st = {input_ids}; 17 | 18 | Int4llamaDecoder decoder = Int4llamaDecoder("models/LLaMA_7B/decoder/", llama7B); 19 | 20 | struct Int4llamaDecoder_output output_1st = decoder.forward(input_1st); 21 | 22 | // reasoning phase: 1st run 23 | Matrix3D last_hidden_state1_GT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 24 | last_hidden_state1_GT.load("assets/llama/tests/decoder/1st_last_hidden_state.bin"); 25 | 26 | // print_first_k_elelment("output_1st.last_hidden_state", output_1st.last_hidden_state.m_data, 20); 27 | // print_first_k_elelment("last_hidden_state1_GT", last_hidden_state1_GT.m_data, 20); 28 | bool success = check_two_equal(output_1st.last_hidden_state.m_data, last_hidden_state1_GT.m_data, 29 | last_hidden_state1_GT.length(), 1e-8); 30 | 31 | Matrix3D temp_key_value(mem_buf.get_fpbuffer(b * sqlen * embed_dim), num_heads, sqlen, 32 | embed_dim / num_heads); 33 | for (int i = 0; i < num_layers; i++) { 34 | std::string path = "assets/llama/tests/decoder/1st/past_key_value/key" + std::to_string(i) + ".bin"; 35 | temp_key_value.load(path.c_str()); 36 | success &= 37 | check_two_equal(output_1st.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 38 | 39 | path = "assets/llama/tests/decoder/1st/past_key_value/value" + std::to_string(i) + ".bin"; 40 | temp_key_value.load(path.c_str()); 41 | success &= 42 | check_two_equal(output_1st.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 43 | } 44 | 45 | // generating phase: 2nd run 46 | Matrix3D input_ids_2nd(mem_buf.get_intbuffer(sqlen), b, 1, 1); 47 | input_ids_2nd.load("assets/llama/tests/decoder/2nd/input_ids.bin"); 48 | struct Int4llamaDecoder_input input_2nd = {input_ids_2nd, output_1st.past_keys, output_1st.past_values}; 49 | 50 | struct Int4llamaDecoder_output output_2nd = decoder.forward(input_2nd); 51 | 52 | Matrix3D last_hidden_state2_GT(mem_buf.get_fpbuffer(b * 1 * embed_dim), b, 1, embed_dim); 53 | last_hidden_state2_GT.load("assets/llama/tests/decoder/2nd/last_hidden_state.bin"); 54 | success &= 
check_two_equal(output_2nd.last_hidden_state.m_data, last_hidden_state2_GT.m_data, 55 | last_hidden_state2_GT.length(), 1e-8); 56 | 57 | temp_key_value = Matrix3D(mem_buf.get_fpbuffer(b * (sqlen + 1) * embed_dim), num_heads, (sqlen + 1), 58 | embed_dim / num_heads); 59 | for (int i = 0; i < num_layers; i++) { 60 | std::string path = "assets/llama/tests/decoder/2nd/past_key_value/key" + std::to_string(i) + ".bin"; 61 | temp_key_value.load(path.c_str()); 62 | success &= 63 | check_two_equal(output_2nd.past_keys[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 64 | 65 | path = "assets/llama/tests/decoder/2nd/past_key_value/value" + std::to_string(i) + ".bin"; 66 | temp_key_value.load(path.c_str()); 67 | success &= 68 | check_two_equal(output_2nd.past_values[i].m_data, temp_key_value.m_data, temp_key_value.length(), 1e-8); 69 | } 70 | 71 | if (!success) 72 | std::cout << "-------- Test of " << __func__ << ": Fail! -------- " << std::endl; 73 | else 74 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 75 | } 76 | 77 | int main() { 78 | // This tests are directly from fp32 and are not completed yet! 79 | // test_Decoder(); 80 | } 81 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int8OPTDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Int8OPTDecoderLayer.h" 2 | 3 | #include "utils.h" 4 | 5 | // Shared memory space across all layers 6 | static float *hidden_states_float_arr; 7 | static int8_t *final_layer_norm_arr; 8 | static int8_t *fc_1_arr; 9 | static float *fc_2_arr; 10 | static float *temp; 11 | static int8_t *hidden_states_int8_arr; 12 | 13 | template 14 | void add(Matrix3D a, Matrix3D b, Matrix3D c) { 15 | PROFILE_START("Int8OPTDecoderLayer::add"); 16 | assert(c.length() == a.length() && a.length() == b.length()); 17 | 18 | for (int i = 0; i < a.length(); i++) { 19 | c.m_data[i] = a.m_data[i] + b.m_data[i]; 20 | } 21 | PROFILE_END("Int8OPTDecoderLayer::add"); 22 | } 23 | 24 | struct Int8OPTDecoderLayer_output Int8OPTDecoderLayer::forward(const struct Int8OPTDecoderLayer_input &input) { 25 | PROFILE_START(profile_name); 26 | // Layernorm 27 | Matrix3D hidden_states_int8(hidden_states_int8_arr, input.hidden_states.m_dim_x, 28 | input.hidden_states.m_dim_y, input.hidden_states.m_dim_z); 29 | this->self_attn_layer_norm.forward(input.hidden_states, hidden_states_int8); 30 | 31 | // Attention 32 | struct Int8OPTAttention_input attn_param(hidden_states_int8, input.attention_mask, input.past_key, input.past_value, 33 | input.has_past_key_value, this->layer_idx); 34 | struct Int8OPTAttention_output attn_output = this->attn.forward(attn_param); 35 | 36 | // Residual add 37 | Matrix3D residual_add(hidden_states_float_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 38 | input.hidden_states.m_dim_z); 39 | add(input.hidden_states, attn_output.attn_output, residual_add); 40 | 41 | // Layernorm 42 | Matrix3D final_layer_norm(final_layer_norm_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 43 | input.hidden_states.m_dim_z); 44 | this->final_layer_norm.forward(residual_add, final_layer_norm); 45 | 46 | // FC 47 | Matrix3D fc1_out(fc_1_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, this->hidden_dim); 48 | this->fc1.forward(final_layer_norm, fc1_out); 49 | Matrix3D fc2_out(fc_2_arr, input.hidden_states.m_dim_x, input.hidden_states.m_dim_y, 50 | input.hidden_states.m_dim_z); 51 | 
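// Annotation (not in the original source): fc1 is a W8A8B8O8LinearReLU (int8 in/out with fused
// ReLU); fc2 is a W8A8BFP32OFP32Linear, which consumes the int8 activations and produces fp32
// output so it can be added back onto the fp32 residual below.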
this->fc2.forward(fc1_out, fc2_out); 52 | 53 | // Reidual add 54 | add(residual_add, fc2_out, residual_add); 55 | 56 | struct Int8OPTDecoderLayer_output output(residual_add, attn_output.attn_probs_reshaped, attn_output.past_key_value); 57 | PROFILE_END(profile_name); 58 | return output; 59 | } 60 | 61 | Int8OPTDecoderLayer::Int8OPTDecoderLayer(std::string param_path, const struct model_config config, int layer_idx, 62 | LayerNormQ self_attn_layer_norm, LayerNormQ final_layer_norm, 63 | W8A8B8O8LinearReLU fc1, W8A8BFP32OFP32Linear fc2, BMM_S8T_S8N_F32T qk_bmm, 64 | BMM_S8T_S8N_S8T pv_bmm, W8A8B8O8Linear k_proj, W8A8B8O8Linear v_proj, 65 | W8A8B8O8Linear q_proj, W8A8BFP32OFP32Linear out_proj) { 66 | if (layer_idx == 0) { 67 | allocate_aligned_memory(hidden_states_float_arr, config.max_sqlen * config.embed_dim * sizeof(float)); 68 | allocate_aligned_memory(final_layer_norm_arr, config.max_sqlen * config.embed_dim * sizeof(int8_t)); 69 | allocate_aligned_memory(fc_1_arr, config.max_sqlen * config.hidden_dim * sizeof(int8_t)); 70 | allocate_aligned_memory(fc_2_arr, config.max_sqlen * config.embed_dim * sizeof(float)); 71 | allocate_aligned_memory(hidden_states_int8_arr, config.max_sqlen * config.embed_dim * sizeof(int8_t)); 72 | Int8OPTAttention::initialized_memory(config); 73 | } 74 | 75 | load_LayerNormQ(self_attn_layer_norm, param_path + "/self_attn_layer_norm"); 76 | load_W8A8B8O8LinearReLU_params(fc1, param_path + "/fc1"); 77 | load_W8A8BFP32OFP32Linear_params(fc2, param_path + "/fc2"); 78 | load_LayerNormQ(final_layer_norm, param_path + "/final_layer_norm"); 79 | 80 | this->embed_dim = config.embed_dim; 81 | this->num_attention_heads = config.num_heads; 82 | this->hidden_dim = config.hidden_dim; 83 | this->layer_idx = layer_idx; 84 | this->self_attn_layer_norm = self_attn_layer_norm; 85 | this->fc1 = fc1; 86 | this->fc2 = fc2; 87 | this->final_layer_norm = final_layer_norm; 88 | 89 | this->attn = Int8OPTAttention(param_path + "/self_attn", config, qk_bmm, pv_bmm, k_proj, v_proj, q_proj, out_proj); 90 | } 91 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Fp32llamaDecoderLayer() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | hidden_states.load("assets/llama/tests/layer0/sqlen9/hidden_states.bin"); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/llama/tests/layer0/sqlen9/attention_mask.bin"); 19 | 20 | struct Fp32llamaDecoderLayer_input input(hidden_states, attention_mask); 21 | 22 | struct Fp32llamaDecoderLayer_output output = layer.forward(input); 23 | 24 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | outputGT.load("assets/llama/tests/layer0/sqlen9/output_hidden_states.bin"); 26 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, 
embed_dim / num_heads); 27 | key_statesGT.load("assets/llama/tests/layer0/sqlen9/present_key.bin"); 28 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 29 | value_statesGT.load("assets/llama/tests/layer0/sqlen9/present_value.bin"); 30 | 31 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 32 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 33 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 34 | if (!success) 35 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 36 | else 37 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 38 | } 39 | 40 | void test_Fp32llamaDecoderLayer_gen() { 41 | const struct model_config llama7B = llama_7B; 42 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 43 | head_dim = embed_dim / num_heads; 44 | const int tgz = (sqlen + past_sqlen); 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 49 | 50 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/layer0/sqlen1/hidden_states.bin"); 52 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * tgz), 1, sqlen, tgz); 53 | attention_mask.load("assets/llama/tests/layer0/sqlen1/attention_mask.bin"); 54 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/layer0/sqlen1/past_key.bin"); 56 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/layer0/sqlen1/past_value.bin"); 58 | 59 | struct Fp32llamaDecoderLayer_input input(hidden_states, attention_mask, past_key, past_value); 60 | 61 | struct Fp32llamaDecoderLayer_output output = layer.forward(input); 62 | 63 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 64 | outputGT.load("assets/llama/tests/layer0/sqlen1/output_hidden_states.bin"); 65 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, sqlen, embed_dim / num_heads); 66 | key_statesGT.load("assets/llama/tests/layer0/sqlen1/present_key.bin"); 67 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, tgz, embed_dim / num_heads); 68 | value_statesGT.load("assets/llama/tests/layer0/sqlen1/present_value.bin"); 69 | 70 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 71 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 72 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 73 | if (!success) 74 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 75 | else 76 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 77 | } 78 | 79 | int main() { 80 | test_Fp32llamaDecoderLayer(); 81 | test_Fp32llamaDecoderLayer_gen(); 82 | } 83 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaDecoderLayer.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoderLayer.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Int4llamaDecoderLayer() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | hidden_states.load("assets/llama/tests/layer0/sqlen9/hidden_states.bin"); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | attention_mask.load("assets/llama/tests/layer0/sqlen9/attention_mask.bin"); 19 | 20 | struct Int4llamaDecoderLayer_input input(hidden_states, attention_mask); 21 | 22 | struct Int4llamaDecoderLayer_output output = layer.forward(input); 23 | 24 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 25 | outputGT.load("assets/llama/tests/layer0/sqlen9/output_hidden_states.bin"); 26 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 27 | key_statesGT.load("assets/llama/tests/layer0/sqlen9/present_key.bin"); 28 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 29 | value_statesGT.load("assets/llama/tests/layer0/sqlen9/present_value.bin"); 30 | 31 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 32 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 33 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 34 | if (!success) 35 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 36 | else 37 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 38 | } 39 | 40 | void test_Int4llamaDecoderLayer_gen() { 41 | const struct model_config llama7B = llama_7B; 42 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 43 | head_dim = embed_dim / num_heads; 44 | const int tgz = (sqlen + past_sqlen); 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer("models/LLaMA_7B/decoder/layer0", llama7B, 0); 49 | 50 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/layer0/sqlen1/hidden_states.bin"); 52 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * tgz), 1, sqlen, tgz); 53 | attention_mask.load("assets/llama/tests/layer0/sqlen1/attention_mask.bin"); 54 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/layer0/sqlen1/past_key.bin"); 56 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/layer0/sqlen1/past_value.bin"); 58 | 59 | struct Int4llamaDecoderLayer_input input(hidden_states, attention_mask, past_key, past_value); 60 | 61 | struct Int4llamaDecoderLayer_output output = layer.forward(input); 62 | 63 | Matrix3D outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 64 | outputGT.load("assets/llama/tests/layer0/sqlen1/output_hidden_states.bin"); 65 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, sqlen, embed_dim / num_heads); 66 | key_statesGT.load("assets/llama/tests/layer0/sqlen1/present_key.bin"); 67 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(tgz * embed_dim), num_heads, tgz, embed_dim / num_heads); 68 | value_statesGT.load("assets/llama/tests/layer0/sqlen1/present_value.bin"); 69 | 70 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 71 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 72 | success &= check_two_equal(outputGT.m_data, output.hidden_states.m_data, outputGT.length()); 73 | if (!success) 74 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 75 | else 76 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 77 | } 78 | 79 | int main() { 80 | // This tests are directly from fp32 and are not completed yet! 
81 | test_Int4llamaDecoderLayer(); 82 | test_Int4llamaDecoderLayer_gen(); 83 | } 84 | -------------------------------------------------------------------------------- /kernels/starter_code/reference.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #include "common.h" 10 | 11 | namespace matmul { 12 | void MatmulOperator::mat_mul_reference(struct matmul_params *params) { 13 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 14 | const int block_size = params->block_size; // block_size = 32 15 | float *scale = params->scales, *offset = params->offset; 16 | 17 | quantize_fp32_to_int8(A->data_ptr, A->int8_data_ptr, params->A_scales, A->row * A->column, block_size); 18 | 19 | int m = C->row, n = C->column, k = A->column; 20 | // A: m x k; B: n x k; C: m x n 21 | for (int row = 0; row < m; row++) { 22 | for (int col = 0; col < n; col++) { 23 | float acc = 0; 24 | // Compute each block 25 | for (int ch = 0; ch < k;) { 26 | // pointer of the int4 weights 27 | uint8_t *w_int4 = &B->int4_data_ptr[(col * k + ch) / 2]; 28 | // pointer of the int8 activation 29 | const signed char *a_int8 = &A->int8_data_ptr[row * k + ch]; 30 | // scale of weight 31 | float s_w = params->scales[(col * k + ch) / block_size]; 32 | // scale of activation 33 | float s_a = params->A_scales[(row * k + ch) / block_size]; 34 | #ifdef QM_ARM 35 | // order of weights with QM_ARM: 36 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w30,w31) 37 | // QM_ARM order: (w0,w16),(w1,w17),(w2,w18),(w3,w19),(w4, w20),... (w15,w31) 38 | // |--| 39 | // 4 bits 40 | // |------| 41 | // 8 bits (byte) 42 | // low|----------------------------------------------------------|high 43 | // 0 128 bit 127 44 | // process 16 bytes of weigths (128 bit) = 1 block 45 | // intermediate variable to store sum of integer multiplication and accumulation 46 | int intermediate_sum = 0; 47 | for (int qj = 0; qj < 16; qj++) { 48 | // decode a packed byte into two int8 in the range of (-8, 7) 49 | uint8_t packed_int4_0 = w_int4[qj]; 50 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 51 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 52 | // int8 multiply and accumulate operation 53 | intermediate_sum += a_int8[qj] * w_de_0; 54 | intermediate_sum += a_int8[qj + 16] * w_de_16; 55 | } 56 | // dequantize the sum into floating point 57 | acc += (float)intermediate_sum * s_a * s_w; 58 | ch += block_size; 59 | #endif 60 | #ifdef QM_x86 61 | // scales of the second block 62 | float s_w_2nd = params->scales[(col * k + ch) / block_size + 1]; 63 | float s_a_2nd = params->A_scales[(row * k + ch) / block_size + 1]; 64 | // order of weights with QM_x86: 65 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w62,w63) 66 | // QM_ARM order: (w0,w32),(w1,w33),(w2,w34),(w3,w35),(w4, w36),... 
(w31,w63) 67 | // |--| 68 | // 4 bits 69 | // |------| 70 | // 8 bits (byte) 71 | // low|----------------------------------------------------------|high 72 | // 0 256 bit 73 | // process 32 bytes of weigths (256 bit) = 2 blocks 74 | // intermediate variable to store sum of integer multiplication and accumulation 75 | int intermediate_sum = 0, intermediate_sum_2nd = 0; 76 | for (int qj = 0; qj < 32; qj++) { 77 | // decode a packed byte into two int8 in the range of (-8, 7) 78 | uint8_t packed_int4_0 = w_int4[qj]; 79 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 80 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 81 | // int8 multiply and accumulate operation 82 | intermediate_sum += a_int8[qj] * w_de_0; 83 | intermediate_sum_2nd += a_int8[qj + 32] * w_de_16; 84 | } 85 | // dequantize the sum into floating point 86 | acc += (float)intermediate_sum * s_a * s_w; 87 | acc += (float)intermediate_sum_2nd * s_a_2nd * s_w_2nd; 88 | ch += block_size * 2; 89 | #endif 90 | } 91 | C->data_ptr[row * n + col] = acc; 92 | } 93 | } 94 | }; 95 | } // namespace matmul 96 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Fp32llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaDecoder.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "utils.h" 7 | 8 | Matrix3D Fp32llamaDecoder::prepare_decoder_attention_mask(int length, int past_length) { 9 | PROFILE_START("Fp32llamaDecoder::prepare_decoder_attention_mask"); 10 | assert(length - past_length > 0); 11 | Matrix3D causal_attention_mask(attention_mask_buf, 1, length - past_length, length); 12 | float min = std::numeric_limits::lowest(); 13 | for (int i = 0; i < length - past_length; i++) { 14 | for (int j = 0; j < length; j++) { 15 | if (i + past_length < j) { 16 | causal_attention_mask(0, i, j) = min; 17 | } else { 18 | causal_attention_mask(0, i, j) = 0.0; 19 | } 20 | } 21 | } 22 | 23 | PROFILE_END("Fp32llamaDecoder::prepare_decoder_attention_mask"); 24 | return causal_attention_mask; 25 | } 26 | 27 | Fp32llamaDecoder::Fp32llamaDecoder(std::string param_path, const struct model_config config) { 28 | allocate_aligned_memory(attention_mask_buf, config.max_sqlen * config.max_sqlen * sizeof(float)); 29 | allocate_aligned_memory(pos_embeds_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 30 | allocate_aligned_memory(last_hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 31 | allocate_aligned_memory(hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 32 | 33 | this->voc_size = config.vocsize; 34 | this->embed_dim = config.embed_dim; 35 | this->hidden_dim = config.hidden_dim; 36 | this->num_heads = config.num_heads; 37 | this->padding_idx = config.padding_idx; 38 | 39 | int max_sqlen = config.max_sqlen; 40 | 41 | // Embedding 42 | Matrix3D embweight(new float[voc_size * embed_dim], 1, voc_size, embed_dim); 43 | this->embed_tokens = Embedding(embed_dim, voc_size, padding_idx, embweight); 44 | load_Embedding_params(this->embed_tokens, param_path + "/embed_tokens"); 45 | 46 | // Norm 47 | Matrix3D norm_weight(new float[embed_dim], 1, 1, embed_dim); 48 | norm_weight.load((param_path + "/norm/weight.bin").c_str()); 49 | this->norm = LlamaRMSNorm(norm_weight); 50 | 51 | // Load all the decoder layers 52 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 53 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." 
<< std::endl;) 54 | 55 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 56 | Fp32llamaDecoderLayer layer = Fp32llamaDecoderLayer(path, config, layer_idx); 57 | 58 | this->layers.push_back(layer); 59 | } 60 | }; 61 | 62 | // Fp32llamaDecoder: 63 | struct Fp32llamaDecoder_output Fp32llamaDecoder::forward(const struct Fp32llamaDecoder_input &input) { 64 | PROFILE_START(profile_name); 65 | int sqlen = input.input_ids.m_dim_z, batch_size = input.input_ids.m_dim_x, past_key_values_length = 0; 66 | 67 | // Input token -> Embedding 68 | float inputs_embeds_buf[sqlen * this->embed_dim]; 69 | Matrix3D inputs_embeds(inputs_embeds_buf, 1, sqlen, this->embed_dim); 70 | this->embed_tokens.forward(input.input_ids, inputs_embeds); 71 | 72 | if (input.has_past_keys_values) { 73 | past_key_values_length = input.past_keys[0].m_dim_y; 74 | } 75 | 76 | // Attention mask 77 | Matrix3D causal_attention_mask = 78 | this->prepare_decoder_attention_mask(sqlen + past_key_values_length, past_key_values_length); 79 | 80 | // Go through each layer 81 | Matrix3D hidden_states = inputs_embeds; 82 | std::vector> past_keys, past_values; 83 | for (int i = 0; i < this->layers.size(); i++) { 84 | if (!input.has_past_keys_values) { 85 | struct Fp32llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask}; 86 | struct Fp32llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 87 | hidden_states = l_o.hidden_states; 88 | past_keys.push_back(l_o.past_key_value.first); 89 | past_values.push_back(l_o.past_key_value.second); 90 | } else { 91 | struct Fp32llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask, input.past_keys[i], 92 | input.past_values[i]}; 93 | struct Fp32llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 94 | hidden_states = l_o.hidden_states; 95 | past_keys.push_back(l_o.past_key_value.first); 96 | past_values.push_back(l_o.past_key_value.second); 97 | } 98 | } 99 | 100 | // Layernorm 101 | Matrix3D last_hidden_states(last_hidden_states_buf, 1, sqlen, this->embed_dim); 102 | this->norm.forward(hidden_states, last_hidden_states); 103 | 104 | struct Fp32llamaDecoder_output output = {last_hidden_states, past_keys, past_values}; 105 | PROFILE_END(profile_name); 106 | return output; 107 | } 108 | -------------------------------------------------------------------------------- /transformer/src/nn_modules/Int4llamaDecoder.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaDecoder.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "utils.h" 7 | 8 | Matrix3D Int4llamaDecoder::prepare_decoder_attention_mask(int length, int past_length) { 9 | PROFILE_START("Int4llamaDecoder::prepare_decoder_attention_mask"); 10 | assert(length - past_length > 0); 11 | Matrix3D causal_attention_mask(attention_mask_buf, 1, length - past_length, length); 12 | float min = std::numeric_limits::lowest(); 13 | for (int i = 0; i < length - past_length; i++) { 14 | for (int j = 0; j < length; j++) { 15 | if (i + past_length < j) { 16 | causal_attention_mask(0, i, j) = min; 17 | } else { 18 | causal_attention_mask(0, i, j) = 0.0; 19 | } 20 | } 21 | } 22 | 23 | PROFILE_END("Int4llamaDecoder::prepare_decoder_attention_mask"); 24 | return causal_attention_mask; 25 | } 26 | 27 | Int4llamaDecoder::Int4llamaDecoder(std::string param_path, const struct model_config config) { 28 | allocate_aligned_memory(attention_mask_buf, config.max_sqlen * config.max_sqlen * sizeof(float)); 29 | allocate_aligned_memory(pos_embeds_buf, config.max_sqlen * 
config.embed_dim * sizeof(float)); 30 | allocate_aligned_memory(last_hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 31 | allocate_aligned_memory(hidden_states_buf, config.max_sqlen * config.embed_dim * sizeof(float)); 32 | 33 | this->voc_size = config.vocsize; 34 | this->embed_dim = config.embed_dim; 35 | this->hidden_dim = config.hidden_dim; 36 | this->num_heads = config.num_heads; 37 | this->padding_idx = config.padding_idx; 38 | 39 | int max_sqlen = config.max_sqlen; 40 | 41 | // Embedding 42 | Matrix3D embweight(new float[voc_size * embed_dim], 1, voc_size, embed_dim); 43 | this->embed_tokens = Embedding(embed_dim, voc_size, padding_idx, embweight); 44 | load_Embedding_params(this->embed_tokens, param_path + "/embed_tokens"); 45 | 46 | // Norm 47 | Matrix3D norm_weight(new float[embed_dim], 1, 1, embed_dim); 48 | norm_weight.load((param_path + "/norm/weight.bin").c_str()); 49 | this->norm = LlamaRMSNorm(norm_weight); 50 | 51 | // Load all the decoder layers 52 | for (int layer_idx = 0; layer_idx < config.num_layers; layer_idx++) { 53 | DEBUG_INS(std::cout << "Start loading layer:" << layer_idx << "..." << std::endl;) 54 | 55 | std::string path = param_path + "/layer" + std::to_string(layer_idx); 56 | Int4llamaDecoderLayer layer = Int4llamaDecoderLayer(path, config, layer_idx); 57 | 58 | this->layers.push_back(layer); 59 | } 60 | }; 61 | 62 | // Int4llamaDecoder: 63 | struct Int4llamaDecoder_output Int4llamaDecoder::forward(const struct Int4llamaDecoder_input &input) { 64 | PROFILE_START(profile_name); 65 | int sqlen = input.input_ids.m_dim_z, batch_size = input.input_ids.m_dim_x, past_key_values_length = 0; 66 | 67 | // Input token -> Embedding 68 | float inputs_embeds_buf[sqlen * this->embed_dim]; 69 | Matrix3D inputs_embeds(inputs_embeds_buf, 1, sqlen, this->embed_dim); 70 | this->embed_tokens.forward(input.input_ids, inputs_embeds); 71 | 72 | if (input.has_past_keys_values) { 73 | past_key_values_length = input.past_keys[0].m_dim_y; 74 | } 75 | 76 | // Attention mask 77 | Matrix3D causal_attention_mask = 78 | this->prepare_decoder_attention_mask(sqlen + past_key_values_length, past_key_values_length); 79 | 80 | // Go through each layer 81 | Matrix3D hidden_states = inputs_embeds; 82 | std::vector> past_keys, past_values; 83 | for (int i = 0; i < this->layers.size(); i++) { 84 | if (!input.has_past_keys_values) { 85 | struct Int4llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask}; 86 | struct Int4llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 87 | hidden_states = l_o.hidden_states; 88 | past_keys.push_back(l_o.past_key_value.first); 89 | past_values.push_back(l_o.past_key_value.second); 90 | } else { 91 | struct Int4llamaDecoderLayer_input l_i = {hidden_states, causal_attention_mask, input.past_keys[i], 92 | input.past_values[i]}; 93 | struct Int4llamaDecoderLayer_output l_o = this->layers[i].forward(l_i); 94 | hidden_states = l_o.hidden_states; 95 | past_keys.push_back(l_o.past_key_value.first); 96 | past_values.push_back(l_o.past_key_value.second); 97 | } 98 | } 99 | 100 | // Layernorm 101 | Matrix3D last_hidden_states(last_hidden_states_buf, 1, sqlen, this->embed_dim); 102 | this->norm.forward(hidden_states, last_hidden_states); 103 | 104 | struct Int4llamaDecoder_output output = {last_hidden_states, past_keys, past_values}; 105 | PROFILE_END(profile_name); 106 | return output; 107 | } 108 | -------------------------------------------------------------------------------- /kernels/quantizer.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #ifdef QM_ARM 4 | #include 5 | void quantize_fp32_to_int8(float* A, int8_t* qA, float* sA, int size, int block_size) { 6 | assert(size % block_size == 0); 7 | assert(block_size == 32); 8 | int num_block = size / 32; 9 | 10 | for (int i = 0; i < num_block; i++) { 11 | float32x4_t srcv[8]; 12 | float32x4_t asrcv[8]; 13 | float32x4_t amaxv[8]; 14 | 15 | int8_t* start_qA = &qA[i * 32]; 16 | 17 | for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(A + i * 32 + 4 * l); 18 | for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); 19 | 20 | for (int l = 0; l < 4; l++) amaxv[2 * l] = vmaxq_f32(asrcv[2 * l], asrcv[2 * l + 1]); 21 | for (int l = 0; l < 2; l++) amaxv[4 * l] = vmaxq_f32(amaxv[4 * l], amaxv[4 * l + 2]); 22 | for (int l = 0; l < 1; l++) amaxv[8 * l] = vmaxq_f32(amaxv[8 * l], amaxv[8 * l + 4]); 23 | 24 | const float amax = vmaxvq_f32(amaxv[0]); 25 | 26 | const float d = amax / ((1 << 7) - 1); 27 | const float id = d ? 1.0f / d : 0.0f; 28 | 29 | sA[i] = d; 30 | 31 | // low half 32 | for (int l = 0; l < 4; l++) { 33 | const float32x4_t v = vmulq_n_f32(srcv[l], id); 34 | const int32x4_t vi = vcvtnq_s32_f32(v); 35 | 36 | start_qA[4 * l + 0] = vgetq_lane_s32(vi, 0); 37 | start_qA[4 * l + 1] = vgetq_lane_s32(vi, 1); 38 | start_qA[4 * l + 2] = vgetq_lane_s32(vi, 2); 39 | start_qA[4 * l + 3] = vgetq_lane_s32(vi, 3); 40 | } 41 | 42 | // high half 43 | for (int l = 4; l < 8; l++) { 44 | const float32x4_t v = vmulq_n_f32(srcv[l], id); 45 | const int32x4_t vi = vcvtnq_s32_f32(v); 46 | 47 | start_qA[4 * l + 0] = vgetq_lane_s32(vi, 0); 48 | start_qA[4 * l + 1] = vgetq_lane_s32(vi, 1); 49 | start_qA[4 * l + 2] = vgetq_lane_s32(vi, 2); 50 | start_qA[4 * l + 3] = vgetq_lane_s32(vi, 3); 51 | } 52 | } 53 | } 54 | #endif 55 | #ifdef QM_x86 56 | #include 57 | void quantize_fp32_to_int8(float* A, int8_t* qA, float* sA, int size, int block_size) { 58 | int nb = size / 32; 59 | for (int i = 0; i < nb; i++) { 60 | // Load elements into 4 AVX vectors 61 | __m256 v0 = _mm256_loadu_ps(A); 62 | __m256 v1 = _mm256_loadu_ps(A + 8); 63 | __m256 v2 = _mm256_loadu_ps(A + 16); 64 | __m256 v3 = _mm256_loadu_ps(A + 24); 65 | A += 32; 66 | 67 | // Compute max(abs(e)) for the block 68 | const __m256 signBit = _mm256_set1_ps(-0.0f); 69 | __m256 maxAbs = _mm256_andnot_ps(signBit, v0); 70 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v1)); 71 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v2)); 72 | maxAbs = _mm256_max_ps(maxAbs, _mm256_andnot_ps(signBit, v3)); 73 | 74 | __m128 max4 = _mm_max_ps(_mm256_extractf128_ps(maxAbs, 1), _mm256_castps256_ps128(maxAbs)); 75 | max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4)); 76 | max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4)); 77 | const float maxScalar = _mm_cvtss_f32(max4); 78 | 79 | // Quantize these floats 80 | const float d = maxScalar / 127.f; 81 | *sA++ = d; 82 | const float id = (maxScalar != 0.0f) ? 
127.f / maxScalar : 0.0f; 83 | const __m256 mul = _mm256_set1_ps(id); 84 | 85 | // Apply the multiplier 86 | v0 = _mm256_mul_ps(v0, mul); 87 | v1 = _mm256_mul_ps(v1, mul); 88 | v2 = _mm256_mul_ps(v2, mul); 89 | v3 = _mm256_mul_ps(v3, mul); 90 | 91 | // Round to nearest integer 92 | v0 = _mm256_round_ps(v0, _MM_ROUND_NEAREST); 93 | v1 = _mm256_round_ps(v1, _MM_ROUND_NEAREST); 94 | v2 = _mm256_round_ps(v2, _MM_ROUND_NEAREST); 95 | v3 = _mm256_round_ps(v3, _MM_ROUND_NEAREST); 96 | 97 | // Convert floats to integers 98 | __m256i i0 = _mm256_cvtps_epi32(v0); 99 | __m256i i1 = _mm256_cvtps_epi32(v1); 100 | __m256i i2 = _mm256_cvtps_epi32(v2); 101 | __m256i i3 = _mm256_cvtps_epi32(v3); 102 | 103 | // Convert int32 to int16 104 | i0 = _mm256_packs_epi32(i0, i1); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 105 | i2 = _mm256_packs_epi32(i2, i3); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 106 | // Convert int16 to int8 107 | i0 = _mm256_packs_epi16(i0, i2); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 108 | // 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 109 | 110 | // We got our precious signed bytes, but the order is now wrong 111 | // These AVX2 pack instructions process 16-byte pieces independently 112 | // The following instruction is fixing the order 113 | const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); 114 | i0 = _mm256_permutevar8x32_epi32(i0, perm); 115 | 116 | _mm256_storeu_si256((__m256i*)qA, i0); 117 | qA += 32; 118 | } 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /transformer/llama_exporter.py: -------------------------------------------------------------------------------- 1 | """Implementation of exporting LLaMA PyTorch model to TinyLLMEngine format. 
2 | 3 | Usage: 4 | python llama_exporter.py 5 | 6 | Example commandline: 7 | python llama_exporter.py ~/llama2-chat/hf7B models/LLaMA_7B_2_chat 8 | """ 9 | import argparse 10 | import math 11 | import os 12 | import struct 13 | 14 | import torch 15 | from transformers import LlamaForCausalLM 16 | 17 | 18 | @torch.no_grad() 19 | def _export_model(model, prefix): 20 | 21 | outpath = prefix 22 | os.makedirs(outpath, exist_ok=True) 23 | with open(os.path.join(f"{outpath}", "lm_head.bin"), "wb") as f: 24 | f.write(model.lm_head._parameters["weight"].cpu().float().numpy().tobytes()) 25 | _export_llama_model(model.model, os.path.join(f"{outpath}", "decoder")) 26 | 27 | 28 | def _export_embed_tokens(embed_tokens, prefix): 29 | outpath = prefix 30 | os.makedirs(outpath, exist_ok=True) 31 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 32 | f.write(embed_tokens.weight.cpu().float().numpy().tobytes()) 33 | 34 | 35 | def _export_llama_model(model, prefix): 36 | outpath = prefix 37 | os.makedirs(outpath, exist_ok=True) 38 | 39 | _export_embed_tokens(model.embed_tokens, os.path.join(outpath, "embed_tokens")) 40 | _export_LlamaRMSNorm(model.norm, os.path.join(outpath, "norm")) 41 | for idx, layer in enumerate(model.layers): 42 | _export_llama_layer(layer, os.path.join(outpath, f"layer{idx}")) 43 | 44 | 45 | def _export_LlamaRMSNorm(op, prefix): 46 | outpath = prefix 47 | os.makedirs(outpath, exist_ok=True) 48 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 49 | f.write(op.weight.cpu().float().numpy().tobytes()) 50 | 51 | 52 | def _export_llama_layer(layer, prefix): 53 | outpath = prefix 54 | os.makedirs(outpath, exist_ok=True) 55 | _export_attention_params(layer.self_attn, os.path.join(outpath, "self_attn")) 56 | _export_LlamaRMSNorm(layer.input_layernorm, os.path.join(outpath, "input_layernorm")) 57 | _export_LlamaRMSNorm( 58 | layer.post_attention_layernorm, 59 | os.path.join(outpath, "post_attention_layernorm"), 60 | ) 61 | _export_linearfp(layer.mlp.gate_proj, os.path.join(outpath, "gate_proj")) 62 | _export_linearfp(layer.mlp.down_proj, os.path.join(outpath, "down_proj")) 63 | _export_linearfp(layer.mlp.up_proj, os.path.join(outpath, "up_proj")) 64 | 65 | 66 | def _export_linearfp(op, prefix): 67 | outpath = prefix 68 | os.makedirs(outpath, exist_ok=True) 69 | with open(os.path.join(f"{outpath}", "weight.bin"), "wb") as f: 70 | f.write(op._parameters["weight"].cpu().float().numpy().tobytes()) 71 | 72 | 73 | def _export_rotaryEmbedding(op, prefix): 74 | outpath = prefix 75 | os.makedirs(outpath, exist_ok=True) 76 | with open(os.path.join(f"{outpath}", "cos_cached.bin"), "wb") as f: 77 | f.write(op.cos_cached.cpu().float().numpy().tobytes()) 78 | with open(os.path.join(f"{outpath}", "sin_cached.bin"), "wb") as f: 79 | f.write(op.sin_cached.cpu().float().numpy().tobytes()) 80 | 81 | 82 | def _export_BMM_F32T(alpha, prefix): 83 | outpath = prefix 84 | os.makedirs(outpath, exist_ok=True) 85 | with open(os.path.join(f"{outpath}", "alpha.bin"), "wb") as f: 86 | f.write(struct.pack("f", alpha)) 87 | 88 | 89 | def _export_attention_params(attn, prefix: str): 90 | outpath = prefix 91 | os.makedirs(outpath, exist_ok=True) 92 | _export_linearfp(attn.k_proj, os.path.join(outpath, "k_proj")) 93 | _export_linearfp(attn.v_proj, os.path.join(outpath, "v_proj")) 94 | _export_linearfp(attn.q_proj, os.path.join(outpath, "q_proj")) 95 | _export_linearfp(attn.o_proj, os.path.join(outpath, "o_proj")) 96 | qk_bmm_alpha = 1 / math.sqrt(attn.head_dim) 97 | _export_BMM_F32T(qk_bmm_alpha, 
os.path.join(outpath, "qk_bmm")) 98 | _export_rotaryEmbedding(attn.rotary_emb, os.path.join(outpath, "rotary_emb")) 99 | 100 | 101 | def main(): 102 | """Export a LLaMA model to TinyLLMEngine format.""" 103 | parser = argparse.ArgumentParser(description="export LLaMA pytorch model to TinyLLMEngine format.") 104 | parser.add_argument("model", type=str, help="Path of the LLaMA torch model") 105 | parser.add_argument("output", type=str, help="Output directory of the exported model") 106 | 107 | args = parser.parse_args() 108 | 109 | if not os.path.exists(args.model): 110 | print(f"The model path '{args.model}' does not exist.") 111 | return 112 | 113 | if not os.path.exists(args.output): 114 | print(f"The output path '{args.output}' does not exist.") 115 | return 116 | 117 | print("Loading model...") 118 | if args.model.endswith(".pt"): 119 | model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", torch_dtype=torch.float16) 120 | model.load_state_dict(torch.load(args.model)) 121 | else: 122 | model = LlamaForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16) 123 | 124 | print("Start exporting the model...") 125 | _export_model(model, args.output) 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /transformer/tests/test_Int4llamaAttention.cc: -------------------------------------------------------------------------------- 1 | #include "Int4llamaAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Int4llamaAttention() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Int4llamaAttention attn = Int4llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 13 | 14 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | read_to_array("assets/llama/tests/atten/sqlen9/hidden_states.bin", hidden_states.m_data, b * sqlen * embed_dim); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | read_to_array("assets/llama/tests/atten/sqlen9/attention_mask.bin", attention_mask.m_data, attention_mask.length()); 19 | 20 | attn.initialized_memory(llama7B); 21 | struct Int4llamaAttention_input input(hidden_states, attention_mask, 0); 22 | 23 | struct Int4llamaAttention_output output = attn.forward(input); 24 | 25 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 26 | read_to_array("assets/llama/tests/atten/sqlen9/attn_output.bin", attn_outputGT.m_data, b * sqlen * embed_dim); 27 | Matrix3D<float> key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 28 | read_to_array("assets/llama/tests/atten/sqlen9/past_key.bin", key_statesGT.m_data, b * sqlen * embed_dim); 29 | Matrix3D<float> value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 30 | read_to_array("assets/llama/tests/atten/sqlen9/past_value.bin", value_statesGT.m_data, b * sqlen * embed_dim); 31 | 32 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 33 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length()); 34 | success &=
check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 35 | if (!success) 36 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 37 | else 38 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 39 | } 40 | 41 | void test_Int4llamaAttention_gen() { 42 | const struct model_config llama7B = llama_7B; 43 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 44 | head_dim = embed_dim / num_heads; 45 | 46 | MemoryAllocator mem_buf; 47 | 48 | Int4llamaAttention attn = Int4llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 49 | 50 | Matrix3D<float> hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 51 | hidden_states.load("assets/llama/tests/atten/sqlen1/hidden_states.bin"); 52 | Matrix3D<float> attention_mask(mem_buf.get_fpbuffer(sqlen * (sqlen + past_sqlen)), b, sqlen, sqlen + past_sqlen); 53 | attention_mask.load("assets/llama/tests/atten/sqlen1/attention_mask.bin"); 54 | Matrix3D<float> past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 55 | past_key.load("assets/llama/tests/atten/sqlen9/past_key.bin"); 56 | Matrix3D<float> past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_value.load("assets/llama/tests/atten/sqlen9/past_value.bin"); 58 | 59 | attn.initialized_memory(llama7B); 60 | struct Int4llamaAttention_input input(hidden_states, attention_mask, past_key, past_value, true, 0); 61 | 62 | struct Int4llamaAttention_output output = attn.forward(input); 63 | 64 | Matrix3D<float> attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 65 | attn_outputGT.load("assets/llama/tests/atten/sqlen1/attn_output.bin"); 66 | Matrix3D<float> key_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, sqlen + past_sqlen, 67 | embed_dim / num_heads); 68 | key_statesGT.load("assets/llama/tests/atten/sqlen1/past_key.bin"); 69 | Matrix3D<float> value_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, 70 | sqlen + past_sqlen, embed_dim / num_heads); 71 | value_statesGT.load("assets/llama/tests/atten/sqlen1/past_value.bin"); 72 | 73 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 74 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length()); 75 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 76 | if (!success) 77 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 78 | else 79 | std::cout << "-------- Test of " << __func__ << ": Passed! -------- " << std::endl; 80 | } 81 | 82 | int main() { 83 | // These tests are ported directly from the fp32 version and are not complete yet!
84 | // test_Int4llamaAttention(); 85 | // test_Int4llamaAttention_gen(); 86 | } 87 | -------------------------------------------------------------------------------- /transformer/tests/test_Fp32llamaAttention.cc: -------------------------------------------------------------------------------- 1 | #include "Fp32llamaAttention.h" 2 | #include "operators.h" 3 | #include "utils.h" 4 | #include "utils_memalloc.h" 5 | 6 | void test_Fp32llamaAttention() { 7 | const struct model_config llama7B = llama_7B; 8 | const int sqlen = 9, b = 1, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads; 9 | 10 | MemoryAllocator mem_buf; 11 | 12 | Fp32llamaAttention attn = Fp32llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 13 | 14 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 15 | read_to_array("assets/llama/tests/atten/sqlen9/hidden_states.bin", hidden_states.m_data, b * sqlen * embed_dim); 16 | // print_first_k_elelment("hidden_states", hidden_states.m_data, 10); 17 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * sqlen), 1, sqlen, sqlen); 18 | read_to_array("assets/llama/tests/atten/sqlen9/attention_mask.bin", attention_mask.m_data, attention_mask.length()); 19 | 20 | attn.initialized_memory(llama7B); 21 | struct Fp32llamaAttention_input input(hidden_states, attention_mask, 0); 22 | 23 | struct Fp32llamaAttention_output output = attn.forward(input); 24 | 25 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 26 | read_to_array("assets/llama/tests/atten/sqlen9/attn_output.bin", attn_outputGT.m_data, b * sqlen * embed_dim); 27 | Matrix3D key_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 28 | read_to_array("assets/llama/tests/atten/sqlen9/past_key.bin", key_statesGT.m_data, b * sqlen * embed_dim); 29 | Matrix3D value_statesGT(mem_buf.get_fpbuffer(sqlen * embed_dim), num_heads, sqlen, embed_dim / num_heads); 30 | read_to_array("assets/llama/tests/atten/sqlen9/past_value.bin", value_statesGT.m_data, b * sqlen * embed_dim); 31 | 32 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 33 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 34 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 35 | // print_first_k_elelment("output.attn_output", output.attn_output.m_data, 20); 36 | // print_first_k_elelment("attn_outputGT", attn_outputGT.m_data, 20); 37 | if (!success) 38 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 39 | else 40 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 41 | } 42 | 43 | void test_Fp32llamaAttention_gen() { 44 | const struct model_config llama7B = llama_7B; 45 | const int sqlen = 1, b = 1, past_sqlen = 9, embed_dim = llama7B.embed_dim, num_heads = llama7B.num_heads, 46 | head_dim = embed_dim / num_heads; 47 | 48 | MemoryAllocator mem_buf; 49 | 50 | Fp32llamaAttention attn = Fp32llamaAttention("models/LLaMA_7B/decoder/layer0/self_attn", llama7B); 51 | 52 | Matrix3D hidden_states(mem_buf.get_fpbuffer(embed_dim * sqlen), b, sqlen, embed_dim); 53 | hidden_states.load("assets/llama/tests/atten/sqlen1/hidden_states.bin"); 54 | Matrix3D attention_mask(mem_buf.get_fpbuffer(sqlen * (sqlen + past_sqlen)), b, sqlen, sqlen + past_sqlen); 55 | attention_mask.load("assets/llama/tests/atten/sqlen1/attention_mask.bin"); 56 | Matrix3D past_key(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 57 | past_key.load("assets/llama/tests/atten/sqlen9/past_key.bin"); 58 | Matrix3D past_value(mem_buf.get_fpbuffer(past_sqlen * embed_dim), num_heads, past_sqlen, head_dim); 59 | past_value.load("assets/llama/tests/atten/sqlen9/past_value.bin"); 60 | 61 | attn.initialized_memory(llama7B); 62 | struct Fp32llamaAttention_input input(hidden_states, attention_mask, past_key, past_value, true, 0); 63 | 64 | struct Fp32llamaAttention_output output = attn.forward(input); 65 | 66 | Matrix3D attn_outputGT(mem_buf.get_fpbuffer(b * sqlen * embed_dim), b, sqlen, embed_dim); 67 | attn_outputGT.load("assets/llama/tests/atten/sqlen1/attn_output.bin"); 68 | Matrix3D key_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, sqlen + past_sqlen, 69 | embed_dim / num_heads); 70 | key_statesGT.load("assets/llama/tests/atten/sqlen1/past_key.bin"); 71 | Matrix3D value_statesGT(mem_buf.get_fpbuffer((sqlen + past_sqlen) * embed_dim), num_heads, 72 | sqlen + past_sqlen, embed_dim / num_heads); 73 | value_statesGT.load("assets/llama/tests/atten/sqlen1/past_value.bin"); 74 | 75 | bool success = check_two_equal(value_statesGT.m_data, output.past_key_value.second.m_data, value_statesGT.length()); 76 | success &= check_two_equal(key_statesGT.m_data, output.past_key_value.first.m_data, key_statesGT.length(), 1e-9); 77 | success &= check_two_equal(attn_outputGT.m_data, output.attn_output.m_data, attn_outputGT.length()); 78 | // print_first_k_elelment("output.attn_output", output.attn_output.m_data, 20); 79 | // print_first_k_elelment("attn_outputGT", attn_outputGT.m_data, 20); 80 | if (!success) 81 | std::cout << "Test of " << __func__ << ": Fail!" << std::endl; 82 | else 83 | std::cout << "-------- Test of " << __func__ << ": Passed! 
-------- " << std::endl; 84 | } 85 | 86 | int main() { 87 | test_Fp32llamaAttention(); 88 | test_Fp32llamaAttention_gen(); 89 | } 90 | -------------------------------------------------------------------------------- /kernels/starter_code/multithreading.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "../matmul.h" 9 | #include "common.h" 10 | struct multithreading_thread_args { 11 | int start, end; 12 | const struct matmul_params* params; 13 | }; 14 | static void* multithreading_worker_func(void* args) { 15 | struct multithreading_thread_args* mat_args = (struct multithreading_thread_args*)args; 16 | const struct matmul_params* params = mat_args->params; 17 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 18 | const int block_size = params->block_size; 19 | 20 | int m = C->row, n = C->column, k = A->column; 21 | // A: m x k; B: n x k; C: m x n 22 | for (int row = 0; row < m; row++) { 23 | for (int col = mat_args->start; col < mat_args->end; col++) { 24 | float acc = 0; 25 | // Compute each block 26 | for (int ch = 0; ch < k;) { 27 | // pointer of the int4 weights 28 | uint8_t* w_int4 = &B->int4_data_ptr[(col * k + ch) / 2]; 29 | // pointer of the int8 activation 30 | const signed char* a_int8 = &A->int8_data_ptr[row * k + ch]; 31 | // scale of weight 32 | float s_w = params->scales[(col * k + ch) / block_size]; 33 | // scale of activation 34 | float s_a = params->A_scales[(row * k + ch) / block_size]; 35 | #ifdef QM_ARM 36 | // order of weights with QM_ARM: 37 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w30,w31) 38 | // QM_ARM order: (w0,w16),(w1,w17),(w2,w18),(w3,w19),(w4, w20),... (w15,w31) 39 | // |--| 40 | // 4 bits 41 | // |------| 42 | // 8 bits (byte) 43 | // low|----------------------------------------------------------|high 44 | // 0 128 bit 127 45 | // process 16 bytes of weigths (128 bit) = 1 block 46 | // intermediate variable to store sum of integer multiplication and accumulation 47 | int intermediate_sum = 0; 48 | // process 16 bytes of weigths (128 bit) 49 | for (int qj = 0; qj < 16; qj++) { 50 | // decode a packed byte into two int8 in the range of (-8, 7) 51 | uint8_t packed_int4_0 = w_int4[qj]; 52 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 53 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 54 | // int8 multiply and accumulate operation 55 | intermediate_sum += a_int8[qj] * w_de_0; 56 | intermediate_sum += a_int8[qj + 16] * w_de_16; 57 | } 58 | // dequantize the sum into floating point 59 | acc += (float)intermediate_sum * s_a * s_w; 60 | ch += block_size; 61 | #endif 62 | #ifdef QM_x86 63 | // scales of the second block 64 | float s_w_2nd = params->scales[(col * k + ch) / block_size + 1]; 65 | float s_a_2nd = params->A_scales[(row * k + ch) / block_size + 1]; 66 | // order of weights with QM_x86: 67 | // origin order: (w0,w1), (w2,w3), (w4,w5), (w6,w7), (w8, w9), ... (w62,w63) 68 | // QM_ARM order: (w0,w32),(w1,w33),(w2,w34),(w3,w35),(w4, w36),... 
(w31,w63) 69 | // |--| 70 | // 4 bits 71 | // |------| 72 | // 8 bits (byte) 73 | // low|----------------------------------------------------------|high 74 | // 0 256 bit 75 | // process 32 bytes of weigths (256 bit) = 2 blocks 76 | // intermediate variable to store sum of integer multiplication and accumulation 77 | int intermediate_sum = 0, intermediate_sum_2nd = 0; 78 | for (int qj = 0; qj < 32; qj++) { 79 | // decode a packed byte into two int8 in the range of (-8, 7) 80 | uint8_t packed_int4_0 = w_int4[qj]; 81 | signed char w_de_0 = (packed_int4_0 & 0x0F) - 8.0; 82 | signed char w_de_16 = (packed_int4_0 >> 4) - 8.0; 83 | // int8 multiply and accumulate operation 84 | intermediate_sum += a_int8[qj] * w_de_0; 85 | intermediate_sum_2nd += a_int8[qj + 32] * w_de_16; 86 | } 87 | // dequantize the sum into floating point 88 | acc += (float)intermediate_sum * s_a * s_w; 89 | acc += (float)intermediate_sum_2nd * s_a_2nd * s_w_2nd; 90 | ch += block_size * 2; 91 | #endif 92 | } 93 | C->data_ptr[row * n + col] = acc; 94 | } 95 | } 96 | return NULL; 97 | } 98 | 99 | namespace matmul { 100 | void MatmulOperator::mat_mul_multithreading(struct matmul_params* params) { 101 | const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; 102 | const int block_size = params->block_size; 103 | 104 | quantize_fp32_to_int8(A->data_ptr, A->int8_data_ptr, params->A_scales, A->row * A->column, block_size); 105 | 106 | int m = C->row, n = C->column, k = A->column; 107 | 108 | const int num_thread = 4; 109 | pthread_t thread_pool[num_thread]; 110 | struct multithreading_thread_args threads_args[num_thread]; 111 | 112 | // TODO: Thread creation 113 | 114 | // TODO: Join threads 115 | }; 116 | } // namespace matmul 117 | --------------------------------------------------------------------------------
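A note on the two TODOs at the end of kernels/starter_code/multithreading.cc: one way to fill them in — a minimal sketch that assumes the n output columns are simply split into contiguous ranges across the num_thread workers, using pthread_create/pthread_join from <pthread.h>; it is not the reference solution — is to hand each worker its column range plus the shared params, launch the workers, and then join them:

        // Sketch only: contiguous column split; the last thread takes any remainder columns.
        int cols_per_thread = n / num_thread;
        for (int t = 0; t < num_thread; t++) {
            threads_args[t].start = t * cols_per_thread;
            threads_args[t].end = (t == num_thread - 1) ? n : (t + 1) * cols_per_thread;
            threads_args[t].params = params;
            pthread_create(&thread_pool[t], NULL, multithreading_worker_func, &threads_args[t]);
        }
        for (int t = 0; t < num_thread; t++) pthread_join(thread_pool[t], NULL);

Because each worker writes a disjoint set of columns of C, no synchronization beyond the final join is needed.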