├── src ├── models │ ├── common_params.h │ ├── CMakeLists.txt │ ├── llama │ │ └── llama_params.h │ └── basemodel.h ├── weights │ ├── CMakeLists.txt │ ├── llama │ │ ├── norm_weights.h │ │ ├── embedding_weights.h │ │ ├── CMakeLists.txt │ │ ├── ffn_weights.h │ │ ├── attention_weights.h │ │ ├── layer_weights.h │ │ ├── llama_weights.h │ │ └── llama_weights.cc │ ├── weight.h │ └── base_weights.h ├── layers │ ├── CMakeLists.txt │ ├── ffn │ │ ├── CMakeLists.txt │ │ ├── ffn.h │ │ └── ffn.cpp │ ├── attention │ │ ├── CMakeLists.txt │ │ ├── masked_self_attention.h │ │ ├── context_attention.h │ │ └── masked_self_attention.cpp │ └── decoder │ │ ├── CMakeLists.txt │ │ ├── self_decoder.h │ │ ├── context_decoder.h │ │ └── self_decoder.cpp ├── CMakeLists.txt ├── kernels │ ├── act_kernel.h │ ├── build_casual_mask.h │ ├── input_embedding.h │ ├── cal_paddingoffset.h │ ├── fused_transpose_and_remv_pad.h │ ├── attn_softmax_kernel.h │ ├── add_residual.h │ ├── repeat_kv.h │ ├── sampling.h │ ├── rmsnorm_kernel.h │ ├── fused_addresidual_norm.h │ ├── concat_past_kv.h │ ├── fused_decoder_self_attention.h │ ├── linear.h │ ├── topK_bk.h │ ├── qkv_bias_and_RoPE.h │ ├── topK.h │ ├── build_casual_mask.cu │ ├── cal_paddingoffset.cu │ ├── input_embedding.cu │ ├── act_kernel.cu │ ├── cublas_utils.h │ ├── add_residual.cu │ ├── fused_transpose_and_remv_pad.cu │ ├── sampling.cu │ ├── CMakeLists.txt │ ├── cublas_utils.cc │ ├── repeat_kv.cu │ ├── topK.cu │ ├── topK_bk.cu │ └── rmsnorm_kernel.cu ├── utils │ ├── CMakeLists.txt │ ├── params.h │ ├── weight_utils.h │ ├── vectorize_utils.h │ ├── cuda_debug_utils.cuh │ ├── string_utils.h │ ├── model_utils.h │ ├── debug_utils.h │ ├── macro.h │ └── weight_utils.cu └── memory │ └── allocator │ └── base_allocator.h ├── tests ├── CMakeLists.txt └── unittests │ ├── test_fused_trans_remv_pad.cu │ ├── test_cal_paddingoffset.cu │ ├── test_act.cu │ ├── CMakeLists.txt │ ├── test_repeat_kv.cu │ ├── test_data_compare.cu │ ├── test_linear.cu │ ├── test_casual_mask.cu │ ├── test_topk.cu │ ├── test_residual.cu │ ├── test_concat_kv.cu │ ├── test_bmm.cu │ └── test_mask_softmax.cu ├── examples ├── CMakeLists.txt ├── cpp │ ├── CMakeLists.txt │ ├── ffn │ │ ├── CMakeLists.txt │ │ └── ffn_example.cpp │ ├── decoder │ │ └── CMakeLists.txt │ └── attention │ │ └── CMakeLists.txt └── README.md ├── tools ├── 1.png ├── HF_llama_run_script.py └── README.md ├── llama2-7b-tokenizer.bin ├── README.md ├── user_entry.cpp └── CMakeLists.txt /src/models/common_params.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(unittests) -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(cpp) 2 | -------------------------------------------------------------------------------- /src/weights/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(llama) 2 | -------------------------------------------------------------------------------- /tools/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RussWong/LLM-engineering/HEAD/tools/1.png -------------------------------------------------------------------------------- 
/llama2-7b-tokenizer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RussWong/LLM-engineering/HEAD/llama2-7b-tokenizer.bin -------------------------------------------------------------------------------- /src/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ffn) 2 | add_subdirectory(attention) 3 | add_subdirectory(decoder) 4 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(attention) 2 | add_subdirectory(ffn) 3 | add_subdirectory(decoder) 4 | -------------------------------------------------------------------------------- /src/weights/llama/norm_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | template<typename T> 3 | struct LayerNormWeight { 4 | T* gamma; 5 | }; -------------------------------------------------------------------------------- /src/weights/weight.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <string> 3 | struct Weight { 4 | virtual void loadWeights(std::string weight_path) = 0; 5 | }; 6 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(weights) 2 | add_subdirectory(kernels) 3 | add_subdirectory(layers) 4 | add_subdirectory(utils) 5 | add_subdirectory(models) 6 | -------------------------------------------------------------------------------- /src/weights/llama/embedding_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template<typename T> 4 | struct EmbeddingWeight: public BaseWeight<T> { 5 | }; 6 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## note 2 | 1. the examples folder provides functionality (smoke) checks for the attention, FFN, decoder, and other layers; it does not verify numerical accuracy 3 | 2. the examples are not guaranteed to run as expected; compile them and give them a try
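3. every example driver generally follows the same boilerplate: create a CUDA stream, a `cublasWrapper`, and an allocator, construct the layer, then fill a `TensorMap` and call `forward`. A minimal sketch of that boilerplate for the FFN example is below; the `CudaAllocator` constructor, the llama2-7b-like sizes, and the omitted `TensorMap`/weight setup are illustrative assumptions, so treat `ffn_example.cpp` as the authoritative version.

```cpp
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cublasLt.h>
#include "src/layers/ffn/ffn.h"
#include "src/memory/allocator/cuda_allocator.h" // assumed to declare CudaAllocator

int main() {
    // runtime objects shared by every layer in this repo
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    cublasCreate(&cublas_handle);
    cublasLtCreate(&cublaslt_handle);
    cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle);
    cublas_wrapper->setFP32GemmConfig();
    BaseAllocator* allocator = new CudaAllocator; // assumption: default-constructible

    // llama2-7b-like sizes, purely illustrative
    LLaMAFFNLayer<float> ffn(/*head_num*/32, /*head_size*/128, /*inter_size*/11008,
                             stream, cublas_wrapper, allocator);

    // the real example then allocates input/output buffers, wraps them in a
    // TensorMap, loads or randomizes LLaMAFFNWeights<float>, and calls
    // ffn.forward(inputs, outputs, weights, dyn_params)
    return 0;
}
```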
-------------------------------------------------------------------------------- /src/weights/llama/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(layerweights STATIC layer_weights.cc) 2 | target_link_libraries(layerweights PUBLIC -lcudart weightutils) 3 | add_library(llamaweights STATIC llama_weights.cc) 4 | target_link_libraries(llamaweights PUBLIC layerweights) -------------------------------------------------------------------------------- /src/weights/llama/ffn_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template 4 | struct LLaMAFFNWeights { 5 | BaseWeight gate; 6 | BaseWeight up; 7 | BaseWeight down; 8 | BaseWeight gateAndup; 9 | }; 10 | -------------------------------------------------------------------------------- /src/kernels/act_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | 8 | template 9 | void launchAct(TensorWrapper* input, TensorWrapper* out); -------------------------------------------------------------------------------- /src/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(weightutils STATIC weight_utils.cu) 2 | set_property(TARGET weightutils PROPERTY CUDA_SEPARABLE_COMPILATION ON) 3 | set_property(TARGET weightutils PROPERTY POSITION_INDEPENDENT_CODE ON) 4 | set_property(TARGET weightutils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -------------------------------------------------------------------------------- /src/utils/params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | // (RussWong) notes: some data structure to wrap many arguements of a function for simplicity 5 | using IntDict = std::unordered_map; 6 | using floatDict = std::unordered_map; -------------------------------------------------------------------------------- /src/weights/llama/attention_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template 4 | struct LLaMAattentionWeights { 5 | BaseWeight q; 6 | BaseWeight k; 7 | BaseWeight v; 8 | BaseWeight qkv; 9 | BaseWeight output; 10 | }; 11 | -------------------------------------------------------------------------------- /examples/cpp/ffn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(ffnExample ffn_example.cpp) 4 | set_property(TARGET ffnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET ffnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(ffnExample PUBLIC Llamaffn) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLMengine 2 | 具体运行步骤可见配套pdf文档中准备工作一栏 3 | 4 | 这里再重复一波 5 | 6 | # steps 7 | ``` 8 | 1.模型转换,见tools/README.md 9 | 10 | 2.将转换后模型的路径,替换到根目录下user_entry.cpp#L5的路径 11 | 12 | 3./path/to/LLM-engineering/llama2-7b-tokenizer.bin替换到user_entry.cpp#L6的路径 13 | 14 | 4. mkdir build && cd build && cmake .. 
&& make -j8 && ./bin/main 15 | ``` 16 | -------------------------------------------------------------------------------- /src/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | 3 | add_library(Llama STATIC llama/llama.cpp) 4 | set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(Llama PUBLIC LlamaCtxdecoder Llamaselfdecoder weightutils Llamaffn sample embeddingFunctor) 7 | -------------------------------------------------------------------------------- /src/kernels/build_casual_mask.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/macro.h" 7 | template 8 | void launchBuildCausalMasks(TensorWrapper* mask, 9 | TensorWrapper* q_lens, 10 | TensorWrapper* k_lens); -------------------------------------------------------------------------------- /src/kernels/input_embedding.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "src/utils/tensor.h" 5 | #include "src/weights/llama/embedding_weights.h" 6 | template 7 | void launchInputEmbedding(TensorWrapper* input_ids, 8 | TensorWrapper* output, 9 | EmbeddingWeight* embed_table); -------------------------------------------------------------------------------- /src/kernels/cal_paddingoffset.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/macro.h" 6 | #include "src/utils/tensor.h" 7 | 8 | void launchCalPaddingoffset(TensorWrapper* padding_offset, 9 | TensorWrapper* cum_seqlens, 10 | TensorWrapper* input_lengths //actual input lens 11 | ); -------------------------------------------------------------------------------- /src/kernels/fused_transpose_and_remv_pad.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchTransposeOutRemovePadding(TensorWrapper* qkv_buf_w_pad, 9 | TensorWrapper* padding_offset, 10 | TensorWrapper* qkv_buf_wo_pad_1); -------------------------------------------------------------------------------- /src/kernels/attn_softmax_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | template 8 | void launchScaleMaskAndSoftmax(TensorWrapper* qk, 9 | TensorWrapper* mask, 10 | TensorWrapper* attn_score, 11 | float scale); 12 | -------------------------------------------------------------------------------- /src/layers/ffn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(Llamaffn STATIC ffn.cpp) 4 | set_property(TARGET Llamaffn PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET Llamaffn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(Llamaffn PUBLIC 7 | -lcudart 8 | -lcudadevrt 9 | act 10 | linear) -------------------------------------------------------------------------------- /src/kernels/add_residual.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | template 8 | void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 9 | TensorWrapper *residual, 10 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 11 | bool is_print=false 12 | ); 13 | -------------------------------------------------------------------------------- /src/models/llama/llama_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | struct LLaMAAttentionStaticParams { 3 | int rotary_embedding_dim; 4 | float rotary_embedding_base; 5 | int max_position_embeddings; 6 | bool use_dynamic_ntk; // for dyn scaling rope 7 | }; 8 | 9 | // (RussWong)note: llama类模型里面动态改变的变量, 注意非全部必需 10 | struct LLaMAAttentionDynParams { 11 | int batch_size; 12 | int num_tokens; 13 | int max_q_len; 14 | int max_k_len; 15 | int num_layers; 16 | bool is_ctx = false; 17 | }; 18 | -------------------------------------------------------------------------------- /src/kernels/repeat_kv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchRepeatKVCache(TensorWrapper *k_cache_src, 9 | TensorWrapper *v_cache_src, 10 | TensorWrapper *context_length, 11 | TensorWrapper *layer_id, 12 | TensorWrapper *k_cache_dst, 13 | TensorWrapper *v_cache_dst); 14 | -------------------------------------------------------------------------------- /src/kernels/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "src/utils/tensor.h" 8 | #include "src/utils/params.h" 9 | 10 | template 11 | void launchSampling(TensorWrapper* topk_id, 12 | TensorWrapper* topk_val, 13 | TensorWrapper* seqlen, 14 | TensorWrapper* is_finished, 15 | TensorWrapper* output_id, 16 | IntDict& params); -------------------------------------------------------------------------------- /tools/HF_llama_run_script.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, LlamaForCausalLM 2 | # 注意,这个py用来debug的,作为本课程各个kernel的groudtruth,且此huggingface接口只接受Llama-2-7b-hf,不接受Llama-2-7b 3 | # 注意,此脚本我只在pip install transformers==4.38 sentencepiece accelerate的环境下验证了能跑通并拿来作为groudtruth 4 | model = LlamaForCausalLM.from_pretrained("/path/to/Llama-2-7b-hf") 5 | tokenizer = AutoTokenizer.from_pretrained("/path/to/Llama-2-7b-hf") 6 | prompt = "Hey, are you conscious? Can you talk to me?" 
7 | inputs = tokenizer(prompt, return_tensors="pt") 8 | generate_ids = model.generate(inputs.input_ids, max_length=30) 9 | -------------------------------------------------------------------------------- /src/utils/weight_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/utils/macro.h" 8 | 9 | template 10 | void GPUMalloc(T** ptr, size_t size); 11 | 12 | template 13 | void GPUFree(T* ptr); 14 | 15 | template ::value> struct loadWeightFromBin{ 16 | public: 17 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename); 18 | }; // 模板的泛化形式(原型) 19 | -------------------------------------------------------------------------------- /src/kernels/rmsnorm_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/weights/llama/norm_weights.h" 7 | #include "src/utils/vectorize_utils.h" 8 | template 9 | void launchRMSNorm( TensorWrapper* decoder_out, // [num tokens, hidden_units] 10 | TensorWrapper* decoder_residual, 11 | LayerNormWeight& attn_norm_weight, //RMSNorm weights 12 | float eps, //RMSNorm eps 13 | bool is_last = false 14 | ); 15 | -------------------------------------------------------------------------------- /src/memory/allocator/base_allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | class BaseAllocator 6 | { 7 | public: 8 | virtual ~BaseAllocator(){}; 9 | // unified interface for all derived allocator to alloc buffer 10 | template 11 | T* Malloc(T* ptr, size_t size, bool is_host){ 12 | return (T*)UnifyMalloc((void*)ptr, size, is_host); 13 | } 14 | virtual void* UnifyMalloc(void* ptr, size_t size, bool is_host = false) = 0; 15 | template 16 | void Free(T* ptr, bool is_host = false){ 17 | UnifyFree((void*)ptr, is_host); 18 | } 19 | virtual void UnifyFree(void* ptr, bool is_host = false) = 0; 20 | }; 21 | -------------------------------------------------------------------------------- /src/kernels/fused_addresidual_norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/weights/base_weights.h" 6 | #include "src/weights/llama/norm_weights.h" 7 | #include "src/utils/tensor.h" 8 | #include "src/utils/vectorize_utils.h" 9 | template 10 | void launchFusedAddBiasResidualRMSNorm( // residual.shape = [num tokens, hidden_units] 11 | TensorWrapper* residual, 12 | TensorWrapper* decoder_out, // [num tokens, hidden_units] 13 | BaseWeight& norm, 14 | T* scale, //RMSNorm weights 15 | float eps); 16 | -------------------------------------------------------------------------------- /examples/cpp/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(ctxDecoderExample context_decoder_example.cpp) 4 | set_property(TARGET ctxDecoderExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET ctxDecoderExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(ctxDecoderExample PUBLIC 7 | embeddingFunctor 8 | LlamaCtxdecoder) 9 | 10 | add_executable(selfDecoderExample self_decoder_example.cpp) 11 | set_property(TARGET selfDecoderExample PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | 
set_property(TARGET selfDecoderExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 13 | target_link_libraries(selfDecoderExample PUBLIC 14 | Llamaselfdecoder) -------------------------------------------------------------------------------- /src/kernels/concat_past_kv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchConcatKVCache(TensorWrapper *k_src, // from qkv bias and rope 9 | TensorWrapper *v_src, 10 | TensorWrapper *layer_id, // layer offset = layer_id * batchxbeam * max_seq_len * kv_head_num * head_size 11 | TensorWrapper *cur_query_length, // current epoch or local input length,[batchsize] 12 | TensorWrapper *history_length, 13 | TensorWrapper *k_dst, 14 | TensorWrapper *v_dst); // (RussWong)note: 少写一个;都会发生很多奇怪的错误 15 | -------------------------------------------------------------------------------- /src/kernels/fused_decoder_self_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/models/llama/llama_params.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/vectorize_utils.h" 9 | template 10 | void launchDecoderMaskedMHA(TensorWrapper* qkv_buf, 11 | BaseWeight& qkv, 12 | TensorWrapper* layer_id, 13 | TensorWrapper* k_cache, 14 | TensorWrapper* v_cache, 15 | TensorWrapper* finished, 16 | TensorWrapper* step, 17 | TensorWrapper* mha_output, 18 | LLaMAAttentionStaticParams& static_params); 19 | -------------------------------------------------------------------------------- /examples/cpp/attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(CtxAttnExample context_attn_example.cpp) 4 | set_property(TARGET CtxAttnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET CtxAttnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(CtxAttnExample PUBLIC 7 | -lcudart 8 | -lcudadevrt 9 | LlamaCtxAttn) 10 | add_executable(selfAttnExample self_attn_example.cpp) 11 | set_property(TARGET selfAttnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | set_property(TARGET selfAttnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 13 | target_link_libraries(selfAttnExample PUBLIC 14 | -lcudart 15 | -lcudadevrt 16 | LlamaselfAttn 17 | linear) 18 | -------------------------------------------------------------------------------- /src/weights/base_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | enum class WeightType 6 | { 7 | FP32_W, 8 | FP16_W, 9 | INT8_W, 10 | UNSUPPORTED_W 11 | }; 12 | 13 | template 14 | inline WeightType getWeightType() 15 | { 16 | if (std::is_same::value || std::is_same::value) { 17 | return WeightType::FP32_W; 18 | } 19 | else if (std::is_same::value || std::is_same::value) { 20 | return WeightType::FP16_W; 21 | } 22 | else if (std::is_same::value || std::is_same::value) { 23 | return WeightType::INT8_W; 24 | } 25 | else { 26 | return WeightType::UNSUPPORTED_W; 27 | } 28 | } 29 | template 30 | struct BaseWeight { 31 | std::vector shape; 32 | T* data; 33 | WeightType type; 34 | T* bias; 35 | }; 36 | -------------------------------------------------------------------------------- /src/kernels/linear.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/kernels/cublas_utils.h" 7 | #include "src/utils/tensor.h" 8 | #include "src/weights/base_weights.h" 9 | #include "src/utils/macro.h" 10 | //TODO: when enable int8/int4 weight only, we can add a new type param T2 to represent weight type 11 | template 12 | void launchLinearGemm(TensorWrapper* input, 13 | BaseWeight& weight, 14 | TensorWrapper* output, 15 | cublasWrapper* cublas_wrapper, 16 | bool trans_a = false, 17 | bool trans_b = false); 18 | template 19 | void launchLinearStridedBatchGemm(TensorWrapper* input1, 20 | TensorWrapper* input2, 21 | TensorWrapper* output, 22 | cublasWrapper* cublas_wrapper, 23 | bool trans_a = false, 24 | bool trans_b = false); 25 | -------------------------------------------------------------------------------- /src/weights/llama/layer_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/norm_weights.h" 3 | #include "src/weights/llama/attention_weights.h" 4 | #include "src/weights/llama/ffn_weights.h" 5 | #include "src/utils/weight_utils.h" 6 | template 7 | class LlamaLayerWeight { 8 | private: 9 | int head_num; 10 | int kv_head_num; 11 | int head_size; 12 | int hidden_units; 13 | int inter_size; 14 | WeightType weight_type; 15 | int bit_size; 16 | bool attn_bias; 17 | 18 | public: 19 | LlamaLayerWeight() = delete; 20 | LlamaLayerWeight(int head_num, 21 | int kv_head_num, 22 | int head_size, 23 | int inter_size, 24 | WeightType weight_type, 25 | bool attn_bias); 26 | ~LlamaLayerWeight(); 27 | 28 | void loadWeights(std::string weight_path, WeightType weight_type); 29 | 30 | void loadWeights(); 31 | 32 | LayerNormWeight attn_norm_weight; 33 | LayerNormWeight ffn_norm_weight; 34 | LLaMAattentionWeights self_attn_weight; 35 | LLaMAFFNWeights ffn_weight; 36 | }; 37 | -------------------------------------------------------------------------------- /src/kernels/topK_bk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | template 8 | struct topK 9 | { 10 | T val[K]; 11 | int id[K]; 12 | 13 | __device__ void init() { 14 | for (int i = 0; i < K; i++) { 15 | id[i] = -1; 16 | val[i] = FLT_MIN; 17 | } 18 | } 19 | 20 | __device__ void insertHeap(T data, int data_id) { 21 | if (id[K-1] == -1 || val[K-1] < data) { 22 | id[K-1] = data_id; 23 | val[K-1] = data; 24 | } 25 | for (int i = K - 2; i >= 0; i--) { 26 | if (val[i + 1] > val[i] || id[i] == -1) { 27 | T tmp = val[i]; 28 | val[i] = val[i + 1]; 29 | val[i + 1] = tmp; 30 | int tmp_id = id[i]; 31 | id[i] = id[i + 1]; 32 | id[i + 1] = tmp_id; 33 | } 34 | } 35 | } 36 | }; 37 | 38 | template 39 | void launchTopKforBeamSearch(TensorWrapper *probs, 40 | TensorWrapper *tmp_topk_ids, 41 | TensorWrapper *tmp_topk_vals, 42 | TensorWrapper *final_topk_ids, 43 | TensorWrapper *final_topk_vals); 44 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | **steps to download HF llama weights and convert to bin file** 2 | 3 | 1. download weight from https://huggingface.co/meta-llama/Llama-2-7b/tree/main, note that maybe apply for access first 4 | 5 | 2. 
run `python convert_downloaded_llama_weights.py --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path` and then, we can get some output files 6 | 7 | 3. based on files got by step 2, we run `python weights_convert.py -i path/to/step2's_output_dir -o path/to/final_bin_file` 8 | 9 | then we can get the weights like below pic: 10 | 11 | ![image-20240208212828680](1.png) 12 | 13 | 4. at this time, the weight is ready, and replace the weight path in user_entry.cpp by your step3's output path 14 | 15 | **note** 16 | 1. 下载Llama-2-7b-hf这个模型也是可以的,但是它不需要做以上的step2,直接做step3即可 17 | 2. 下载Llama-2-7b-chat-hf这个模型是无法复现出课程结果的,这个模型的weight和Llama-2-7b-hf的weight不一样 18 | 3. 针对之前有同学反馈的无法复现出课程视频所示结果,修改代码weights_convert.py的97和115行的np.hstack为np.vstack即可,拿qkv linear来举例,我们想要计算的是y=x * (w^T) (这个等价于torch.nn.linear),但是hstack后,x.shape = [num tokens, hidden units], w.shape=[4096, 4096 * 3],很明显x*(w^T)不成立,搞人的地方在于cublas接收这种shape的数据时居然没有报错。如果是vstack,w.shape=[4096 * 3, 4096],恰好x*(w^T)符合矩阵乘法维度规则 19 | -------------------------------------------------------------------------------- /src/weights/llama/llama_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "src/weights/weight.h" 4 | #include "src/weights/base_weights.h" 5 | #include "src/weights/llama/embedding_weights.h" 6 | #include "src/weights/llama/layer_weights.h" 7 | template 8 | struct LlamaWeight : public Weight { 9 | private: 10 | int hidden_units; 11 | int inter_size; 12 | int vocab_size; 13 | int vocab_size_padded; 14 | int num_layer; 15 | WeightType weight_type; 16 | 17 | public: 18 | std::vector*> llama_layer_weight; 19 | LayerNormWeight out_rmsnorm_weight; 20 | EmbeddingWeight post_decoder_embedding_weight; 21 | EmbeddingWeight pre_decoder_embedding_weight; 22 | 23 | LlamaWeight() = default; 24 | LlamaWeight( 25 | int head_num, 26 | int kv_head_num, 27 | int head_size, 28 | int inter_size, 29 | int vocab_size, 30 | int num_layer, 31 | bool attn_bias, 32 | WeightType weight_type 33 | ); 34 | ~LlamaWeight(); 35 | void loadWeights(std::string weight_path); 36 | void loadWeightsFromDummy(); 37 | }; -------------------------------------------------------------------------------- /src/layers/attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(LlamaCtxAttn STATIC context_attention.cpp) 4 | set_property(TARGET LlamaCtxAttn PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET LlamaCtxAttn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(LlamaCtxAttn PUBLIC 7 | # -lcudart 8 | -lcudadevrt 9 | qkv_bias_and_rope 10 | concat_kv 11 | # cublasWrapper 12 | linear 13 | fused_transpose_and_remv_pad 14 | repeat_kv 15 | mask_softmax) 16 | 17 | add_library(LlamaselfAttn STATIC masked_self_attention.cpp) 18 | 19 | set_property(TARGET LlamaselfAttn PROPERTY POSITION_INDEPENDENT_CODE ON) 20 | set_property(TARGET LlamaselfAttn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 21 | target_link_libraries(LlamaselfAttn PUBLIC 22 | -lcudart 23 | -lcudadevrt 24 | fused_decoder_self_attention 25 | qkv_bias_and_rope 26 | ) 27 | -------------------------------------------------------------------------------- /src/kernels/qkv_bias_and_RoPE.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/models/llama/llama_params.h" 6 | #include 
"src/utils/tensor.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/vectorize_utils.h" 9 | 10 | template 11 | void launchAddFusedQKVBiasTransposeAndRoPE(TensorWrapper* q_buf, 12 | TensorWrapper* k_buf, 13 | TensorWrapper* v_buf, 14 | TensorWrapper* QKV, 15 | BaseWeight& qkv, 16 | //Tensor* qkv_bias, 17 | TensorWrapper* padding_offset, 18 | TensorWrapper* history_length, 19 | TensorWrapper* input_length, 20 | LLaMAAttentionStaticParams& params); 21 | 22 | template 23 | void launchRoPE(TensorWrapper* qkv_buf, 24 | TensorWrapper* step, 25 | LLaMAAttentionStaticParams& static_params); -------------------------------------------------------------------------------- /src/layers/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(LlamaCtxdecoder STATIC context_decoder.cpp) 4 | set_property(TARGET LlamaCtxdecoder PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET LlamaCtxdecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(LlamaCtxdecoder PUBLIC 7 | LlamaCtxAttn 8 | Llamaffn 9 | llamaweights 10 | cal_paddingoffset 11 | build_casual_mask 12 | rmsnorm 13 | fused_addresidual_norm 14 | add_residual 15 | ) 16 | 17 | add_library(Llamaselfdecoder STATIC self_decoder.cpp) 18 | set_property(TARGET Llamaselfdecoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Llamaselfdecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Llamaselfdecoder PUBLIC 21 | LlamaselfAttn 22 | Llamaffn 23 | llamaweights 24 | rmsnorm 25 | fused_addresidual_norm 26 | add_residual 27 | ) 28 | -------------------------------------------------------------------------------- /src/utils/vectorize_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | //(RussWong)note: below 5 overloaded function can convert different scalar type data to specified vector type data. 
6 | template 7 | inline __device__ T_OUT scalar_cast_vec(T_IN val) 8 | { 9 | return val; 10 | } 11 | 12 | template<> 13 | inline __device__ half2 scalar_cast_vec(float val) 14 | { 15 | return __float2half2_rn(val); 16 | } 17 | 18 | template<> 19 | inline __device__ float4 scalar_cast_vec(float val) 20 | { 21 | return make_float4(val, val, val, val); 22 | } 23 | 24 | template<> 25 | inline __device__ float2 scalar_cast_vec(float val) 26 | { 27 | return make_float2(val, val); 28 | } 29 | 30 | template<> 31 | inline __device__ half2 scalar_cast_vec(half val) 32 | { 33 | //(RussWong)note: __half2half2 cant be parsed by my nvcc compiler, so I give it up 34 | //return __half2half2(val); 35 | half2 res; 36 | res.x = val; 37 | res.y = val; 38 | return res; 39 | } 40 | 41 | template 42 | struct Vec { 43 | using Type = T; 44 | static constexpr int size = 0; 45 | }; 46 | template<> 47 | struct Vec { 48 | using Type = half2; 49 | static constexpr int size = 2; 50 | }; 51 | template<> 52 | struct Vec { 53 | using Type = float4; 54 | static constexpr int size = 4; 55 | }; 56 | //(RussWong)note: temply dont know which LLM use two continuous elements do RoPE 57 | struct TwoFloat2{ 58 | float2 x; 59 | float2 y; 60 | }; 61 | -------------------------------------------------------------------------------- /src/utils/cuda_debug_utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | // usage: print_data<<<1, 1>>>() 6 | // notes: you can self define the print info using your actual case. 7 | template 8 | __global__ void print_data(T* src1, bool is_target=false) { 9 | int tid = threadIdx.x; 10 | if(tid == 0) { 11 | printf("%dth = %f\n", tid, src1[tid]); 12 | printf("%dth = %f\n", tid + 1, src1[tid + 1]); 13 | // is_target is used to print the info for specified function, to avoid too much print info in screen. 
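// the offsets 128..131 and 1024 below are arbitrary probe positions deep inside the
// tensor; change them to whichever region you actually want to inspect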
14 | if (is_target){ 15 | printf("%dth = %f\n", tid + 128, src1[tid + 128]); 16 | printf("%dth = %f\n", tid + 129, src1[tid + 129]); 17 | printf("%dth = %f\n", tid + 130, src1[tid + 130]); 18 | printf("%dth = %f\n", tid + 131, src1[tid + 131]); 19 | printf("%dth = %f\n", tid + 1024, src1[tid + 1024]); 20 | } 21 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid, src3[tid]); 22 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 1, src3[tid+1]); 23 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 128, src3[tid+128]); 24 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 129, src3[tid+129]); 25 | 26 | // printf("qkvweight/outweight data[%d] = %f\n", tid, src2[tid]); 27 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 1, src2[tid+1]); 28 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 128, src2[tid+128]); 29 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 129, src2[tid +129]); 30 | // printf("linear done\n"); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/layers/ffn/ffn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/weights/llama/ffn_weights.h" 4 | #include "src/memory/allocator/cuda_allocator.h" 5 | #include "src/kernels/linear.h" 6 | #include "src/utils/tensor.h" 7 | #include "src/kernels/cublas_utils.h" 8 | #include "src/models/llama/llama_params.h" 9 | #include "src/kernels/act_kernel.h" 10 | #include "src/utils/macro.h" 11 | template 12 | class LLaMAFFNLayer { 13 | private: 14 | // this params are shared across all LLMs 15 | const int head_num; 16 | const int head_size; 17 | const int inter_size; 18 | const int hidden_units; 19 | int count = -1; // used to record layer index currently 20 | 21 | cudaStream_t stream; 22 | BaseAllocator* allocator; 23 | // for linear proj 24 | cublasWrapper* cublas_wrapper; 25 | 26 | // buffer 27 | // [2, num tokens, intersize] 28 | TensorWrapper* SwiGLU_input = nullptr; //gate proj and up proj output buf 29 | // [num tokens, intersize] 30 | TensorWrapper* down_proj_input = nullptr; 31 | 32 | 33 | public: 34 | LLaMAFFNLayer(int head_num, 35 | int head_size, 36 | int inter_size, 37 | cudaStream_t stream, 38 | cublasWrapper* cublas_wrapper, 39 | BaseAllocator* allocator); 40 | 41 | void allocForForward(LLaMAAttentionDynParams& params); 42 | void allocForForward(int batch_size); 43 | void freeBuf(); 44 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAFFNWeights& weights, LLaMAAttentionDynParams& params); 45 | }; 46 | -------------------------------------------------------------------------------- /src/models/basemodel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "src/utils/tensor.h" 5 | #include "src/models/common_params.h" 6 | #include "src/memory/allocator/base_allocator.h" 7 | #include "src/kernels/cublas_utils.h" 8 | // (RussWong)note: 回调函数, 用于打印当前轮次对话的LLM生成内容 9 | using CallBack = std::function; 10 | 11 | class BaseModel{ 12 | public: 13 | std::string model_name; 14 | // (RussWong)note: 必需的且所有模型子类都共有的4个数据成员 15 | cudaStream_t stream; 16 | cublasWrapper* cublas_wrapper; 17 | BaseAllocator* allocator; 18 | cudaDeviceProp* cuda_device_prop; 19 | BaseModel(cudaStream_t stream, 20 | cublasWrapper* cublas_wrapper, 21 | BaseAllocator* allocator, 22 | cudaDeviceProp* cuda_device_prop = nullptr): 23 | 
stream(stream), 24 | cublas_wrapper(cublas_wrapper), 25 | allocator(allocator), 26 | cuda_device_prop(cuda_device_prop){}; 27 | // (RussWong)note: 3个纯虚函数API, 每个具体模型子类需要实现 28 | virtual void loadTokenizer(std::string file) = 0; 29 | virtual void loadWeights(std::string file) = 0; 30 | virtual void loadWeightsFromDummy() = 0; 31 | // (RussWong)note: 3个纯虚函数API, 用于定义每轮对话的输入、历史记录和回复API, 每个具体模型子类需要实现 32 | // 根据历史信息和当前输入生成当前轮次的prompt 33 | virtual std::vector MakeInput(const std::string &history, int round, const std::string &input) = 0; 34 | // 根据当前轮次回复更新到history string 35 | virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) = 0; 36 | // 回复内容的返回接口 37 | virtual std::string Response(const std::vector& input, CallBack PrintRes) = 0; 38 | }; 39 | -------------------------------------------------------------------------------- /user_entry.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/model_utils.h" 3 | 4 | struct ConvertedModel { 5 | std::string model_path = "/home/llamaweight/"; // 模型文件路径 6 | std::string tokenizer_path = "/home/llama2-7b-tokenizer.bin"; // tokenizer文件路径 7 | }; 8 | 9 | int main(int argc, char **argv) { 10 | int round = 0; 11 | std::string history = ""; 12 | ConvertedModel model; 13 | // auto model = llm::CreateDummyLLMModel(model.tokenizer_file);//load dummy weight + load tokenizer 14 | auto llm_model = llm::CreateRealLLMModel(model.model_path, model.tokenizer_path);//load real weight + load tokenizer 15 | std::string model_name = llm_model->model_name; 16 | // exist when generate end token or reach max seq 17 | while (true) { 18 | printf("please input the question: "); 19 | std::string input; 20 | std::getline(std::cin, input); 21 | if (input == "s") {//停止对话 22 | break; 23 | } 24 | // (RussWong)notes: index = 生成的第几个token,从0开始 25 | std::string retString = llm_model->Response(llm_model->MakeInput(history, round, input), [model_name](int index, const char* content) { 26 | if (index == 0) { 27 | printf(":%s", content); 28 | fflush(stdout); 29 | } 30 | if (index > 0) { 31 | printf("%s", content); 32 | fflush(stdout); 33 | } 34 | if (index == -1) { 35 | printf("\n"); 36 | } 37 | }); 38 | //(RussWong)notes: 多轮对话保留history,和当前轮次input制作成新的上下文context 39 | history = llm_model->MakeHistory(history, round, input, retString); 40 | round++; 41 | } 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /src/utils/string_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include // std::make_unique 3 | #include // std::stringstream 4 | #include 5 | #include 6 | //(RussWong)note: this function allow us can self define print string 7 | template 8 | inline std::string fmtstr(const std::string& format, Args... args) 9 | { 10 | // This function came from a code snippet in stackoverflow under cc-by-1.0 11 | // https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf 12 | 13 | // Disable format-security warning in this function. 14 | int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ 1; // Extra space for '\0' 15 | if (size_s <= 0) { 16 | throw std::runtime_error("Error during formatting."); 17 | } 18 | auto size = static_cast(size_s); 19 | std::unique_ptr buf(new char[size]); 20 | std::snprintf(buf.get(), size, format.c_str(), args...); 21 | return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside 22 | } 23 | //(RussWong)note: below two functions allow us can convert elements in vector or pointer to string 24 | template 25 | inline std::string vec2str(std::vector vec) 26 | { 27 | std::stringstream ss; 28 | ss << "("; 29 | if (!vec.empty()) { 30 | for (size_t i = 0; i < vec.size() - 1; ++i) { 31 | ss << vec[i] << ", "; 32 | } 33 | ss << vec.back(); 34 | } 35 | ss << ")"; 36 | return ss.str(); 37 | } 38 | 39 | template 40 | inline std::string arr2str(T* arr, size_t size) 41 | { 42 | std::stringstream ss; 43 | ss << "("; 44 | for (size_t i = 0; i < size - 1; ++i) { 45 | ss << arr[i] << ", "; 46 | } 47 | if (size > 0) { 48 | ss << arr[size - 1]; 49 | } 50 | ss << ")"; 51 | return ss.str(); 52 | } 53 | -------------------------------------------------------------------------------- /src/kernels/topK.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | template 8 | struct topK 9 | { 10 | T val[K]; 11 | int id[K]; 12 | 13 | __device__ void init(){ 14 | for (int i = 0; i < K; i++) { 15 | id[i] = -1; 16 | val[i] = -1e-20; 17 | } 18 | } 19 | 20 | __device__ void insertHeap(T data, int data_id){ 21 | float v = (float)val[K-1]; 22 | if(id[K-1] == -1 || v < (float)data){ 23 | id[K-1] = data_id; 24 | val[K-1] = data; 25 | } 26 | //Note: 仅需一轮冒泡排序(插入新元素的重排),因为此时除了最后一个新元素,其它都是有序 27 | for (int i = K - 2; i >= 0; i--){ 28 | if(val[i + 1] > val[i] || id[i] == -1) { 29 | T tmp = val[i]; 30 | val[i] = val[i + 1]; 31 | val[i + 1] = tmp; 32 | int tmp_id = id[i]; 33 | id[i] = id[i + 1]; 34 | id[i + 1] = tmp_id; 35 | } 36 | } 37 | } 38 | }; 39 | 40 | 41 | template 42 | void launchTopKforBeamSearch(TensorWrapper *probs, 43 | TensorWrapper *topk_ids, 44 | TensorWrapper *topk_vals, 45 | TensorWrapper *final_topk_ids, 46 | TensorWrapper *final_topk_vals); 47 | // template 48 | // void launchTopKforBeamSearch(const T* probs, 49 | // const int batch_size, 50 | // const int vocab_size, 51 | // int* topk_ids, 52 | // T* topk_vals, 53 | // int* final_topk_ids, 54 | // T* final_topk_vals); 55 | -------------------------------------------------------------------------------- /src/kernels/build_casual_mask.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/build_casual_mask.h" 2 | // mask shape = [bs, max_q_len, max_k_len] 3 | template 4 | __global__ void BuildCausalMasksConsideringContextPastKV(T* mask, 5 | const int* q_lens, //input lens, shape=[batch size] 6 | const int* k_lens, //context lens, shape=[batch size] 7 | int max_q_len, 8 | int max_k_len){ 9 | int tid = threadIdx.x; 10 | int qlen = q_lens[blockIdx.x]; 11 | int klen = k_lens[blockIdx.x]; 12 | mask += blockIdx.x * max_q_len * max_k_len; 13 | int offset = threadIdx.x; 14 | // note: this judgement confirms we dont exceed data boundry 15 | while (offset < max_q_len * max_k_len){ 16 | int q = offset / max_k_len; 17 | int k = offset % max_k_len; 18 | bool is_one = q < qlen && k < klen && k <= q + (klen - qlen) && k >= klen - qlen; 19 | mask[offset] = static_cast(is_one); 20 | 21 | offset += blockDim.x; 22 | } 23 | } 24 | 
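// Parallelization of the kernel above: one block per batch entry (hence the
// mask += blockIdx.x * max_q_len * max_k_len offset), with the block's threads
// striding over the flattened [max_q_len, max_k_len] tile. A query at local row q
// sits at absolute position q + (klen - qlen) once the cached context is counted,
// which is where the k <= q + (klen - qlen) bound in the condition comes from.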
25 | template 26 | void launchBuildCausalMasks(TensorWrapper* mask, 27 | TensorWrapper* q_lens, 28 | TensorWrapper* k_lens) 29 | { 30 | int batch_size = mask->shape[0]; 31 | int max_q_len = mask->shape[1]; 32 | int max_k_len = mask->shape[2]; 33 | BuildCausalMasksConsideringContextPastKV<<>>(mask->data, q_lens->data, k_lens->data, max_q_len, max_k_len); 34 | } 35 | 36 | template void launchBuildCausalMasks(TensorWrapper* mask, 37 | TensorWrapper* q_lens, 38 | TensorWrapper* k_lens); 39 | 40 | template void launchBuildCausalMasks(TensorWrapper* mask, 41 | TensorWrapper* q_lens, 42 | TensorWrapper* k_lens); 43 | -------------------------------------------------------------------------------- /src/kernels/cal_paddingoffset.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/cal_paddingoffset.h" 2 | // shape: 3 | //seq_lengths:[batch size] 4 | //cum_seqlens:[batch size + 1],first ele is 0 5 | //padding_offset:[batch size * max q len] 6 | // note: the point is to calc padding offset and cum offset 7 | // TODO: we first use serial algo, then can enhance to CUDA scan algo 8 | 9 | __global__ void CalPaddingoffset(int* padding_offset, 10 | int* cum_seqlens, 11 | const int* input_lengths, //actual input lens 12 | const int batch_size, 13 | const int max_q_len) { 14 | int ind = 0; 15 | int cum_offset = 0; 16 | int total_seqlen = 0; 17 | for(int b = 0; b < batch_size; b++) { 18 | int seqlen = input_lengths[b]; 19 | 20 | cum_seqlens[b] = total_seqlen; 21 | // each token in one seq has same cum offset 22 | for (int i = 0; i < seqlen; i++) { 23 | padding_offset[ind] = cum_offset; 24 | ind++; 25 | } 26 | cum_offset += max_q_len - seqlen; 27 | total_seqlen += seqlen; 28 | } 29 | cum_seqlens[batch_size] = total_seqlen; 30 | } 31 | 32 | void launchCalPaddingoffset(TensorWrapper* padding_offset, 33 | TensorWrapper* cum_seqlens, 34 | TensorWrapper* input_lengths)//actual input lens 35 | { 36 | const int batch_size = padding_offset->shape[0]; 37 | const int max_q_len = padding_offset->shape[1]; 38 | LLM_CHECK_WITH_INFO(batch_size == input_lengths->shape[0], "input lenghts numbers should equal to padding offset bs dim!") ; 39 | LLM_CHECK_WITH_INFO(batch_size == cum_seqlens->shape[0] - 1, "cum seqlen numbers should equal to padding offset bs dim + 1!") ; 40 | CalPaddingoffset<<<1, 1>>>( 41 | padding_offset->data, cum_seqlens->data, input_lengths->data, batch_size, max_q_len 42 | ); 43 | } -------------------------------------------------------------------------------- /src/kernels/input_embedding.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/input_embedding.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | template 5 | __global__ void embeddingFunctor(const int* input_ids, 6 | T* output, 7 | const T* embed_table, 8 | const int max_context_token_num, 9 | const int hidden_size) 10 | { 11 | int index = blockIdx.x * blockDim.x + threadIdx.x; 12 | while (index < max_context_token_num * hidden_size) { 13 | int id = input_ids[index / hidden_size]; 14 | output[index] = embed_table[id * hidden_size + index % hidden_size]; 15 | index += blockDim.x * gridDim.x; 16 | } 17 | } 18 | 19 | template 20 | void launchInputEmbedding(TensorWrapper* input_ids, // INT [token num] 21 | TensorWrapper* output, // FP32 [token num, hidden_size] = [token num, 4096] 22 | EmbeddingWeight* embed_table// FP32 [vocal_size, hidden_size] 23 | ) { 24 | const int blockSize = 256; 25 | const int 
max_context_token_num = output->shape[0]; // token num 26 | const int hidden_size = output->shape[1]; 27 | const int gridSize = 2048; 28 | LLM_CHECK_WITH_INFO(max_context_token_num == input_ids->shape[0], "input ids 1st shape should equal to 1st shape of output"); 29 | embeddingFunctor<<>>(input_ids->data, 30 | output->data, 31 | embed_table->data, 32 | max_context_token_num, 33 | hidden_size); 34 | #ifdef PRINT_DATA 35 | print_data<<<1, 1>>>(output->data); 36 | #else 37 | #endif 38 | } 39 | 40 | template void launchInputEmbedding(TensorWrapper* input_ids, 41 | TensorWrapper* output, 42 | EmbeddingWeight* embed_table); 43 | template void launchInputEmbedding(TensorWrapper* input_ids, 44 | TensorWrapper* output, 45 | EmbeddingWeight* embed_table); 46 | -------------------------------------------------------------------------------- /src/layers/attention/masked_self_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/memory/allocator/cuda_allocator.h" 4 | #include "src/kernels/linear.h" //1st/4th kernel of masked self attention, qkv gemm 5 | #include "src/kernels/attn_softmax_kernel.h" 6 | #include "src/kernels/qkv_bias_and_RoPE.h" //2nd rope 7 | #include "src/kernels/fused_decoder_self_attention.h" //3rd kernel 8 | #include "src/utils/tensor.h" 9 | #include "src/kernels/cublas_utils.h" 10 | #include "src/models/llama/llama_params.h" 11 | #include "src/utils/macro.h" 12 | 13 | // (RussWong)note: 这里面的数据成员都是只存在于attention layer,而不像finished,seq lengths这种贯穿整个过程 14 | template 15 | class LLaMASelfAttentionLayer { 16 | private: 17 | // this params are shared across all LLMs 18 | const int head_num; 19 | const int head_size; 20 | const int hidden_units; 21 | const int q_head_per_kv; //for GQA and MQA 22 | const int kv_head_num; 23 | float scale; 24 | // this params are only saw in llama and are unchanged 25 | LLaMAAttentionStaticParams attn_static_params; 26 | cudaStream_t stream; 27 | BaseAllocator* allocator; 28 | // for linear and batchgemm 29 | cublasWrapper* cublas_wrapper; 30 | 31 | // intermedia buffer 32 | TensorWrapper* qkv_buf = nullptr; // for qkv linear output and rope input/output 33 | TensorWrapper* mha_output = nullptr; // mha output, then invoke a linear to attention output 34 | 35 | public: 36 | LLaMASelfAttentionLayer(int head_num, 37 | int kv_head_num, 38 | int head_size, 39 | LLaMAAttentionStaticParams attn_params, 40 | cudaStream_t stream, 41 | cublasWrapper* cublas_wrapper, 42 | BaseAllocator* allocator); 43 | // (RussWong)note: private data member can only be accessed by member function 44 | LLaMAAttentionStaticParams& GetAttnStaticParams(){ 45 | return attn_static_params; 46 | } 47 | void allocForForward(LLaMAAttentionDynParams& params); 48 | void freeBuf(); 49 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params); 50 | }; 51 | -------------------------------------------------------------------------------- /src/kernels/act_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/act_kernel.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | #include "src/utils/macro.h" 5 | template 6 | __device__ __forceinline__ T silu(const T& in) { 7 | // x * sigmoid(x) 8 | return (T) (((float) in) / (1.0f + expf((float) -in))); 9 | } 10 | 11 | template<> 12 | __device__ __forceinline__ half2 silu(const half2& in) { 13 | 
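// evaluate silu per lane in fp32 via the scalar template above, then repack the
// two results into a half2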
return make_half2(__float2half(silu((float)(in.x))), __float2half(silu((float)(in.y)))); 14 | } 15 | 16 | //代码逻辑:第一个intermediate 去做silu,结果与第二个intermediate mul 17 | template 18 | __global__ void silu_and_mul_kernel( 19 | T* out, // [bs, intermedia size] 20 | const T* input, // [bs, 2, intermedia size] 21 | const int intermedia_size) { 22 | const int batch_idx = blockIdx.x; 23 | for (int idx = threadIdx.x; idx < intermedia_size; idx += blockDim.x) { 24 | const T x = input[batch_idx * 2 * intermedia_size + idx]; 25 | const T y = input[batch_idx * 2 * intermedia_size + intermedia_size + idx]; 26 | out[batch_idx * intermedia_size + idx] = silu(x) * y; 27 | } 28 | } 29 | 30 | template<> 31 | __global__ void silu_and_mul_kernel( 32 | half* out, // [bs, intermedia size] 33 | const half* input, // [bs, 2, intermedia size] 34 | const int intermedia_size) { 35 | const int batch_idx = blockIdx.x; 36 | int vec_size = Vec::size; 37 | using Vec_t = typename Vec::Type; 38 | for (int idx = threadIdx.x * vec_size; idx < intermedia_size; idx += blockDim.x) { 39 | const Vec_t x = *reinterpret_cast(const_cast(&input[batch_idx * 2 * intermedia_size + idx])); 40 | const Vec_t y = *reinterpret_cast(const_cast(&input[batch_idx * 2 * intermedia_size + intermedia_size + idx])); 41 | *reinterpret_cast(&out[batch_idx * intermedia_size + idx]) = __hmul2(silu(x), y); 42 | } 43 | 44 | } 45 | 46 | template 47 | void launchAct(TensorWrapper* input, TensorWrapper* out) { 48 | int batch_size = input->shape[0]; 49 | LLM_CHECK(input->shape[1] == 2); 50 | int intermedia_size = input->shape[2]; 51 | dim3 grid(batch_size); 52 | dim3 block(256); 53 | silu_and_mul_kernel<<>>(out->data, input->data, intermedia_size); 54 | #ifdef PRINT_DATA 55 | printf("act kernel top2 result:\n"); 56 | print_data<<<1, 1>>>(out->data); 57 | #else 58 | #endif 59 | } 60 | // We must instancite the template, if not, will report linking issue 61 | template void launchAct(TensorWrapper* input, TensorWrapper* output); 62 | template void launchAct(TensorWrapper* input, TensorWrapper* output); 63 | -------------------------------------------------------------------------------- /src/layers/attention/context_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/memory/allocator/cuda_allocator.h" 4 | #include "src/kernels/linear.h" 5 | #include "src/kernels/attn_softmax_kernel.h" 6 | #include "src/kernels/qkv_bias_and_RoPE.h" 7 | #include "src/kernels/fused_transpose_and_remv_pad.h" 8 | #include "src/kernels/concat_past_kv.h" 9 | #include "src/kernels/repeat_kv.h" 10 | #include "src/utils/tensor.h" 11 | #include "src/kernels/cublas_utils.h" 12 | #include "src/models/llama/llama_params.h" 13 | template 14 | class LLaMAContextAttentionLayer { 15 | private: 16 | // this params are shared across all LLMs 17 | const int head_num; 18 | const int head_size; 19 | const int hidden_units; 20 | const int q_head_per_kv; //for GQA and MQA 21 | const int kv_head_num; 22 | float scale; 23 | // this params are only saw in llama and are unchanged 24 | LLaMAAttentionStaticParams attn_static_params; 25 | cudaStream_t stream; 26 | BaseAllocator* allocator; 27 | // for linear and batchgemm 28 | cublasWrapper* cublas_wrapper; 29 | 30 | TensorWrapper* qkv_buf_wo_pad = nullptr; 31 | TensorWrapper* q_buf_w_pad = nullptr; 32 | TensorWrapper* k_buf_w_pad = nullptr; 33 | TensorWrapper* v_buf_w_pad = nullptr; 34 | TensorWrapper* k_cache_buf = nullptr; 35 | 
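// k_cache_buf / v_cache_buf (above/below) hold this layer's KV after
// launchRepeatKVCache broadcasts the kv heads up to the q head count for GQA/MQA,
// so they can serve as operands of the batched GEMMs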
TensorWrapper* v_cache_buf = nullptr; 36 | TensorWrapper* qk_buf = nullptr; 37 | TensorWrapper* qkv_buf_w_pad = nullptr; 38 | TensorWrapper* qkv_buf_wo_pad_1 = nullptr; 39 | 40 | public: 41 | LLaMAContextAttentionLayer(int head_num, 42 | int kv_head_num, 43 | int head_size, 44 | LLaMAAttentionStaticParams attn_params, 45 | cudaStream_t stream, 46 | cublasWrapper* cublas_wrapper, 47 | BaseAllocator* allocator); 48 | LLaMAAttentionStaticParams& GetAttnStaticParams(){ 49 | return attn_static_params; 50 | } 51 | 52 | void allocForForward(LLaMAAttentionDynParams& params); 53 | void freeBuf(); 54 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params, LLaMAAttentionStaticParams& static_params); 55 | // whats the diff across these 3 max len: 56 | // max_seq_len is the max kv len considering context, ep. multiple epochs chat 57 | // max_q_len is the current max q len after padding in this batch 58 | // all kv cache is max seq len to save all kv cache in all epochs, but in context attention, all kv cache should be broadcast to adapt q as kv cache buf whose shape is max k len 59 | // so max k len is the max context len in cur batch 60 | // void flashAttn(); 61 | }; 62 | -------------------------------------------------------------------------------- /src/kernels/cublas_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/utils/macro.h" 8 | //1.cublas API: must allocate the required matrices in the GPU memory space, 9 | // fill them with data, call the sequence of desired cuBLAS functions, and then upload the results back to the host. 10 | //2.cublasXt API: have the data on the Host 11 | //3.cuBLASLt API: lightweight library dedicated to GEMM with a new flexible API. 
12 | // adds flexibility in matrix data layouts, input types, compute types, and also in choosing the algorithmic implementations and heuristics through parameter programmability 13 | class cublasWrapper { 14 | private: 15 | cublasHandle_t cublas_handle_; 16 | cublasLtHandle_t cublaslt_handle_; 17 | 18 | cudaDataType_t Atype_; 19 | cudaDataType_t Btype_; 20 | cudaDataType_t Ctype_; 21 | cudaDataType_t computeType_; 22 | 23 | public: 24 | cublasWrapper(cublasHandle_t cublas_handle_, 25 | cublasLtHandle_t cublaslt_handle_); 26 | // BaseAllocator* allocator); enable it when we use cublasLt API 27 | 28 | ~cublasWrapper(); 29 | void setFP32GemmConfig(); 30 | void setFP16GemmConfig(); 31 | //for proj matmul 32 | void Gemm(cublasOperation_t transa, 33 | cublasOperation_t transb, 34 | const int m, 35 | const int n, 36 | const int k, 37 | const void* A, 38 | const int lda, 39 | const void* B, 40 | const int ldb, 41 | void* C, 42 | const int ldc, 43 | float alpha, 44 | float beta); 45 | // for qk*v and q*k 46 | void stridedBatchedGemm(cublasOperation_t transa, 47 | cublasOperation_t transb, 48 | const int m, 49 | const int n, 50 | const int k, 51 | const void* A, 52 | const int lda, 53 | const int64_t strideA, 54 | const void* B, 55 | const int ldb, 56 | const int64_t strideB, 57 | void* C, 58 | const int ldc, 59 | const int64_t strideC, 60 | const int batchCount, 61 | float f_alpha, 62 | float f_beta); 63 | }; 64 | -------------------------------------------------------------------------------- /src/kernels/add_residual.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/add_residual.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | 5 | // (RussWong)note: this kernel is used at the end of FFN in every decoder layer 6 | template 7 | __global__ void AddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 8 | T *residual, 9 | T *decoder_out, // [num tokens, hidden_units] 10 | int num_tokens, 11 | int hidden_units) 12 | { 13 | int vec_size = Vec::size; 14 | using Vec_t = typename Vec::Type; 15 | int batch_id = blockIdx.x; 16 | int tid = threadIdx.x; 17 | Vec_t *dout = reinterpret_cast(decoder_out + batch_id * hidden_units); 18 | Vec_t *rsd = reinterpret_cast(residual + batch_id * hidden_units); 19 | for (int i = tid; i < hidden_units / vec_size; i += blockDim.x) 20 | { 21 | dout[i].x += rsd[i].x; 22 | dout[i].y += rsd[i].y; 23 | dout[i].z += rsd[i].z; 24 | dout[i].w += rsd[i].w; 25 | } // addresidual 26 | } 27 | 28 | template <> 29 | __global__ void AddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 30 | half *residual, 31 | half *decoder_out, // [num tokens, hidden_units] 32 | int num_tokens, 33 | int hidden_units) 34 | { 35 | int vec_size = Vec::size; 36 | using Vec_t = typename Vec::Type; 37 | int batch_id = blockIdx.x; 38 | int tid = threadIdx.x; 39 | Vec_t *dout = reinterpret_cast(decoder_out + batch_id * hidden_units); 40 | Vec_t *rsd = reinterpret_cast(residual + batch_id * hidden_units); 41 | for (int i = tid; i < hidden_units / vec_size; i += blockDim.x) 42 | { 43 | dout[i] = __hadd2(dout[i], rsd[i]); 44 | } // addresidual 45 | } 46 | 47 | template 48 | void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, 256 threads travrse hiddenunits eles recursely 49 | TensorWrapper *residual, 50 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 51 | bool is_print 52 | ) 
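// launch configuration (set just below): one block per token row and 256 threads,
// each thread handling Vec<T>::size elements per iteration, so hidden_units is
// expected to be divisible by the vector width (4 floats or 2 halves)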
53 | { 54 | int batch_size = decoder_out->shape[0]; 55 | int hidden_units = decoder_out->shape[1]; 56 | int vec_size = Vec::size; 57 | dim3 grid(batch_size); 58 | dim3 block(256); 59 | AddResidual<<>>(residual->data, 60 | decoder_out->data, 61 | batch_size, 62 | hidden_units); 63 | #ifdef PRINT_DATA 64 | if (is_print){ 65 | print_data<<<1, 1>>>(decoder_out->data); 66 | } 67 | #else 68 | #endif 69 | } 70 | template void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 71 | TensorWrapper *residual, 72 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 73 | bool is_print 74 | ); 75 | template void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 76 | TensorWrapper *residual, 77 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 78 | bool is_print 79 | ); 80 | -------------------------------------------------------------------------------- /tests/unittests/test_fused_trans_remv_pad.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/fused_transpose_and_remv_pad.h" 2 | #include 3 | // [b,h,s,d]=>[b,s,h,d]=>[num tokens,h,d] 4 | // padding_offset.shape = [num_tokens] 5 | // (RussWong)note: this kernel is only supporting fp32 type UT 6 | // we compare the kernel correctnesss by eyes and result print infos 7 | // `./test_fused_trans_remv_pad` to test fp32 kernel 8 | int main() { 9 | const int batch_size = 2; 10 | const int head_num = 2; 11 | const int max_seq_len = 4; 12 | const int head_size = 2; 13 | const int num_tokens = 5; 14 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 15 | const int in_size = batch_size * head_num * max_seq_len * head_size; 16 | const int out_size = num_tokens * head_num * head_size; 17 | float* h_in; 18 | float* d_in; 19 | h_in = (float*)malloc(sizeof(float) * in_size); 20 | cudaMalloc((void**)&d_in, sizeof(float) * in_size); 21 | float* h_out; 22 | float* d_out; 23 | h_out = (float*)malloc(sizeof(float) * out_size); 24 | cudaMalloc((void**)&d_out, sizeof(float) * out_size); 25 | int* h_padding_offset; 26 | int* d_padding_offset; 27 | h_padding_offset = (int*)malloc(sizeof(int) * num_tokens); 28 | cudaMalloc((void**)&d_padding_offset, sizeof(int) * num_tokens); 29 | 30 | //1st seqlen: 2, due to 1st seq, so its padding offset are all 0 31 | //2nd seqlen: 3, so its padding offset are all 4-2=2 32 | for(int i = 0; i < in_size; i++) { 33 | h_in[i] = i; 34 | } 35 | for(int i = 0; i < 2; i++) { 36 | h_padding_offset[i] = 0; 37 | } 38 | h_padding_offset[2] = 2; 39 | h_padding_offset[3] = 2; 40 | h_padding_offset[4] = 2; 41 | 42 | cudaMemcpy(d_in, h_in, sizeof(float) * in_size, cudaMemcpyHostToDevice); 43 | cudaMemcpy(d_padding_offset, h_padding_offset, sizeof(int) * num_tokens, cudaMemcpyHostToDevice); 44 | 45 | DataType type = getTensorType(); 46 | DataType type_pad = getTensorType(); 47 | TensorWrapper* in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_seq_len, head_size}, d_in); 48 | TensorWrapper* in_pad = new TensorWrapper(Device::GPU, type_pad, {num_tokens}, d_padding_offset); 49 | TensorWrapper* out = new TensorWrapper(Device::GPU, type, {num_tokens, head_num, head_size}, d_out); 50 | std::cout << "before launch softmax kernel" << std::endl; 51 | launchTransposeOutRemovePadding(in, in_pad, out); 52 | std::cout << "after launch softmax kernel" << std::endl; 53 | std::cout << 
"cuda memcpy device to host" << std::endl; 54 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 55 | cudaMemcpy(h_out, out->data, sizeof(float) * out_size, cudaMemcpyDeviceToHost); 56 | for(int i = 0; i < out_size; i++) { 57 | printf("after trans and remv pad, out[%d] = %f\n", i, h_out[i]); 58 | } 59 | // debug info, better to retain: std::cout << "before free" << std::endl; 60 | free(h_in); 61 | free(h_out); 62 | free(h_padding_offset); 63 | cudaFree(d_in); 64 | cudaFree(d_out); 65 | cudaFree(d_padding_offset); 66 | } -------------------------------------------------------------------------------- /tests/unittests/test_cal_paddingoffset.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include "src/kernels/cal_paddingoffset.h" 9 | // (RussWong)note: this kernel is only int type input and output, not fp32 or half 10 | // we compare the kernel correctnesss by eyes and result print infos 11 | // `./paddingoffset` to run 12 | int main() { 13 | const int batch_size = 3; 14 | const int max_q_len = 5; 15 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 16 | int* h_seq_lens; 17 | int *d_seq_lens; 18 | h_seq_lens = (int*)malloc(sizeof(int) * batch_size); 19 | cudaMalloc((void**)&d_seq_lens, sizeof(int) * batch_size); 20 | 21 | int* h_cum_seqlens; 22 | int* d_cum_seqlens; 23 | h_cum_seqlens = (int*)malloc(sizeof(int) * (batch_size + 1)); 24 | cudaMalloc((void**)&d_cum_seqlens, sizeof(int) * (batch_size + 1)); 25 | 26 | int* h_padding_offset; 27 | int* d_padding_offset; 28 | h_padding_offset = (int*)malloc(sizeof(int) * batch_size * max_q_len); 29 | cudaMalloc((void**)&d_padding_offset, sizeof(int) * batch_size * max_q_len); 30 | 31 | for(int i = 0; i < batch_size; i++) { // 3 32 | h_seq_lens[i] = batch_size; 33 | } 34 | cudaMemcpy(d_seq_lens, h_seq_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 35 | DataType type_int = getTensorType(); 36 | TensorWrapper* padding_offset = new TensorWrapper(Device::GPU, type_int, {batch_size, max_q_len}, d_padding_offset); 37 | TensorWrapper* cum_seqlens = new TensorWrapper(Device::GPU, type_int, {batch_size + 1}, d_cum_seqlens); 38 | TensorWrapper* input_lengths = new TensorWrapper(Device::GPU, type_int, {batch_size}, d_seq_lens); 39 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 40 | launchCalPaddingoffset(padding_offset, 41 | cum_seqlens, 42 | input_lengths); 43 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 44 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 45 | cudaMemcpy(h_padding_offset, d_padding_offset, sizeof(int) * batch_size * max_q_len, cudaMemcpyDeviceToHost); 46 | cudaMemcpy(h_cum_seqlens, d_cum_seqlens, sizeof(int) * (batch_size + 1), cudaMemcpyDeviceToHost); 47 | // debug info, better to retain: std::cout << "cuda memcpy device to host" << std::endl; 48 | for(int i = 0; i < batch_size * max_q_len; i++) { 49 | printf("padding_offset = %d\n", h_padding_offset[i]); 50 | } 51 | for(int i = 0; i < batch_size + 1; i++){ 52 | printf("cum_seqlens =%d\n", h_cum_seqlens[i]); 53 | } 54 | //expected result is: 55 | // padding_offset: 
0,0,0,2,2,2,4,4,4,0.... shape = [batchsize, max_q_len] 56 | // cum_seqlens: 0,3,6,9. shape=[batchsize+1] 57 | // debug info, better to retain: std::cout << "before free" << std::endl; 58 | free(h_seq_lens); 59 | free(h_padding_offset); 60 | free(h_cum_seqlens); 61 | cudaFree(d_seq_lens); 62 | cudaFree(d_padding_offset); 63 | cudaFree(d_cum_seqlens); 64 | } 65 | -------------------------------------------------------------------------------- /src/layers/decoder/self_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/kernels/fused_decoder_self_attention.h" 3 | #include "src/kernels/fused_addresidual_norm.h" 4 | #include "src/kernels/rmsnorm_kernel.h" 5 | #include "src/kernels/add_residual.h" 6 | #include "src/layers/attention/masked_self_attention.h" 7 | #include "src/layers/ffn/ffn.h" 8 | #include "src/weights/llama/llama_weights.h" 9 | #include "src/utils/tensor.h" 10 | 11 | // layer weights is ready at the model_utils.h by loadweights in onellm.cpp, outside of the decoder 12 | template 13 | class LlamaSelfDecoder 14 | { 15 | private: 16 | int head_num; 17 | int kv_head_num; 18 | int head_size; 19 | int inter_size; 20 | int num_layer; 21 | int hidden_units; 22 | float rmsnorm_eps; 23 | 24 | cudaStream_t stream; 25 | cublasWrapper *cublas_wrapper; 26 | BaseAllocator *allocator; 27 | 28 | TensorWrapper *decoder_residual; 29 | 30 | LLaMASelfAttentionLayer *selfAttn; 31 | LLaMAFFNLayer *ffn; 32 | DataType data_type; 33 | 34 | public: 35 | LlamaSelfDecoder(int head_num, 36 | int kv_head_num, 37 | int head_size, 38 | int inter_size, 39 | int num_layer, 40 | const LLaMAAttentionStaticParams &attn_params, 41 | float rmsnorm_eps, 42 | cudaStream_t stream, 43 | cublasWrapper *cublas_wrapper, 44 | BaseAllocator *allocator) : head_num(head_num), 45 | head_size(head_size), 46 | inter_size(inter_size), 47 | hidden_units(head_num * head_size), 48 | num_layer(num_layer), 49 | rmsnorm_eps(rmsnorm_eps), 50 | data_type(getTensorType()), 51 | stream(stream), 52 | cublas_wrapper(cublas_wrapper), 53 | allocator(allocator) 54 | { 55 | selfAttn = new LLaMASelfAttentionLayer(head_num, 56 | kv_head_num, 57 | head_size, 58 | attn_params, 59 | stream, 60 | cublas_wrapper, 61 | allocator); 62 | 63 | ffn = new LLaMAFFNLayer(head_num, 64 | head_size, 65 | inter_size, 66 | stream, 67 | cublas_wrapper, 68 | allocator); 69 | }; 70 | void allocForForward(LLaMAAttentionDynParams &dyn_params); 71 | void freeBuf(); 72 | void forward(TensorMap &input_tensors, const std::vector *> &layerWeights, TensorMap &output_tensors, LLaMAAttentionDynParams &dyn_params); 73 | }; 74 | -------------------------------------------------------------------------------- /src/layers/decoder/context_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/kernels/build_casual_mask.h" 3 | #include "src/kernels/cal_paddingoffset.h" 4 | #include "src/kernels/fused_addresidual_norm.h" 5 | #include "src/kernels/add_residual.h" 6 | #include "src/kernels/rmsnorm_kernel.h" 7 | #include "src/layers/attention/context_attention.h" 8 | #include "src/layers/ffn/ffn.h" 9 | #include "src/weights/llama/llama_weights.h" 10 | #include "src/utils/tensor.h" 11 | 12 | // layer weights is ready at the model_utils.h 13 | template 14 | class LlamaContextDecoder 15 | { 16 | private: 17 | int head_num; 18 | int kv_head_num; 19 | int head_size; 20 | int inter_size; 21 | int num_layer; 22 | int hidden_units; 23 | float rmsnorm_eps; 
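// note: the members below are the per-forward intermediate buffers of the context (prefill) decoder,
// as the kernel headers included above suggest: attention_mask is the causal mask built by the
// build_casual_mask kernel, padding_offset and cum_seqlens are produced by cal_paddingoffset
// (the cumulative sequence lengths carry batch_size + 1 entries), and decoder_residual keeps the
// pre-rmsnorm hidden states that feed the later residual adds.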
24 | TensorWrapper *attention_mask; 25 | TensorWrapper *padding_offset; 26 | TensorWrapper *cum_seqlens; 27 | TensorWrapper *decoder_residual; 28 | cudaStream_t stream; 29 | cublasWrapper *cublas_wrapper; 30 | BaseAllocator *allocator; 31 | 32 | LLaMAContextAttentionLayer *ctxAttn; 33 | LLaMAFFNLayer *ffn; 34 | DataType data_type; 35 | 36 | public: 37 | LlamaContextDecoder(int head_num, 38 | int kv_head_num, 39 | int head_size, 40 | int inter_size, 41 | int num_layer, 42 | const LLaMAAttentionStaticParams &attn_params, 43 | float rmsnorm_eps, 44 | cudaStream_t stream, 45 | cublasWrapper *cublas_wrapper, 46 | BaseAllocator *allocator) : head_num(head_num), 47 | head_size(head_size), 48 | inter_size(inter_size), 49 | hidden_units(head_num * head_size), 50 | num_layer(num_layer), 51 | rmsnorm_eps(rmsnorm_eps), 52 | data_type(getTensorType()), 53 | stream(stream), 54 | cublas_wrapper(cublas_wrapper), 55 | allocator(allocator) 56 | { 57 | ctxAttn = new LLaMAContextAttentionLayer(head_num, 58 | kv_head_num, 59 | head_size, 60 | attn_params, 61 | stream, 62 | cublas_wrapper, 63 | allocator); 64 | 65 | ffn = new LLaMAFFNLayer(head_num, 66 | head_size, 67 | inter_size, 68 | stream, 69 | cublas_wrapper, 70 | allocator); 71 | }; 72 | void allocForForward(LLaMAAttentionDynParams &dyn_params); 73 | void freeBuf(); 74 | void forward(TensorMap &input_tensors, const std::vector *> &layerWeights, TensorMap &output_tensors, LLaMAAttentionDynParams &dyn_params); 75 | }; 76 | -------------------------------------------------------------------------------- /src/kernels/fused_transpose_and_remv_pad.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/cuda_debug_utils.cuh" 3 | #include "src/kernels/fused_transpose_and_remv_pad.h" 4 | // [bs,head nums,seqlen,head size]=>[bs,seqlen,head nums,head size]=>[num tokens,head nums,head size] 5 | // padding_offset.shape = [num_tokens] 6 | template 7 | __global__ void fused_transpose_reshape_remv_pad(T *src, 8 | T *dst, 9 | const int num_tokens, 10 | const int batch_size, 11 | const int seq_len, 12 | const int head_num, 13 | const int head_size, 14 | const int *padding_offset /*for remove padding*/) 15 | { 16 | int token_id = blockIdx.x; 17 | // map to input id 18 | int batch_id = (blockIdx.x + padding_offset[token_id]) / seq_len; 19 | int seq_id = (blockIdx.x + padding_offset[token_id]) % seq_len; 20 | // compute the offset of transpose and remove padding before or after 21 | int src_offset = batch_id * head_num * seq_len * head_size + seq_id * head_size; 22 | int dst_offset = token_id * head_num * head_size; 23 | 24 | for (int i = threadIdx.x; i < head_num * head_size; i += blockDim.x) 25 | { 26 | int head_id = i / head_size; 27 | int head_size_id = i % head_size; 28 | dst[dst_offset + i] = src[src_offset + head_id * seq_len * head_size + head_size_id]; 29 | } 30 | } 31 | template 32 | void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 33 | TensorWrapper *padding_offset, 34 | TensorWrapper *qkv_buf_wo_pad_1) 35 | { 36 | int batch_size = qkv_buf_w_pad->shape[0]; 37 | int head_num = qkv_buf_w_pad->shape[1]; 38 | int seq_len = qkv_buf_w_pad->shape[2]; 39 | int head_size = qkv_buf_w_pad->shape[3]; 40 | int num_tokens = qkv_buf_wo_pad_1->shape[0]; 41 | dim3 grid(num_tokens); 42 | dim3 block(std::min(head_num * head_size, 1024)); 43 | fused_transpose_reshape_remv_pad<<>>(qkv_buf_w_pad->data, 44 | qkv_buf_wo_pad_1->data, 45 | num_tokens, 46 | batch_size, 47 | seq_len, 48 | head_num, 49 | 
head_size, 50 | padding_offset->data); 51 | #ifdef PRINT_DATA 52 | print_data<<<1, 1>>>(qkv_buf_wo_pad_1->data); 53 | #else 54 | #endif 55 | } 56 | 57 | template void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 58 | TensorWrapper *padding_offset, 59 | TensorWrapper *qkv_buf_wo_pad_1); 60 | template void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 61 | TensorWrapper *padding_offset, 62 | TensorWrapper *qkv_buf_wo_pad_1); 63 | -------------------------------------------------------------------------------- /src/utils/model_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/models/basemodel.h" 6 | #include "src/models/llama/llama.h" 7 | #include "src/utils/macro.h" 8 | #include "src/memory/allocator/cuda_allocator.h" 9 | #include "src/models/llama/llama_params.h" 10 | // (RussWong) note: all LLM models are created in the header file, and I provided two ways, one is real weight model, the other is dummy weight model for functionality 11 | namespace llm { 12 | template 13 | BaseModel *CreateModelWithName(const std::string& model_name) { 14 | LLM_CHECK_WITH_INFO(model_name == "llama", "dont support other models except llama yet!"); 15 | int head_num = 32; 16 | int kv_head_num = 32; 17 | int head_size = 128; 18 | int inter_size = 11008; 19 | int num_layers = 32; 20 | int max_seq_len = 64; 21 | int vocab_size = 32000; 22 | int hidden_units = (head_num + 2 * kv_head_num) * head_size; 23 | int q_hidden_units = head_num * head_size; 24 | bool attn_bias = false; 25 | LLaMAAttentionStaticParams attn_static_params; 26 | attn_static_params.rotary_embedding_dim = 128; 27 | attn_static_params.rotary_embedding_base = 10000; 28 | attn_static_params.max_position_embeddings = 4096; 29 | attn_static_params.use_dynamic_ntk = false; // true is for dyn scaling rope 30 | cublasHandle_t cublas_handle; 31 | cublasLtHandle_t cublaslt_handle; 32 | cudaStream_t stream; 33 | cublasCreate(&cublas_handle); 34 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 35 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 36 | cublas_wrapper->setFP32GemmConfig(); 37 | BaseAllocator* allocator = new CudaAllocator; 38 | cudaDeviceProp deviceProp; 39 | cudaGetDeviceProperties(&deviceProp, 0); 40 | BaseModel *model = new Llama(head_num, 41 | kv_head_num, 42 | head_size, 43 | inter_size, 44 | num_layers, 45 | vocab_size, 46 | attn_static_params, 47 | max_seq_len, 48 | stream, 49 | cublas_wrapper, 50 | allocator, 51 | &deviceProp); 52 | return model; 53 | } 54 | 55 | template 56 | std::unique_ptr CreateDummyLLMModel(std::string tokenizer_file){ 57 | BaseModel *model = CreateModelWithName("llama"); 58 | model->loadTokenizer(tokenizer_file); 59 | model->loadWeightsFromDummy(); 60 | return std::unique_ptr (model); 61 | } 62 | 63 | template 64 | std::unique_ptr CreateRealLLMModel(std::string model_dir, std::string tokenizer_file){ 65 | BaseModel *model = CreateModelWithName("llama"); 66 | std::cout << "start creating model..." << "\n"; 67 | model->loadTokenizer(tokenizer_file); 68 | model->loadWeights(model_dir); 69 | std::cout << "finish creating model..." 
<< "\n"; 70 | return std::unique_ptr (model); 71 | } 72 | } // namespace llm 73 | 74 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(oneLLM LANGUAGES CXX CUDA) 3 | 4 | find_package(CUDA 10.0 REQUIRED) 5 | 6 | set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) 7 | 8 | 9 | list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) 10 | find_package(CUDA REQUIRED) 11 | 12 | # setting compiler flags 13 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 15 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall") 16 | 17 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ 18 | -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ 19 | -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ 20 | -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ 21 | -gencode=arch=compute_86,code=\\\"sm_86,compute_86\\\" \ 22 | ") 23 | # -rdc=true") # not sure the effect of this option, retain it temply 24 | 25 | set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) 26 | message("-- Assign GPU architecture (sm=70 75 80 86)") 27 | 28 | set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") 29 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") 30 | set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") 31 | 32 | message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS}) 33 | 34 | set(CMAKE_CXX_STANDARD 11) 35 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 36 | 37 | if(CMAKE_CXX_STANDARD STREQUAL "11") 38 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") 39 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") 40 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") 41 | endif() 42 | 43 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") 44 | set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") 45 | 46 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 47 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 48 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 49 | 50 | set(COMMON_HEADER_DIRS 51 | ${PROJECT_SOURCE_DIR} 52 | ${CUDA_PATH}/include 53 | ) 54 | 55 | set(COMMON_LIB_DIRS 56 | ${CUDA_PATH}/lib64 57 | ) 58 | 59 | include_directories( 60 | ${COMMON_HEADER_DIRS} 61 | ) 62 | 63 | link_directories( 64 | ${COMMON_LIB_DIRS} 65 | ) 66 | option (PERF 67 | "measure model inference performance" 68 | OFF 69 | ) 70 | option (PRINT_DATA 71 | "print kernel output to debug" 72 | OFF 73 | ) 74 | option (SAVE_DATA 75 | "save kernel output to debug" 76 | OFF 77 | ) 78 | if (PERF) 79 | add_compile_options(-DPERF) 80 | endif() 81 | if (PRINT_DATA) 82 | add_compile_options(-DPRINT_DATA) 83 | endif() 84 | if (SAVE_DATA) 85 | add_compile_options(-DSAVE_DATA) 86 | endif() 87 | #cmake .. -DPRINT_DATA=ON && make 88 | #cmake .. -DPRINT_DATA=ON -DSAVE_DATA=ON && make 89 | #cmake .. -DPERF=ON && make 90 | #cmake .. 
&& make 91 | file(GLOB_RECURSE LLM_CXX_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cc) 92 | file(GLOB_RECURSE LLM_CUDA_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cu) 93 | 94 | add_library(llmengine OBJECT 95 | ${LLM_CXX_SOURCES} 96 | ${LLM_CUDA_SOURCES} 97 | ) 98 | 99 | add_subdirectory(src) 100 | add_subdirectory(tests) 101 | # add_subdirectory(examples) 102 | 103 | add_executable(main user_entry.cpp) 104 | target_link_libraries(main PUBLIC -lcublas -lcudart -lcudadevrt llmengine) 105 | -------------------------------------------------------------------------------- /src/utils/debug_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/macro.h" 9 | // (RussWong)note: overloaded 3 different function for saving intermediate output tensor to debug 10 | // because LLMs have many layers, so I provide some overloaded function to specify layer id to print specify layer output tensor to debug 11 | // after you save tensor into specified file ,you can turn to tests/unitests/test_data_compare.cu to specify file path to compare res with HF. 12 | template 13 | void save_tensor(TensorWrapper* input, std::string filename){ 14 | int Bm = 0; 15 | int Bk = 0; 16 | if (input->shape.size() == 4){ 17 | Bm = input->shape[0] * input->shape[1]; 18 | Bk = input->shape[3] * input->shape[2]; 19 | } else if (input->shape.size() == 3){ 20 | Bm = input->shape[0]; 21 | Bk = input->shape[1] * input->shape[2]; 22 | } else if (input->shape.size() == 2){ 23 | Bm = input->shape[0]; 24 | Bk = input->shape[1]; 25 | } 26 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 27 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 28 | std::ofstream F; 29 | std::cout << "saving intermediate tensor in " << filename << "\n"; 30 | F.open("/home/data/"+ filename, std::ofstream::binary); 31 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 32 | F.close(); 33 | } 34 | 35 | template 36 | void save_tensor(TensorWrapper* input, std::string filename, TensorWrapper* layer_id){ 37 | int id = layer_id->getVal(); 38 | if (id > 2) { 39 | return; 40 | } 41 | int Bm = 0; 42 | int Bk = 0; 43 | if (input->shape.size() == 4){ 44 | Bm = input->shape[0] * input->shape[1]; 45 | Bk = input->shape[3] * input->shape[2]; 46 | } else if (input->shape.size() == 3){ 47 | Bm = input->shape[0]; 48 | Bk = input->shape[1] * input->shape[2]; 49 | } else if (input->shape.size() == 2){ 50 | Bm = input->shape[0]; 51 | Bk = input->shape[1]; 52 | } 53 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 54 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 55 | std::ofstream F; 56 | std::cout << "saving intermediate tensor in " << filename << "\n"; 57 | F.open("/home/data/" + std::to_string(id) + "_" + filename, std::ofstream::binary); 58 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 59 | F.close(); 60 | } 61 | 62 | template 63 | void save_tensor(TensorWrapper* input, std::string filename, int layer_id){ 64 | int id = layer_id; 65 | if (id > 2) { 66 | return; 67 | } 68 | int Bm = 0; 69 | int Bk = 0; 70 | if (input->shape.size() == 4){ 71 | Bm = input->shape[0] * input->shape[1]; 72 | Bk = input->shape[3] * input->shape[2]; 73 | } else if (input->shape.size() == 3){ 74 | Bm = input->shape[0]; 75 | Bk = input->shape[1] * input->shape[2]; 76 | } else if (input->shape.size() == 2){ 77 | Bm = input->shape[0]; 78 
| Bk = input->shape[1]; 79 | } 80 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 81 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 82 | std::ofstream F; 83 | std::cout << "saving intermediate tensor in " << filename << "\n"; 84 | F.open("/home/data/" + std::to_string(id) + "_" + filename, std::ofstream::binary); 85 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 86 | F.close(); 87 | } 88 | -------------------------------------------------------------------------------- /tests/unittests/test_act.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include "src/kernels/act_kernel.h" 8 | // (RussWong)note: not sure CPU implementation is absolutely right and the GPU kernel is right compared with HF. 9 | // when you are implementing LLMs inference on CPU, you can reuse the CPU kernel and test its correctness 10 | // (RussWong)note: 11 | // `./test_act 1` to test half GPU kernel 12 | // `./test_act` to test fp32 GPU kernel 13 | template 14 | void CPUSwiGLU(T* input, T* output, int batch_size, int intermedia_size){ 15 | float silu_out = 0.0f; 16 | for(int batch_id = 0; batch_id < batch_size; batch_id++){ 17 | for(int i = 0; i < intermedia_size; i++) { 18 | int offset1 = batch_id * 2 * intermedia_size + i; 19 | int offset2 = batch_id * 2 * intermedia_size + i + intermedia_size; 20 | int out_offset = batch_id * intermedia_size + i; 21 | silu_out = (float)input[offset1] / (1.0f + expf(-1 * (float)input[offset1])); 22 | output[out_offset] = static_cast(silu_out * (float)input[offset2]); 23 | } 24 | } 25 | } 26 | template 27 | bool CheckResult(T* CPUoutput, T* GPUoutput, int output_size) { 28 | for(int i = 0; i < output_size; i++) { 29 | if(fabs((float)CPUoutput[i] - (float)GPUoutput[i]) > 1e-6){ 30 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, (float)CPUoutput[i], (float)GPUoutput[i]); 31 | } 32 | } 33 | return true; 34 | } 35 | 36 | template 37 | void test_act(int batch_size, int intermedia_size, int input_size , int output_size) { 38 | T* h_input; 39 | T* d_input; 40 | h_input = (T*)malloc(sizeof(T) * input_size); 41 | cudaMalloc((void**)&d_input, sizeof(T) * input_size); 42 | T* h_output; 43 | T* d_output; 44 | h_output = (T*)malloc(sizeof(T) * output_size); 45 | cudaMalloc((void**)&d_output, sizeof(T) * output_size); 46 | for(int i = 0; i < input_size; i++) { // initialize host data 47 | h_input[i] = (T)1; 48 | } 49 | cudaMemcpy(d_input, h_input, sizeof(T) * input_size, cudaMemcpyHostToDevice); 50 | DataType type = getTensorType(); 51 | TensorWrapper* input_tensor = new TensorWrapper(GPU, type, {batch_size, 2, intermedia_size}, d_input); 52 | TensorWrapper* output_tensor = new TensorWrapper(GPU, type, {batch_size, intermedia_size}, d_output); 53 | launchAct(input_tensor, output_tensor); 54 | cudaMemcpy(h_output, d_output, sizeof(T) * output_size, cudaMemcpyDeviceToHost); 55 | T* CPU_output = (T*)malloc(sizeof(T) * output_size); 56 | CPUSwiGLU(h_input, CPU_output, batch_size, intermedia_size); 57 | bool is_true = CheckResult(CPU_output, h_output, output_size); 58 | if(is_true){ 59 | printf("test passed"); 60 | } else { 61 | printf("test failed"); 62 | } 63 | 64 | free(h_input); 65 | free(h_output); 66 | free(CPU_output); 67 | cudaFree(d_input); 68 | cudaFree(d_output); 69 | } 70 | 71 | int main(int argc, char** argv) { 72 | constexpr int batch_size = 
16; 73 | constexpr int intermedia_size = 11008; 74 | constexpr int input_size = batch_size * intermedia_size * 2; 75 | constexpr int output_size = batch_size * intermedia_size; 76 | if (argv[1]){ 77 | test_act(batch_size, intermedia_size, input_size, output_size); 78 | } else { 79 | test_act(batch_size, intermedia_size, input_size, output_size); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /src/kernels/sampling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/sampling.h" 3 | // mini-softmax + curand_sample 4 | // input: [bs, K] from topK output 5 | // output: [bs] 6 | // (Russwong)note: beamsearch不存在sampling,所以bsxbm = bs 7 | template 8 | __global__ void SamplingKernel(int* topk_id, 9 | T* topk_val, //[bs, K] from topK 10 | int* output_id, //[bs] 11 | int* seqlen, //cumulated seq len,[bs] 12 | bool* is_finished, //[bs] 13 | int K, 14 | int rand_num, // step 15 | int end_id, // when initialize llama model, we will init it, and this is a fixed val 16 | int vocab_size) 17 | { 18 | int batch_id = blockIdx.x; 19 | int bid = batch_id; 20 | int tid = threadIdx.x; 21 | int offset = batch_id * K + tid; 22 | T max_val = topk_val[batch_id * K]; // max val is the top of the buffer, because topK 23 | topk_val[offset] = (T)(expf((float)topk_val[offset] - (float)max_val)); 24 | __shared__ float thredhold, sum; 25 | if(tid == 0) { 26 | sum = 0.0f; 27 | for(int i = 0; i < K; i++) { 28 | sum += (float)topk_val[batch_id * K + i]; 29 | } 30 | curandState_t state; 31 | // (Russwong)note: curand_init API only support ulonglong data type 32 | curand_init((unsigned long long)rand_num,(unsigned long long)bid, (unsigned long long)0, &state); 33 | thredhold = (float)curand_uniform(&state) * sum; // for a block 34 | output_id[bid] = topk_id[bid * K] % vocab_size; 35 | for(int i = 0; i < K; i++) { 36 | thredhold = thredhold - (float)topk_val[batch_id * K + i]; 37 | if(thredhold < 0) { 38 | output_id[bid] = topk_id[batch_id * K + i] % vocab_size; 39 | break; 40 | } 41 | } 42 | seqlen[bid] = is_finished[bid] ? seqlen[bid] : seqlen[bid] + 1; 43 | is_finished[bid] = output_id[bid] == end_id ? 
1 : 0; 44 | } 45 | } 46 | 47 | template 48 | void launchSampling(TensorWrapper* topk_id, 49 | TensorWrapper* topk_val, 50 | TensorWrapper* seqlen, 51 | TensorWrapper* is_finished, 52 | TensorWrapper* output_id, 53 | IntDict& params) { 54 | int batch_size = topk_id->shape[0]; 55 | int K = topk_id->shape[1]; 56 | int vocab_size = params["vocab_size"]; 57 | int step = params["step"]; 58 | int end_id = params["end_id"]; 59 | 60 | dim3 grid(batch_size); 61 | dim3 block(K); // K is small, so directly allocate K threads is enough 62 | SamplingKernel<<>>( 63 | topk_id->data, 64 | topk_val->data, 65 | output_id->data, 66 | seqlen->data, 67 | is_finished->data, 68 | K, 69 | step, 70 | end_id, 71 | vocab_size 72 | ); 73 | } 74 | 75 | template void launchSampling(TensorWrapper* topk_id, 76 | TensorWrapper* topk_val, 77 | TensorWrapper* seqlen, 78 | TensorWrapper* is_finished, 79 | TensorWrapper* output_id, 80 | IntDict& params); 81 | 82 | template void launchSampling(TensorWrapper* topk_id, 83 | TensorWrapper* topk_val, 84 | TensorWrapper* seqlen, 85 | TensorWrapper* is_finished, 86 | TensorWrapper* output_id, 87 | IntDict& params); 88 | -------------------------------------------------------------------------------- /tests/unittests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(embedding 2 | test_input_embedding.cu 3 | ) 4 | target_link_libraries( 5 | embedding PUBLIC 6 | -lcudart 7 | -lcudadevrt 8 | embeddingFunctor) 9 | 10 | add_executable(rms_norm 11 | test_rmsnorm.cu 12 | ) 13 | target_link_libraries( 14 | rms_norm PUBLIC 15 | -lcudart 16 | -lcudadevrt 17 | rmsnorm) 18 | 19 | add_executable(paddingoffset 20 | test_cal_paddingoffset.cu 21 | ) 22 | target_link_libraries( 23 | paddingoffset PUBLIC 24 | -lcudart 25 | -lcudadevrt 26 | cal_paddingoffset) 27 | 28 | add_executable(causalmask 29 | test_casual_mask.cu 30 | ) 31 | target_link_libraries( # Libs for test_build_casual_mask 32 | causalmask PUBLIC 33 | -lcudart 34 | -lcudadevrt 35 | build_casual_mask) 36 | 37 | add_executable(testlinear 38 | test_linear.cu 39 | ) 40 | target_link_libraries( # Libs for test_build_casual_mask 41 | testlinear PUBLIC 42 | -lcudart 43 | -lcudadevrt 44 | linear) 45 | 46 | add_executable(debug 47 | test_data_compare.cu 48 | ) 49 | target_link_libraries( # Libs for test_build_casual_mask 50 | debug PUBLIC 51 | -lcudart 52 | -lcudadevrt) 53 | 54 | add_executable(bmm 55 | test_bmm.cu 56 | ) 57 | target_link_libraries( # Libs for test_build_casual_mask 58 | bmm PUBLIC 59 | -lcudart 60 | -lcudadevrt 61 | linear) 62 | 63 | add_executable(biasRope 64 | test_bias_and_RoPE.cu 65 | ) 66 | target_link_libraries( # Libs for test_qkv_bias_and_rope 67 | biasRope PUBLIC 68 | -lcudart 69 | -lcudadevrt 70 | qkv_bias_and_rope) 71 | 72 | add_executable(test_concat_kv 73 | test_concat_kv.cu 74 | ) 75 | target_link_libraries( # Libs for test_qkv_bias_and_rope 76 | test_concat_kv PUBLIC 77 | -lcudart 78 | -lcudadevrt 79 | concat_kv) 80 | 81 | add_executable(test_repeat_kv 82 | test_repeat_kv.cu 83 | ) 84 | target_link_libraries( 85 | test_repeat_kv PUBLIC 86 | -lcudart 87 | -lcudadevrt 88 | repeat_kv) 89 | 90 | add_executable(test_mask_softmax 91 | test_mask_softmax.cu 92 | ) 93 | target_link_libraries( 94 | test_mask_softmax PUBLIC 95 | -lcudart 96 | -lcudadevrt 97 | mask_softmax) 98 | 99 | add_executable(test_fused_trans_remv_pad 100 | test_fused_trans_remv_pad.cu 101 | ) 102 | target_link_libraries( 103 | test_fused_trans_remv_pad PUBLIC 104 | -lcudart 105 | 
-lcudadevrt 106 | fused_transpose_and_remv_pad) 107 | 108 | add_executable(test_fused_addresidual_norm 109 | test_fused_addresidual_norm.cu 110 | ) 111 | target_link_libraries( 112 | test_fused_addresidual_norm PUBLIC 113 | -lcudart 114 | -lcudadevrt 115 | fused_addresidual_norm) 116 | 117 | add_executable(test_act 118 | test_act.cu 119 | ) 120 | target_link_libraries( 121 | test_act PUBLIC 122 | -lcudart 123 | -lcudadevrt 124 | act) 125 | 126 | add_executable(test_topk 127 | test_topk.cu 128 | ) 129 | target_link_libraries( 130 | test_topk PUBLIC 131 | -lcudart 132 | -lcudadevrt 133 | topk) 134 | 135 | add_executable(test_fused_decoder_attention 136 | test_fused_decoder_attention.cu 137 | ) 138 | target_link_libraries( 139 | test_fused_decoder_attention PUBLIC 140 | -lcudart 141 | -lcudadevrt 142 | fused_decoder_self_attention) 143 | 144 | add_executable(test_sampling 145 | test_sampling.cu 146 | ) 147 | target_link_libraries( 148 | test_sampling PUBLIC 149 | -lcudart 150 | -lcudadevrt 151 | sampling) 152 | 153 | add_executable(test_residual 154 | test_residual.cu 155 | ) 156 | target_link_libraries( 157 | test_residual PUBLIC 158 | -lcudart 159 | -lcudadevrt 160 | add_residual) 161 | -------------------------------------------------------------------------------- /src/layers/ffn/ffn.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/layers/ffn/ffn.h" 3 | #include "src/utils/debug_utils.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | LLaMAFFNLayer::LLaMAFFNLayer(int head_num, 7 | int head_size, 8 | int inter_size, 9 | cudaStream_t stream, 10 | cublasWrapper* cublas_wrapper, 11 | BaseAllocator* allocator): 12 | head_num(head_num), 13 | head_size(head_size), 14 | inter_size(inter_size), 15 | stream(stream), 16 | cublas_wrapper(cublas_wrapper), 17 | allocator(allocator), 18 | hidden_units(head_num * head_size) {} 19 | 20 | template 21 | void LLaMAFFNLayer::allocForForward(LLaMAAttentionDynParams& params){ 22 | int num_tokens = params.num_tokens; 23 | DataType type = getTensorType(); 24 | SwiGLU_input = new TensorWrapper(Device::GPU, type, {num_tokens, 2, inter_size}); 25 | down_proj_input = new TensorWrapper(Device::GPU, type, {num_tokens, inter_size}); 26 | SwiGLU_input->data = allocator->Malloc(SwiGLU_input->data, sizeof(T) * num_tokens * 2 * inter_size, false); 27 | down_proj_input->data = allocator->Malloc(down_proj_input->data, sizeof(T) * num_tokens * inter_size, false); 28 | } 29 | template 30 | void LLaMAFFNLayer::allocForForward(int batch_size){ 31 | DataType type = getTensorType(); 32 | SwiGLU_input = new TensorWrapper(Device::GPU, type, {batch_size, 2, inter_size}); 33 | down_proj_input = new TensorWrapper(Device::GPU, type, {batch_size, inter_size}); 34 | SwiGLU_input->data = allocator->Malloc(SwiGLU_input->data, sizeof(T) * batch_size * 2 * inter_size, false); 35 | down_proj_input->data = allocator->Malloc(down_proj_input->data, sizeof(T) * batch_size * inter_size, false); 36 | } 37 | template 38 | void LLaMAFFNLayer::freeBuf(){ 39 | allocator->Free(SwiGLU_input->data); 40 | DeviceSyncAndCheckCudaError(); 41 | allocator->Free(down_proj_input->data); 42 | DeviceSyncAndCheckCudaError(); 43 | } 44 | template 45 | void LLaMAFFNLayer::forward(TensorMap& inputs, TensorMap& outputs, LLaMAFFNWeights& weights, LLaMAAttentionDynParams& params){ 46 | if (params.num_tokens > 0) { 47 | allocForForward(params); 48 | } else { 49 | 
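// note: this branch is taken in the self-decoder path (num_tokens is not set, one generated token
// per request per step), so the SwiGLU/down-proj intermediate buffers are sized by batch_size via
// the allocForForward(int batch_size) overload above, rather than by num_tokens as in prefill.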
allocForForward(params.batch_size); 50 | } 51 | Tensor* ffn_input = inputs["ffn_input"]; 52 | Tensor* ffn_output = outputs["ffn_output"]; 53 | count += 1; 54 | bool is_ctx = params.is_ctx; 55 | #ifdef SAVE_DATA 56 | save_tensor(ffn_input->as(), "ffn_input.bin", count); 57 | #else 58 | #endif 59 | // 1.fusedGateUp proj 60 | launchLinearGemm(ffn_input->as(), weights.gateAndup, SwiGLU_input, cublas_wrapper, false, true); 61 | DeviceSyncAndCheckCudaError(); 62 | // single up proj linear, deprecated due to fuse gate and up into fusedGateAndup 63 | // launchLinearGemm(ffn_input->as(), weights.up, SwiGLU_input, cublas_wrapper, false, false, true); 64 | #ifdef SAVE_DATA 65 | save_tensor(SwiGLU_input ,"swiglu_input.bin", count); 66 | #else 67 | #endif 68 | // 2.swiGLU 69 | launchAct(SwiGLU_input, down_proj_input);// down_proj_input maybe can reuse swiglu_input buf, will validate it later 70 | DeviceSyncAndCheckCudaError(); 71 | #ifdef SAVE_DATA 72 | save_tensor(down_proj_input ,"down_proj_input.bin", count); 73 | #else 74 | #endif 75 | // 3.down proj 76 | launchLinearGemm(down_proj_input, weights.down, ffn_output->as(), cublas_wrapper, false, true); 77 | DeviceSyncAndCheckCudaError(); 78 | this->freeBuf(); 79 | }; 80 | 81 | template class LLaMAFFNLayer; 82 | template class LLaMAFFNLayer; 83 | -------------------------------------------------------------------------------- /tests/unittests/test_repeat_kv.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/repeat_kv.h" 10 | // (RussWong)note: 11 | // there is no repeat kv cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes 13 | // `./test_repeat_kv` to test fp32 GPU kernel 14 | int main() { 15 | const int batch_size = 1; 16 | const int head_num = 2; 17 | const int kv_head_num = 2; 18 | const int max_seq_len = 4; 19 | const int max_k_len = 2; 20 | const int head_size = 2; 21 | const int num_layers = 2; 22 | const int k_size = num_layers * batch_size * kv_head_num * max_seq_len * head_size; 23 | const int out_k_size = batch_size * head_num * max_k_len * head_size; 24 | float* h_k; 25 | float* d_k; 26 | h_k = (float*)malloc(sizeof(float) * k_size); 27 | cudaMalloc((void**)&d_k, sizeof(float) * k_size); 28 | float* h_v; 29 | float* d_v; 30 | h_v = (float*)malloc(sizeof(float) * k_size); 31 | cudaMalloc((void**)&d_v, sizeof(float) * k_size); 32 | int* h_ctx_len; 33 | int* d_ctx_len; 34 | h_ctx_len = (int*)malloc(sizeof(int) * batch_size); 35 | cudaMalloc((void**)&d_ctx_len, sizeof(int) * batch_size); 36 | float* h_trans_k; 37 | float* d_trans_k; 38 | h_trans_k = (float*)malloc(sizeof(float) * out_k_size); 39 | cudaMalloc((void**)&d_trans_k, sizeof(float) * out_k_size); 40 | float* h_trans_v; 41 | float* d_trans_v; 42 | h_trans_v = (float*)malloc(sizeof(float) * out_k_size); 43 | cudaMalloc((void**)&d_trans_v, sizeof(float) * out_k_size); 44 | 45 | for(int i = 0; i < k_size; i++) { 46 | h_v[i] = i; 47 | h_k[i] = i; 48 | } 49 | int* h_layer_id = (int*)malloc(sizeof(int)*batch_size); 50 | 51 | for(int i = 0; i < batch_size; i++) { 52 | h_ctx_len[i] = 2; 53 | h_layer_id[i] = 0; 54 | } 55 | 56 | cudaMemcpy(d_k, h_k, sizeof(float) * k_size, cudaMemcpyHostToDevice); 57 | cudaMemcpy(d_v, h_v, sizeof(float) * k_size, cudaMemcpyHostToDevice); 58 | cudaMemcpy(d_ctx_len, h_ctx_len, sizeof(int) * batch_size, 
cudaMemcpyHostToDevice); 59 | DataType type = getTensorType(); 60 | DataType type_int = getTensorType(); 61 | TensorWrapper* in_k = new TensorWrapper(Device::GPU, type, {num_layers, batch_size, kv_head_num, max_seq_len, head_size}, d_k); 62 | TensorWrapper* in_v = new TensorWrapper(Device::GPU, type, {num_layers, batch_size, kv_head_num, max_seq_len, head_size}, d_v); 63 | TensorWrapper* ctx_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, d_ctx_len); 64 | TensorWrapper* out_k = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_k_len, head_size}, d_trans_k); 65 | TensorWrapper* out_v = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_k_len, head_size}, d_trans_v); 66 | TensorWrapper* layer_id = new TensorWrapper(Device::CPU, type_int, {batch_size}, h_layer_id); 67 | 68 | std::cout << "before launch repeat kv kernel" << std::endl; 69 | launchRepeatKVCache(in_k, in_v, ctx_len, layer_id, out_k, out_v); 70 | std::cout << "after launch repeat kv kernel" << std::endl; 71 | std::cout << "cuda memcpy device to host" << std::endl; 72 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 73 | cudaMemcpy(h_trans_k, out_k->data, sizeof(float) * out_k_size, cudaMemcpyDeviceToHost); 74 | for(int i = 0; i < out_k_size; i++) { 75 | printf("k trans[%d] = %f\n", i, h_trans_k[i]); 76 | } 77 | // debug info, better to retain: std::cout << "before free" << std::endl; 78 | free(h_k); 79 | free(h_v); 80 | free(h_ctx_len); 81 | free(h_trans_k); 82 | free(h_trans_v); 83 | free(h_layer_id); 84 | cudaFree(d_k); 85 | cudaFree(d_v); 86 | cudaFree(d_ctx_len); 87 | cudaFree(d_trans_k); 88 | cudaFree(d_trans_v); 89 | } 90 | -------------------------------------------------------------------------------- /src/layers/attention/masked_self_attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/debug_utils.h" 3 | #include "src/layers/attention/masked_self_attention.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | LLaMASelfAttentionLayer::LLaMASelfAttentionLayer( 7 | int head_num, 8 | int kv_head_num, 9 | int head_size, 10 | LLaMAAttentionStaticParams attn_params, 11 | cudaStream_t stream, 12 | cublasWrapper* cublas_wrapper, 13 | BaseAllocator* allocator): 14 | head_num(head_num), 15 | kv_head_num(kv_head_num), 16 | head_size(head_size), 17 | stream(stream), 18 | cublas_wrapper(cublas_wrapper), 19 | allocator(allocator), 20 | hidden_units(head_num * head_size), 21 | attn_static_params(attn_params), 22 | // TODO: check kv_head_num is divided by haed_num 23 | q_head_per_kv(head_num / kv_head_num), 24 | scale(float(1 / sqrt(head_size))){} 25 | 26 | template 27 | void LLaMASelfAttentionLayer::allocForForward(LLaMAAttentionDynParams& params) { 28 | int batch_size = params.batch_size; 29 | int num_tokens = params.num_tokens; 30 | int max_q_len = params.max_q_len; 31 | int max_k_len = params.max_k_len; 32 | DataType type = getTensorType(); 33 | const int qkv_head_num = head_num + 2 * kv_head_num; 34 | // (RussWong) note: 当前step的q k v的shape里面step或seqlen都是1,之前step的kv在做gemv的时候直接从kv cache拿 35 | qkv_buf = new TensorWrapper(Device::GPU, type, {batch_size, qkv_head_num, head_size}); 36 | mha_output = new TensorWrapper(Device::GPU, type, {batch_size, hidden_units}); 37 | 38 | qkv_buf->data = allocator->Malloc(qkv_buf->data, sizeof(T) * batch_size * qkv_head_num * 
head_size, false); 39 | mha_output->data = allocator->Malloc( 40 | mha_output->data, sizeof(T) * batch_size * hidden_units, false); 41 | } 42 | template 43 | void LLaMASelfAttentionLayer::freeBuf(){ 44 | allocator->Free(qkv_buf->data); 45 | DeviceSyncAndCheckCudaError(); 46 | allocator->Free(mha_output->data); 47 | DeviceSyncAndCheckCudaError(); 48 | } 49 | // (RussWong) note: params order of launcher function in LaMAContextAttentionLayer::forward: (input[Tensor], input[Tensor],...,weight[Weight], output[*]) 50 | template 51 | void LLaMASelfAttentionLayer::forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params) 52 | { 53 | // (RussWong) note: allocate intermediate buf of the layer forward 54 | allocForForward(params); 55 | //1. qkv linear 56 | //shape:[bs,1,q_hidden_units] * [q_hidden_units, hidden_units] = [bs,1,hidden_units] 57 | Tensor* attention_input = inputs["attention_input"]; 58 | launchLinearGemm(attention_input->as(), weights.qkv, qkv_buf, cublas_wrapper, false, true); 59 | DeviceSyncAndCheckCudaError(); 60 | //2. biasRope 61 | Tensor* attention_output = outputs["attention_output"]; 62 | // kv cache shape = [bs, kv head num, max seq len head size] 63 | Tensor* key_cache = outputs["all_k_cache"]; 64 | Tensor* value_cache = outputs["all_v_cache"]; 65 | Tensor* finished = inputs["finished"]; 66 | Tensor* step = inputs["step"];//[1] onCPU 67 | Tensor* layer_id = inputs["layer_id"];//[1] onCPU 68 | launchRoPE(qkv_buf, step->as(), attn_static_params); 69 | DeviceSyncAndCheckCudaError(); 70 | // 3. fused masked mha 71 | launchDecoderMaskedMHA(qkv_buf, weights.qkv, layer_id->as(), key_cache->as(), value_cache->as(), finished->as(), step->as(), mha_output, attn_static_params); 72 | DeviceSyncAndCheckCudaError(); 73 | #ifdef SAVE_DATA 74 | save_tensor(mha_output ,"self_decoder_qk_v_after_bmm.bin", layer_id->as()); 75 | #else 76 | #endif 77 | // 4. attention output linear 78 | launchLinearGemm(mha_output, weights.output, attention_output->as(), cublas_wrapper, false, true); 79 | DeviceSyncAndCheckCudaError(); 80 | #ifdef SAVE_DATA 81 | save_tensor(mha_output ,"self_decoder_outlinear_out.bin", layer_id->as()); 82 | #else 83 | #endif 84 | this->freeBuf(); 85 | } 86 | 87 | template class LLaMASelfAttentionLayer; 88 | template class LLaMASelfAttentionLayer; 89 | -------------------------------------------------------------------------------- /tests/unittests/test_data_compare.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include 9 | #include "src/utils/macro.h" 10 | #include "src/utils/debug_utils.h" 11 | // (RussWong)note: 12 | // this test is for debug, to compare intermediate tensor and HF intermediate tensor 13 | // and the intermediate tensor will be saved in file when you compile the proj by `cmake .. 
-DSAVE_DATA=ON && make -j8` 14 | // before run, you should change the path to your local right dir 15 | // `./debug` to compare 16 | 17 | std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) 18 | { 19 | size_t dim0 = 1, dim1 = 1; 20 | if (shape.size() > 2) { 21 | dim0 = shape[0] * shape[1]; 22 | dim1 = shape[2]; 23 | } 24 | 25 | if (shape.size() == 2) { 26 | dim0 = shape[0]; 27 | dim1 = shape[1]; 28 | } 29 | size_t size = dim0 * dim1; 30 | if (size == 0) { 31 | std::cout << "shape is zero, skip loading weight from file: " << filename << std::endl; 32 | return std::vector(); 33 | } 34 | 35 | std::vector host_array(size); 36 | std::ifstream in(filename, std::ios::in | std::ios::binary); 37 | if (!in.is_open()) { 38 | std::cout << "file" << filename << "cannot be opened, loading model fails!" << std::endl; 39 | return std::vector(); 40 | } 41 | 42 | size_t loaded_data_size = sizeof(float) * size; 43 | in.seekg(0, in.end); 44 | in.seekg(0, in.beg); 45 | 46 | std::cout << "Read " << std::to_string(loaded_data_size) << " bytes from " << filename << std::endl; 47 | in.read((char*)host_array.data(), loaded_data_size); 48 | 49 | size_t in_get_size = in.gcount(); 50 | if (in_get_size != loaded_data_size) { 51 | return std::vector(); 52 | } 53 | in.close(); 54 | // If we succeed, return an array with values. 55 | return host_array; 56 | } 57 | void internalFunc(float* ptr, std::vector shape, std::string filename) { 58 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 59 | if (host_array.empty()) { 60 | std::cout << "[warning] data from file is empty!!" << "\n"; 61 | return; 62 | } 63 | // copy host_array to our defined ptr 64 | memcpy(ptr, host_array.data(), host_array.size()); 65 | return; 66 | } 67 | void loadWeights(float* ptr1, std::string weight_path, int shape0, int shape1) // weighttype参数比较多余 68 | { 69 | // load attn output 70 | internalFunc(ptr1, {(size_t)shape0, (size_t)shape1}, weight_path); 71 | 72 | } 73 | void loadWeights_trans(float* ptr1, std::string weight_path, int shape0, int shape1) // weighttype参数比较多余 74 | { 75 | // load attn output 76 | internalFunc(ptr1, {(size_t)shape0, (size_t)shape1}, weight_path); 77 | 78 | } 79 | 80 | bool CheckResult(float* CPUoutput, float* GPUoutput, int in_size) { 81 | for(int i = 0; i < in_size; i++) { 82 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 83 | printf("the %dth res is wrong, onellm = %f, trans = %f\n", i, CPUoutput[i], GPUoutput[i]); 84 | } 85 | } 86 | return true; 87 | } 88 | // 1.for example: the path of two data files is below, and you should replace L101&L102 with the two 89 | // /home/data/trans/q_buf_after_rope_trans.bin 90 | // /home/data/onellm/q_buf_after_rope.bin 91 | // 2.And you should change the L93&L94 to the right data size according to your tensor shape of the data file 92 | int main(int argc, char *argv[]) { 93 | int shape0 = 1; // TO MODIFY before run 94 | int shape1 = 4096; // TO MODIFY before run 95 | 96 | int in_size = shape0 * shape1; 97 | 98 | float* d_in = (float*) malloc(sizeof(float) * in_size); 99 | float* d_in_trans = (float*) malloc(sizeof(float) * in_size); 100 | 101 | loadWeights(d_in, "/home/data/onellm/0_self_decoder_qk_v_after_bmm.bin", shape0, shape1); // TO MODIFY 102 | loadWeights_trans(d_in_trans, "/home/data/trans/self_decoder_qk_v_buf_after_bmm_trans.bin", shape0, shape1); // TO MODIFY 103 | std::cout << "====intermediate tensor comparison result====" << "\n"; 104 | CheckResult(d_in, d_in_trans, shape0 * shape1); 105 | 106 | free(d_in); 107 | 
free(d_in_trans); 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/layers/decoder/self_decoder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/macro.h" 3 | #include "src/layers/decoder/self_decoder.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | void LlamaSelfDecoder::allocForForward(LLaMAAttentionDynParams& params) 7 | { 8 | DataType type = getTensorType(); 9 | int batch_size = params.batch_size; 10 | decoder_residual = new TensorWrapper(Device::GPU, type, {batch_size, hidden_units}); 11 | decoder_residual->data = allocator->Malloc(decoder_residual->data, sizeof(T) * batch_size * hidden_units, false); 12 | 13 | } 14 | template 15 | void LlamaSelfDecoder::freeBuf() 16 | { 17 | allocator->Free(decoder_residual->data); 18 | } 19 | template 20 | void LlamaSelfDecoder::forward(TensorMap& input_tensors, const std::vector*>& layerWeights, TensorMap& output_tensors, LLaMAAttentionDynParams& dyn_params) 21 | { 22 | allocForForward(dyn_params); 23 | Tensor* decoder_input = input_tensors["decoder_input"]; 24 | Tensor* step = input_tensors["step"]; 25 | Tensor* finished = input_tensors["finished"]; 26 | Tensor* decoder_output = output_tensors["decoder_output"]; 27 | Tensor* all_k_cache = output_tensors["all_k_cache"]; 28 | Tensor* all_v_cache = output_tensors["all_v_cache"]; 29 | Tensor* layer_id = input_tensors["layer_id"]; 30 | DataType type_int = getTensorType(); 31 | LLM_CHECK_WITH_INFO(decoder_input->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 32 | LLM_CHECK_WITH_INFO(step->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 33 | LLM_CHECK_WITH_INFO(finished->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 34 | 35 | TensorMap self_attn_inputs{ 36 | {"attention_input", decoder_input}, 37 | {"layer_id", layer_id}, 38 | {"step", step}, 39 | {"finished", finished} 40 | }; 41 | TensorMap self_attn_outputs{ 42 | {"attention_output", decoder_output}, 43 | {"all_k_cache", all_k_cache}, 44 | {"all_v_cache", all_v_cache} 45 | }; 46 | 47 | for(int layer_id = 0; layer_id < num_layer; layer_id++) { 48 | //std::cout << "=============in layer " << layer_id << "==============" << "\n"; 49 | if (layer_id > 0){ 50 | TensorWrapper* layer = new TensorWrapper(Device::CPU, type_int, {1}, &layer_id); 51 | self_attn_inputs.insert("layer_id", layer); 52 | } 53 | decoder_input = self_attn_inputs["attention_input"]; 54 | launchRMSNorm(decoder_input->as(), //in&out, [bs, q_hidden_units] 55 | decoder_residual, // = rmsnorm input hidden states, as input of next add residual 56 | layerWeights[layer_id]->attn_norm_weight,//rmsnorm weights, [q_hidden_units] 57 | rmsnorm_eps); 58 | DeviceSyncAndCheckCudaError(); 59 | selfAttn->forward(self_attn_inputs, self_attn_outputs, layerWeights[layer_id]->self_attn_weight, dyn_params); 60 | launchFusedAddBiasResidualRMSNorm(decoder_residual, //in residual from tensor before rmsnorm and return decoder_residual + decoder_output, [bs, q hidden_units] 61 | decoder_output->as(), //in&out from attention output, [bs, q hidden_units] 62 | layerWeights[layer_id]->self_attn_weight.output, //bias 63 | layerWeights[layer_id]->ffn_norm_weight.gamma,//rmsnorm weights, [q hidden_units] 64 | rmsnorm_eps); 65 | DeviceSyncAndCheckCudaError(); 66 | TensorMap ffn_inputs{ 67 | {"ffn_input", 
decoder_output} 68 | }; 69 | TensorMap ffn_outputs{ 70 | {"ffn_output", decoder_output} 71 | }; 72 | ffn->forward(ffn_inputs, ffn_outputs, layerWeights[layer_id]->ffn_weight, dyn_params); 73 | launchAddResidual(decoder_residual, //in, [bs, hidden_units] 74 | decoder_output->as(), //in&out, [bs, hidden_units] 75 | true); 76 | 77 | DeviceSyncAndCheckCudaError(); 78 | self_attn_inputs.insert("attention_input", decoder_output); // for next iter 79 | } 80 | // no intermedia buffer to free, so ignore call free 81 | } 82 | 83 | template class LlamaSelfDecoder; 84 | template class LlamaSelfDecoder; 85 | -------------------------------------------------------------------------------- /tests/unittests/test_linear.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include 9 | #include "src/utils/macro.h" 10 | #include "src/kernels/linear.h" 11 | #include "src/weights/base_weights.h" 12 | 13 | void CPUlinear(float* input, float* weight, float* output, 14 | int m, int k, int n) { 15 | for(int i = 0; i < m; i++) { 16 | for(int j = 0; j < n; j++) { 17 | for(int l = 0; l < k; l++) { 18 | output[i * n + j] += input[i * k + l] * weight[l * n + j]; 19 | } 20 | } 21 | } 22 | } 23 | 24 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 25 | for(int i = 0; i < output_size; i++) { 26 | if (i < 5) { 27 | printf("0th res, CPUoutput = %f, GPUoutput = %f\n", CPUoutput[i], GPUoutput[i]); 28 | } 29 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 30 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 31 | return false; 32 | } 33 | 34 | } 35 | return true; 36 | } 37 | 38 | int main(int argc, char *argv[]) { 39 | const int seqlen = 13; 40 | const int hidden_units = 4096; 41 | const int vocab_size = 32; 42 | const int inter_size = 10; 43 | int hidden_units_2 = 0; 44 | int output_size = 0; 45 | 46 | hidden_units_2 = hidden_units * hidden_units; 47 | output_size = seqlen * hidden_units; 48 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 49 | float* h_w; 50 | float* d_w; 51 | h_w = (float*)malloc(sizeof(float) * hidden_units_2); 52 | cudaMalloc((void**)&d_w, sizeof(float) * hidden_units_2); 53 | for(int i = 0; i < hidden_units_2; i++) { 54 | h_w[i] = (float)(i % 3); // 1 2 1 2 55 | } 56 | 57 | float* h_in = (float*) malloc(sizeof(float) * hidden_units * seqlen); 58 | float* d_in; 59 | cudaMalloc((void**)&d_in, sizeof(float) * seqlen * hidden_units); 60 | for(int i = 0; i < hidden_units * seqlen; i++) { 61 | h_in[i] = (float)(i % 3); 62 | } 63 | 64 | float* h_out = (float*) malloc(sizeof(float) * output_size); 65 | float* d_out; 66 | cudaMalloc((void**)&d_out, sizeof(float) * output_size); 67 | CHECK(cudaMemcpy(d_in, h_in, sizeof(float) * hidden_units * seqlen, cudaMemcpyHostToDevice)); 68 | CHECK(cudaMemcpy(d_w, h_w, sizeof(float) * hidden_units_2, cudaMemcpyHostToDevice)); 69 | DataType type = getTensorType(); 70 | WeightType wtype = getWeightType(); 71 | TensorWrapper* in = new TensorWrapper(Device::GPU, type, {seqlen, hidden_units}, d_in); 72 | BaseWeight weight; 73 | weight.shape = {hidden_units, hidden_units}; 74 | weight.data = d_w; 75 | weight.type = wtype; 76 | TensorWrapper* out; 77 | out = new TensorWrapper(Device::GPU, type, {seqlen, hidden_units}, d_out); 78 | 
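// note: a minimal sketch of how a row-major GEMM like this one can be mapped onto column-major cuBLAS.
// The real mapping lives inside launchLinearGemm (src/kernels/linear.h) and is not shown here, so treat
// the commented call below as an illustration under that assumption, not as the repo's exact implementation.
// Row-major C[M,N] = A[M,K] * B[K,N] is computed as C^T = B^T * A^T with both operands untransposed:
//   float alpha = 1.0f, beta = 0.0f;
//   cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
//               /*m=*/hidden_units, /*n=*/seqlen, /*k=*/hidden_units,
//               &alpha,
//               d_w,  hidden_units,   // B viewed as B^T (N x K)
//               d_in, hidden_units,   // A viewed as A^T (K x M)
//               &beta,
//               d_out, hidden_units); // C^T (N x M), i.e. row-major C[M,N]
// Here M = seqlen, K = N = hidden_units, matching the square test weight above.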
cublasHandle_t cublas_handle; 79 | cublasLtHandle_t cublaslt_handle; 80 | cublasCreate(&cublas_handle); 81 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 82 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 83 | cublas_wrapper->setFP32GemmConfig(); 84 | // debug info, better to retain: 85 | std::cout << "before launch kernel" << std::endl; 86 | launchLinearGemm(in, weight, out, cublas_wrapper); 87 | // debug info, better to retain: 88 | std::cout << "after launch kernel" << std::endl; 89 | // debug info, better to retain: 90 | std::cout << "cuda memcpy device to host" << std::endl; 91 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 92 | CHECK(cudaMemcpy(h_out, d_out, sizeof(float) * output_size, cudaMemcpyDeviceToHost)); 93 | float* CPUout = (float*) malloc(sizeof(float) * output_size); 94 | CPUlinear(h_in, h_w, CPUout, seqlen, hidden_units, hidden_units); 95 | 96 | bool is_right = CheckResult(CPUout, h_out, output_size); 97 | // debug info, better to retain: 98 | std::cout << "before free" << std::endl; 99 | std::cout << "linear passed" << std::endl; 100 | free(h_in); 101 | free(h_w); 102 | free(h_out); 103 | free(CPUout); 104 | cudaFree(d_in); 105 | cudaFree(d_w); 106 | cudaFree(d_out); 107 | } -------------------------------------------------------------------------------- /src/utils/macro.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | //(RussWong) note: some macro check to assert for helping us find errors, so that we can 9 | // find the bugs faster 10 | #define CHECK(call) \ 11 | do \ 12 | { \ 13 | const cudaError_t error_code = call; \ 14 | if (error_code != cudaSuccess) \ 15 | { \ 16 | printf("CUDA Error:\n"); \ 17 | printf(" File: %s\n", __FILE__); \ 18 | printf(" Line: %d\n", __LINE__); \ 19 | printf(" Error code: %d\n", error_code); \ 20 | printf(" Error text: %s\n", \ 21 | cudaGetErrorString(error_code)); \ 22 | exit(1); \ 23 | } \ 24 | } while (0) 25 | 26 | static const char* _cudaGetErrorEnum(cudaError_t error) 27 | { 28 | return cudaGetErrorString(error); 29 | } 30 | 31 | static const char* _cudaGetErrorEnum(cublasStatus_t error) 32 | { 33 | switch (error) { 34 | case CUBLAS_STATUS_SUCCESS: 35 | return "CUBLAS_STATUS_SUCCESS"; 36 | 37 | case CUBLAS_STATUS_NOT_INITIALIZED: 38 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 39 | 40 | case CUBLAS_STATUS_ALLOC_FAILED: 41 | return "CUBLAS_STATUS_ALLOC_FAILED"; 42 | 43 | case CUBLAS_STATUS_INVALID_VALUE: 44 | return "CUBLAS_STATUS_INVALID_VALUE"; 45 | 46 | case CUBLAS_STATUS_ARCH_MISMATCH: 47 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 48 | 49 | case CUBLAS_STATUS_MAPPING_ERROR: 50 | return "CUBLAS_STATUS_MAPPING_ERROR"; 51 | 52 | case CUBLAS_STATUS_EXECUTION_FAILED: 53 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 54 | 55 | case CUBLAS_STATUS_INTERNAL_ERROR: 56 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 57 | 58 | case CUBLAS_STATUS_NOT_SUPPORTED: 59 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 60 | 61 | case CUBLAS_STATUS_LICENSE_ERROR: 62 | return "CUBLAS_STATUS_LICENSE_ERROR"; 63 | } 64 | return ""; 65 | } 66 | 67 | template 68 | void check(T result, char const* const func, const char* const file, int const line) 69 | { 70 | if (result) { 71 | throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " 72 | + file + ":" + 
std::to_string(line) + " \n"); 73 | } 74 | } 75 | 76 | #define CHECK_CUBLAS(val) check((val), #val, __FILE__, __LINE__) 77 | 78 | inline void syncAndCheck(const char* const file, int const line) 79 | { 80 | cudaDeviceSynchronize(); 81 | cudaError_t result = cudaGetLastError(); 82 | if (result) { 83 | throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " 84 | + file + ":" + std::to_string(line) + " \n"); 85 | } 86 | } 87 | 88 | #define DeviceSyncAndCheckCudaError() syncAndCheck(__FILE__, __LINE__) 89 | 90 | [[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "") 91 | { 92 | throw std::runtime_error(std::string("[oneLLM][ERROR] ") + info + " Assertion fail: " + file + ":" 93 | + std::to_string(line) + " \n"); 94 | } 95 | 96 | inline void llmAssert(bool result, const char* const file, int const line, std::string const& info = "") 97 | { 98 | if (!result) { 99 | throwRuntimeError(file, line, info); 100 | } 101 | } 102 | 103 | #define LLM_CHECK(val) llmAssert(val, __FILE__, __LINE__) 104 | #define LLM_CHECK_WITH_INFO(val, info) \ 105 | do { \ 106 | bool is_valid_val = (val); \ 107 | if (!is_valid_val) { \ 108 | llmAssert(is_valid_val, __FILE__, __LINE__, (info)); \ 109 | } \ 110 | } while (0) 111 | -------------------------------------------------------------------------------- /examples/cpp/ffn/ffn_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "src/layers/ffn/ffn.h" 6 | 7 | int main(int argc, char** argv) 8 | { 9 | int head_num = 4; 10 | int head_size = 8; 11 | int inter_size = 12; 12 | int hidden_units = head_num * head_size; 13 | 14 | cublasHandle_t cublas_handle; 15 | cublasLtHandle_t cublaslt_handle; 16 | cudaStream_t stream; 17 | cublasCreate(&cublas_handle); 18 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 19 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 20 | BaseAllocator* allocator = new CudaAllocator; 21 | 22 | LLaMAAttentionDynParams attn_dyn_params; 23 | attn_dyn_params.num_tokens = 14; 24 | std::cout << "start malloc/cudamalloc buffer" << "\n"; 25 | float* h_ffn_input = (float*) malloc(sizeof(float) * hidden_units * attn_dyn_params.num_tokens); 26 | float* d_ffn_input; 27 | cudaMalloc((void**)&d_ffn_input, sizeof(float) * hidden_units * attn_dyn_params.num_tokens); 28 | for(int i = 0; i < hidden_units * attn_dyn_params.num_tokens; i++) { 29 | h_ffn_input[i] = (float)(i % 2 + 1); 30 | } 31 | float* h_gate_up = (float*) malloc(sizeof(float) * hidden_units * 2 * inter_size); 32 | float* d_gate_up; 33 | cudaMalloc((void**)&d_gate_up, sizeof(float) * hidden_units * 2 * inter_size); 34 | for(int i = 0; i < hidden_units * 2 * inter_size; i++) { 35 | h_gate_up[i] = (float)(i % 2 + 1); 36 | } 37 | // float* h_up = (float*) malloc(sizeof(float) * hidden_units * inter_size); 38 | // float* d_up; 39 | // cudaMalloc((void**)&d_up, sizeof(float) * hidden_units * inter_size); 40 | // for(int i = 0; i < hidden_units * inter_size; i++) { 41 | // h_up[i] = 1.0f; 42 | // } 43 | float* h_down = (float*) malloc(sizeof(float) * hidden_units * inter_size); 44 | float* d_down; 45 | cudaMalloc((void**)&d_down, sizeof(float) * hidden_units * inter_size); 46 | for(int i = 0; i < hidden_units * inter_size; i++) { 47 | h_down[i] = (float)(i % 2 + 1); 48 | } 49 | float* d_ffn_output; 50 | cudaMalloc((void**)&d_ffn_output, sizeof(float) * 
attn_dyn_params.num_tokens * hidden_units); 51 | std::cout << "end malloc/cudamalloc buffer and start memcpyh2d" << "\n"; 52 | CHECK(cudaMemcpy(d_ffn_input, h_ffn_input, sizeof(float) * hidden_units * attn_dyn_params.num_tokens, cudaMemcpyHostToDevice)); 53 | CHECK(cudaMemcpy(d_gate_up, h_gate_up, sizeof(float) * hidden_units * 2 * inter_size, cudaMemcpyHostToDevice)); 54 | // CHECK(cudaMemcpy(d_up, h_up, sizeof(float) * hidden_units * inter_size, cudaMemcpyHostToDevice)); 55 | CHECK(cudaMemcpy(d_down, h_down, sizeof(float) * hidden_units * inter_size, cudaMemcpyHostToDevice)); 56 | DataType type = getTensorType(); // note: the type should be as a class data member! 57 | LLaMAFFNWeights ffn_weights; 58 | ffn_weights.gateAndup.data = d_gate_up; 59 | ffn_weights.gateAndup.shape = {2 * inter_size, hidden_units}; 60 | // ffn_weights.up.data = d_up; 61 | // ffn_weights.up.shape = {hidden_units, inter_size}; 62 | ffn_weights.down.data = d_down; 63 | ffn_weights.down.shape = {hidden_units, inter_size}; 64 | TensorWrapper* ffn_input = new TensorWrapper(GPU, 65 | type, 66 | {attn_dyn_params.num_tokens, hidden_units}, 67 | d_ffn_input); 68 | TensorWrapper* ffn_output = new TensorWrapper(GPU, 69 | type, 70 | {attn_dyn_params.num_tokens, hidden_units}, 71 | d_ffn_output); 72 | TensorMap ffn_inputs{ 73 | {"ffn_input", ffn_input} 74 | }; 75 | TensorMap ffn_outputs{ 76 | {"ffn_output", ffn_output} 77 | }; 78 | std::cout << "initializing ffn layer" << "\n"; 79 | LLaMAFFNLayer* ffn_layer = new LLaMAFFNLayer(head_num, 80 | head_size, 81 | inter_size, 82 | stream, 83 | cublas_wrapper, 84 | allocator); 85 | std::cout << "start fwd" << "\n"; 86 | ffn_layer->forward(ffn_inputs, ffn_outputs, ffn_weights, attn_dyn_params); 87 | std::cout << "end fwd" << "\n"; 88 | free(h_ffn_input); 89 | free(h_gate_up); 90 | // free(h_up); 91 | free(h_down); 92 | cudaFree(d_ffn_input); 93 | cudaFree(d_gate_up); 94 | // cudaFree(d_up); 95 | cudaFree(d_down); 96 | cudaFree(d_ffn_output); 97 | } 98 | -------------------------------------------------------------------------------- /tests/unittests/test_casual_mask.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include "src/kernels/build_casual_mask.h" 9 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 
10 | // when you are implementing LLMs inference on CPU, you can reuse the CPU kernel 11 | // we compare the kernel correctnesss by eyes and result print infos 12 | void CPUbuildCasualMask(float* mask, 13 | const int* q_lens, //input lens, shape=[batch size] 14 | const int* k_lens, //context lens, shape=[batch size] 15 | int max_q_len, 16 | int max_k_len, 17 | int batch_size) { 18 | for(int b = 0; b < batch_size; b++){ 19 | int start = b * max_q_len * max_k_len; 20 | int q = q_lens[b]; 21 | int k = k_lens[b]; 22 | for(int i = 0; i < max_q_len; i++) { 23 | for(int j = 0; j < max_k_len; j++) { 24 | if(j <= i + (k - q) && i < q && j < k) { 25 | mask[start + i * max_k_len + j] = 1.0f; 26 | } else { 27 | mask[start + i * max_k_len + j] = 0.0f; 28 | } 29 | } 30 | } 31 | } 32 | } 33 | bool CheckResult(float* CPUres, float* GPUres, const int size) { 34 | for(int i = 0; i < size; i++) { 35 | if(fabs(CPUres[i] - GPUres[i]) > 1e-6){ 36 | printf("the %dth res is wrong, CPU mask = %f, GPU mask = %f\n", i, CPUres[i], GPUres[i]); 37 | return false; 38 | } 39 | } 40 | return true; 41 | } 42 | // (RussWong)note: 43 | // `./causalmask` to test fp32 GPU build causal mask kernel 44 | int main() { 45 | const int batch_size = 1; 46 | const int max_q_len = 5; 47 | const int max_k_len = 5; 48 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 49 | const int mask_size = batch_size * max_q_len * max_k_len; 50 | int* h_q_lens; 51 | int* d_q_lens; 52 | h_q_lens = (int*)malloc(sizeof(int) * batch_size); 53 | cudaMalloc((void**)&d_q_lens, sizeof(int) * batch_size); 54 | int* h_k_lens; 55 | int* d_k_lens; 56 | h_k_lens = (int*)malloc(sizeof(int) * batch_size); 57 | cudaMalloc((void**)&d_k_lens, sizeof(int) * batch_size); 58 | 59 | float* d_mask; 60 | float* h_mask = (float*)malloc(sizeof(float) * mask_size); 61 | cudaMalloc((void**)&d_mask, sizeof(float) * mask_size); 62 | 63 | for(int i = 0; i < batch_size; i++) { 64 | h_q_lens[i] = 3; 65 | } 66 | for(int i = 0; i < batch_size; i++) { 67 | h_k_lens[i] = 3; 68 | } 69 | CHECK(cudaMemcpy(d_q_lens, h_q_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice)); 70 | CHECK(cudaMemcpy(d_k_lens, h_k_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice)); 71 | DataType type_float = getTensorType(); 72 | DataType type_int = getTensorType(); 73 | TensorWrapper* mask = new TensorWrapper(Device::GPU, 74 | type_float, 75 | {batch_size, max_q_len, max_k_len}, 76 | d_mask); 77 | TensorWrapper* q_lens = new TensorWrapper(Device::GPU, 78 | type_int, 79 | {batch_size}, 80 | d_q_lens); 81 | TensorWrapper* k_lens = new TensorWrapper(Device::GPU, 82 | type_int, 83 | {batch_size}, 84 | d_k_lens); 85 | launchBuildCausalMasks(mask, q_lens, k_lens); 86 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 87 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 88 | CHECK(cudaMemcpy(h_mask, d_mask, sizeof(float) * mask_size, cudaMemcpyDeviceToHost)); 89 | float* CPUmask = (float*)malloc(sizeof(float) * mask_size); 90 | CPUbuildCasualMask(CPUmask, h_q_lens, h_k_lens, max_q_len, max_k_len, batch_size); 91 | if (CheckResult(CPUmask, h_mask, mask_size)) { 92 | printf("test passed!\n"); 93 | } 94 | 95 | // debug info, better to retain: std::cout << "before free" << std::endl; 96 | free(h_q_lens); 97 | free(h_k_lens); 98 | free(h_mask); 99 | free(CPUmask); 100 | cudaFree(d_q_lens); 101 | cudaFree(d_k_lens); 102 | 
cudaFree(d_mask); 103 | } 104 | -------------------------------------------------------------------------------- /src/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(embeddingFunctor STATIC input_embedding.cu) 2 | set_property(TARGET embeddingFunctor PROPERTY CUDA_SEPARABLE_COMPILATION ON) 3 | set_property(TARGET embeddingFunctor PROPERTY POSITION_INDEPENDENT_CODE ON) 4 | set_property(TARGET embeddingFunctor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 5 | 6 | add_library(rmsnorm STATIC rmsnorm_kernel.cu) 7 | set_property(TARGET rmsnorm PROPERTY CUDA_SEPARABLE_COMPILATION ON) 8 | set_property(TARGET rmsnorm PROPERTY POSITION_INDEPENDENT_CODE ON) 9 | set_property(TARGET rmsnorm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 10 | 11 | add_library(cal_paddingoffset STATIC cal_paddingoffset.cu) 12 | set_property(TARGET cal_paddingoffset PROPERTY CUDA_SEPARABLE_COMPILATION ON) 13 | set_property(TARGET cal_paddingoffset PROPERTY POSITION_INDEPENDENT_CODE ON) 14 | set_property(TARGET cal_paddingoffset PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 15 | 16 | add_library(build_casual_mask STATIC build_casual_mask.cu) 17 | set_property(TARGET build_casual_mask PROPERTY CUDA_SEPARABLE_COMPILATION ON) 18 | set_property(TARGET build_casual_mask PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET build_casual_mask PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | 21 | add_library(cublasWrapper STATIC cublas_utils.cc) 22 | set_property(TARGET cublasWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) 23 | set_property(TARGET cublasWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 24 | 25 | add_library(linear STATIC linear.cu) 26 | set_property(TARGET linear PROPERTY CUDA_SEPARABLE_COMPILATION ON) 27 | set_property(TARGET linear PROPERTY POSITION_INDEPENDENT_CODE ON) 28 | set_property(TARGET linear PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | target_link_libraries(linear PUBLIC -lcudart -lcublas cublasWrapper) 30 | 31 | add_library(qkv_bias_and_rope STATIC qkv_bias_and_RoPE.cu) 32 | set_property(TARGET qkv_bias_and_rope PROPERTY CUDA_SEPARABLE_COMPILATION ON) 33 | set_property(TARGET qkv_bias_and_rope PROPERTY POSITION_INDEPENDENT_CODE ON) 34 | set_property(TARGET qkv_bias_and_rope PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 35 | 36 | add_library(concat_kv STATIC concat_past_kv.cu) 37 | set_property(TARGET concat_kv PROPERTY CUDA_SEPARABLE_COMPILATION ON) 38 | set_property(TARGET concat_kv PROPERTY POSITION_INDEPENDENT_CODE ON) 39 | set_property(TARGET concat_kv PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 40 | 41 | add_library(repeat_kv STATIC repeat_kv.cu) 42 | set_property(TARGET repeat_kv PROPERTY CUDA_SEPARABLE_COMPILATION ON) 43 | set_property(TARGET repeat_kv PROPERTY POSITION_INDEPENDENT_CODE ON) 44 | set_property(TARGET repeat_kv PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 45 | 46 | add_library(mask_softmax STATIC attn_softmax_kernel.cu) 47 | set_property(TARGET mask_softmax PROPERTY CUDA_SEPARABLE_COMPILATION ON) 48 | set_property(TARGET mask_softmax PROPERTY POSITION_INDEPENDENT_CODE ON) 49 | set_property(TARGET mask_softmax PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 50 | 51 | add_library(fused_transpose_and_remv_pad STATIC fused_transpose_and_remv_pad.cu) 52 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY CUDA_SEPARABLE_COMPILATION ON) 53 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY POSITION_INDEPENDENT_CODE ON) 54 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 55 | 56 | 
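# note: most kernel targets in this file repeat the same add_library/set_property block;
# if desired, a small helper could factor that out (illustrative sketch only, not used by this build):
# function(add_kernel_lib name src)
#   add_library(${name} STATIC ${src})
#   set_property(TARGET ${name} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
#   set_property(TARGET ${name} PROPERTY POSITION_INDEPENDENT_CODE ON)
#   set_property(TARGET ${name} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# endfunction()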
add_library(fused_addresidual_norm STATIC fused_addresidual_norm.cu) 57 | set_property(TARGET fused_addresidual_norm PROPERTY CUDA_SEPARABLE_COMPILATION ON) 58 | set_property(TARGET fused_addresidual_norm PROPERTY POSITION_INDEPENDENT_CODE ON) 59 | set_property(TARGET fused_addresidual_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 60 | 61 | add_library(act STATIC act_kernel.cu) 62 | set_property(TARGET act PROPERTY CUDA_SEPARABLE_COMPILATION ON) 63 | set_property(TARGET act PROPERTY POSITION_INDEPENDENT_CODE ON) 64 | set_property(TARGET act PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 65 | 66 | add_library(topk STATIC topK.cu) 67 | set_property(TARGET topk PROPERTY CUDA_SEPARABLE_COMPILATION ON) 68 | set_property(TARGET topk PROPERTY POSITION_INDEPENDENT_CODE ON) 69 | set_property(TARGET topk PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 70 | 71 | add_library(fused_decoder_self_attention STATIC fused_decoder_self_attention.cu) 72 | set_property(TARGET fused_decoder_self_attention PROPERTY CUDA_SEPARABLE_COMPILATION ON) 73 | set_property(TARGET fused_decoder_self_attention PROPERTY POSITION_INDEPENDENT_CODE ON) 74 | set_property(TARGET fused_decoder_self_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 75 | 76 | add_library(sampling STATIC sampling.cu) 77 | set_property(TARGET sampling PROPERTY CUDA_SEPARABLE_COMPILATION ON) 78 | set_property(TARGET sampling PROPERTY POSITION_INDEPENDENT_CODE ON) 79 | set_property(TARGET sampling PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 80 | 81 | add_library(add_residual STATIC add_residual.cu) 82 | set_property(TARGET add_residual PROPERTY CUDA_SEPARABLE_COMPILATION ON) 83 | set_property(TARGET add_residual PROPERTY POSITION_INDEPENDENT_CODE ON) 84 | set_property(TARGET add_residual PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -------------------------------------------------------------------------------- /src/weights/llama/llama_weights.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/weights/llama/llama_weights.h" 3 | template 4 | LlamaWeight::LlamaWeight( 5 | int head_num, 6 | int kv_head_num, 7 | int head_size, 8 | int inter_size, 9 | int vocab_size, 10 | int num_layer, 11 | bool attn_bias, 12 | WeightType weight_type 13 | ): 14 | hidden_units(head_num * head_size), 15 | inter_size(inter_size), 16 | vocab_size(vocab_size), 17 | vocab_size_padded(vocab_size), 18 | num_layer(num_layer), 19 | weight_type(weight_type) 20 | { 21 | llama_layer_weight.reserve(num_layer); 22 | for (int l = 0; l < num_layer; ++l) { 23 | llama_layer_weight.push_back(new LlamaLayerWeight(head_num, 24 | kv_head_num, 25 | head_size, 26 | inter_size, 27 | weight_type, 28 | //group_size, 29 | attn_bias)); 30 | } 31 | GPUMalloc(&out_rmsnorm_weight.gamma, hidden_units); 32 | GPUMalloc(&post_decoder_embedding_weight.data, vocab_size * hidden_units); 33 | GPUMalloc(&pre_decoder_embedding_weight.data, vocab_size * hidden_units); 34 | pre_decoder_embedding_weight.shape = {vocab_size, hidden_units}; 35 | post_decoder_embedding_weight.shape = {vocab_size, hidden_units}; 36 | pre_decoder_embedding_weight.type = weight_type; 37 | post_decoder_embedding_weight.type = weight_type; 38 | } 39 | // (RussWong)note: weight from HF is always half type, and if we want run fp32 inference, we should convert half weight to fp32 weight in tools/weights_convert.py 40 | // (RussWong)note: shape and data of embedding and LMHead weight downloaded form HF are transposed, so we should carefully declare shape here 41 | template 42 | void 
LlamaWeight::loadWeights(std::string weight_path) { 43 | loadWeightFromBin::internalFunc(out_rmsnorm_weight.gamma, {(size_t)hidden_units}, weight_path + "model.norm.weight.bin"); 44 | loadWeightFromBin::internalFunc(post_decoder_embedding_weight.data, {(size_t)vocab_size, (size_t)hidden_units}, weight_path + "lm_head.weight.bin"); 45 | loadWeightFromBin::internalFunc(pre_decoder_embedding_weight.data, {(size_t)vocab_size, (size_t)hidden_units}, weight_path + "model.embed_tokens.weight.bin"); 46 | for (int layer = 0; layer < num_layer; ++layer) { 47 | llama_layer_weight[layer]->loadWeights(weight_path + "model.layers." + std::to_string(layer), weight_type); 48 | } 49 | } 50 | 51 | template 52 | void LlamaWeight::loadWeightsFromDummy() { 53 | T* d_dummy_out_rmsnorm_weight_gamma; 54 | T* d_dummy_post_decoder_embedding_weight; 55 | T* d_dummy_pre_decoder_embedding_weight; 56 | GPUMalloc(&d_dummy_out_rmsnorm_weight_gamma, sizeof(T) * hidden_units); 57 | GPUMalloc(&d_dummy_post_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size); 58 | GPUMalloc(&d_dummy_pre_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size); 59 | T* h_dummy_out_rmsnorm_weight_gamma = (T*)malloc(sizeof(T) * hidden_units); 60 | T* h_dummy_post_decoder_embedding_weight = (T*)malloc(sizeof(T) * hidden_units * vocab_size); 61 | T* h_dummy_pre_decoder_embedding_weight = (T*)malloc(sizeof(T) * hidden_units * vocab_size); 62 | for (int i = 0; i < hidden_units; i++){ 63 | h_dummy_out_rmsnorm_weight_gamma[i] = (T)1.0f; 64 | } 65 | for (int i = 0; i < hidden_units * vocab_size; i++) { 66 | h_dummy_post_decoder_embedding_weight[i] = (T)1.0f; 67 | h_dummy_pre_decoder_embedding_weight[i] = (T)1.0f; 68 | } 69 | cudaMemcpy(d_dummy_out_rmsnorm_weight_gamma, h_dummy_out_rmsnorm_weight_gamma, sizeof(T) * hidden_units, cudaMemcpyHostToDevice); 70 | cudaMemcpy(d_dummy_post_decoder_embedding_weight, h_dummy_post_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size, cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_dummy_pre_decoder_embedding_weight, h_dummy_pre_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size, cudaMemcpyHostToDevice); 72 | 73 | out_rmsnorm_weight.gamma = d_dummy_out_rmsnorm_weight_gamma; 74 | post_decoder_embedding_weight.data = d_dummy_post_decoder_embedding_weight; 75 | pre_decoder_embedding_weight.data = d_dummy_pre_decoder_embedding_weight; 76 | for (int layer = 0; layer < num_layer; ++layer) { 77 | llama_layer_weight[layer]->loadWeights(); 78 | } 79 | } 80 | 81 | template 82 | LlamaWeight::~LlamaWeight() 83 | { 84 | cudaFree(pre_decoder_embedding_weight.data); 85 | cudaFree(out_rmsnorm_weight.gamma); 86 | cudaFree(post_decoder_embedding_weight.data); 87 | 88 | for (auto& p : llama_layer_weight) { 89 | delete p; 90 | } 91 | } 92 | // template instantial required in linking time 93 | template struct LlamaWeight; 94 | template struct LlamaWeight; 95 | -------------------------------------------------------------------------------- /tests/unittests/test_topk.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/topK.h" 10 | // (RussWong)note: 11 | // there is no top k cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_topk` to test fp32 GPU kernel 14 | int main() { 15 | const int batch_size = 
1; 16 | const int vocab_size = 30000; 17 | const int beamwidth = 2; 18 | const int K = 5; 19 | const int BlockPerBeam = 8; 20 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 21 | const int probs_size = batch_size * vocab_size * beamwidth; 22 | float* h_probs; 23 | float *d_probs; 24 | h_probs = (float*)malloc(sizeof(float) * probs_size); 25 | cudaMalloc((void**)&d_probs, sizeof(float) * probs_size); 26 | 27 | int topK_val_buf_size = batch_size * beamwidth * BlockPerBeam * K; 28 | int topK_ids_buf_size = batch_size * beamwidth * BlockPerBeam * K; 29 | int final_topK_val_buf_size = batch_size * beamwidth * K; // sampling topK buf size, beamsearch topK size = [batch_size * beam_width * beam_width] 30 | 31 | 32 | int *d_tmp_topk_ids; 33 | cudaMalloc((void**)&d_tmp_topk_ids, sizeof(int) * topK_ids_buf_size); 34 | 35 | float *d_tmp_topk_vals; 36 | cudaMalloc((void**)&d_tmp_topk_vals, sizeof(float) * topK_val_buf_size); 37 | 38 | int* h_final_topk_ids; 39 | int *d_final_topk_ids; 40 | h_final_topk_ids = (int*)malloc(sizeof(int) * final_topK_val_buf_size); 41 | cudaMalloc((void**)&d_final_topk_ids, sizeof(int) * final_topK_val_buf_size); 42 | 43 | float* h_final_topk_vals; 44 | float *d_final_topk_vals; 45 | h_final_topk_vals = (float*)malloc(sizeof(float) * final_topK_val_buf_size); 46 | cudaMalloc((void**)&d_final_topk_vals, sizeof(float) * final_topK_val_buf_size); 47 | 48 | for(int i = 0; i < probs_size; i++) { // 0-59999 49 | h_probs[i] = i; 50 | } 51 | cudaMemcpy(d_probs, h_probs, sizeof(float)*probs_size, cudaMemcpyHostToDevice); 52 | 53 | DataType type_float = getTensorType(); 54 | DataType type_int = getTensorType(); 55 | TensorWrapper* probs_tensor = new TensorWrapper(Device::GPU, 56 | type_float, 57 | {batch_size * beamwidth, vocab_size}, 58 | d_probs); 59 | TensorWrapper *tmp_topk_ids = new TensorWrapper(Device::GPU, 60 | type_int, 61 | {batch_size, beamwidth, BlockPerBeam, K}, 62 | d_tmp_topk_ids); 63 | TensorWrapper* tmp_topk_vals = new TensorWrapper(Device::GPU, 64 | type_float, 65 | {batch_size, beamwidth, BlockPerBeam, K}, 66 | d_tmp_topk_vals); 67 | TensorWrapper *final_topk_ids = new TensorWrapper(Device::GPU, 68 | type_int, 69 | {batch_size * beamwidth, K}, 70 | d_final_topk_ids); 71 | TensorWrapper *final_topk_vals = new TensorWrapper(Device::GPU, 72 | type_float, 73 | {batch_size * beamwidth, K}, 74 | d_final_topk_vals); 75 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 76 | launchTopKforBeamSearch(probs_tensor, tmp_topk_ids, tmp_topk_vals, final_topk_ids, final_topk_vals); 77 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 78 | cudaMemcpy(h_final_topk_ids, d_final_topk_ids, sizeof(int) * final_topK_val_buf_size, cudaMemcpyDeviceToHost); 79 | cudaMemcpy(h_final_topk_vals, d_final_topk_vals, sizeof(float) * final_topK_val_buf_size, cudaMemcpyDeviceToHost); 80 | for(int i = 0; i < final_topK_val_buf_size; i++) { 81 | int id = h_final_topk_ids[i]; 82 | printf("topK id = %d\n", id); 83 | float val = h_final_topk_vals[i]; 84 | printf("topK val =%f\n", val); 85 | } 86 | // debug info, better to retain: std::cout << "before free" << std::endl; 87 | free(h_probs); 88 | free(h_final_topk_ids); 89 | free(h_final_topk_vals); 90 | cudaFree(d_probs); 91 | cudaFree(d_final_topk_ids); 92 | cudaFree(d_final_topk_vals); 93 | cudaFree(d_tmp_topk_ids); 94 | cudaFree(d_tmp_topk_vals); 95 | } 96 | 
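// note: the check above is done by eye only. If a CPU reference for the final [bs * beamwidth, K]
// output is wanted, a minimal unoptimized helper could look like the sketch below (added here for
// illustration, not part of the original test); it assumes the row-major [bs * beamwidth, vocab_size]
// probs layout used above and needs <algorithm>, <utility> and <vector>.
static void CPUtopK(const float* probs, int rows, int vocab_size, int K,
                    std::vector<int>& ids, std::vector<float>& vals) {
    ids.assign(rows * K, 0);
    vals.assign(rows * K, 0.0f);
    for (int r = 0; r < rows; r++) {
        // gather (value, vocab id) pairs for this row, then keep the K largest values in descending order
        std::vector<std::pair<float, int>> row(vocab_size);
        for (int v = 0; v < vocab_size; v++) {
            row[v] = std::make_pair(probs[r * vocab_size + v], v);
        }
        std::partial_sort(row.begin(), row.begin() + K, row.end(),
                          [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                              return a.first > b.first;
                          });
        for (int k = 0; k < K; k++) {
            vals[r * K + k] = row[k].first;
            ids[r * K + k] = row[k].second; // vocab-local id; the GPU kernel also keeps ids local to each row
        }
    }
}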
-------------------------------------------------------------------------------- /tests/unittests/test_residual.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/add_residual.h" 10 | 11 | #include 12 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 13 | // But when you are implementing LLMs inference on CPU, I dont recommend to reuse the CPU kernel, because its performance is bad 14 | // `./test_residual` to test fp32 GPU kernel 15 | #define CHECK(call) \ 16 | do \ 17 | { \ 18 | const cudaError_t error_code = call; \ 19 | if (error_code != cudaSuccess) \ 20 | { \ 21 | printf("CUDA Error:\n"); \ 22 | printf(" File: %s\n", __FILE__); \ 23 | printf(" Line: %d\n", __LINE__); \ 24 | printf(" Error code: %d\n", error_code); \ 25 | printf(" Error text: %s\n", \ 26 | cudaGetErrorString(error_code)); \ 27 | exit(1); \ 28 | } \ 29 | } while (0) 30 | 31 | void CPUresidual(float* h_residual, float* h_decoder_out, int hidden_units, int num_tokens) { 32 | for(int b = 0; b < num_tokens; b++) { 33 | for (int i = 0; i < hidden_units; i++) { 34 | h_decoder_out[b * hidden_units + i] += h_residual[b * hidden_units + i]; 35 | } 36 | } 37 | } 38 | 39 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 40 | for(int i = 0; i < output_size; i++) { 41 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 42 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 43 | return false; 44 | } 45 | 46 | } 47 | return true; 48 | } 49 | 50 | int main() { 51 | const int num_tokens = 16; 52 | const int hidden_units = 4096; 53 | const int total_size = num_tokens * hidden_units; 54 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 55 | float* h_residual; 56 | float* d_residual; 57 | h_residual = (float*)malloc(sizeof(float) * total_size); 58 | cudaMalloc((void**)&d_residual, sizeof(float) * total_size); 59 | for(int i = 0; i < total_size; i++) { 60 | h_residual[i] = (float)(i % 2 + 1); 61 | } 62 | 63 | float* h_decoder_out = (float*) malloc(sizeof(float) * total_size); 64 | float* decoder_out = (float*) malloc(sizeof(float) * total_size); 65 | float* d_decoder_out; 66 | cudaMalloc((void**)&d_decoder_out, sizeof(float) * total_size); 67 | for(int i = 0; i < total_size; i++) { 68 | h_decoder_out[i] = (float)(i % 2 + 1); 69 | } 70 | 71 | CHECK(cudaMemcpy(d_residual, h_residual, sizeof(float) * total_size, cudaMemcpyHostToDevice)); 72 | CHECK(cudaMemcpy(d_decoder_out, h_decoder_out, sizeof(float) * total_size, cudaMemcpyHostToDevice)); 73 | DataType type_float = getTensorType(); 74 | TensorWrapper* decoder_out_tensor = new TensorWrapper(Device::GPU, 75 | type_float, 76 | {num_tokens, hidden_units}, 77 | d_decoder_out); 78 | TensorWrapper* residual_tensor = new TensorWrapper(Device::GPU, 79 | type_float, 80 | {num_tokens, hidden_units}, 81 | d_residual); 82 | // debug info, better to retain: 83 | std::cout << "before launch kernel" << std::endl; 84 | launchAddResidual(residual_tensor, decoder_out_tensor); 85 | // debug info, better to retain: 86 | std::cout << "after launch kernel" << std::endl; 87 | // debug info, better to retain: 88 | std::cout << "cuda memcpy device to host" << std::endl; 89 | // Note: remember to memcpy from device to host and 
define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 90 | CHECK(cudaMemcpy(decoder_out, d_decoder_out, sizeof(float) * total_size, cudaMemcpyDeviceToHost)); 91 | float* CPUout = (float*) malloc(sizeof(float) * total_size); 92 | for(int i = 0; i < total_size; i++){ 93 | CPUout[i] = (float)(i % 2 + 1); 94 | } 95 | CPUresidual(h_residual, CPUout, hidden_units, num_tokens); 96 | bool is_right = CheckResult(CPUout, decoder_out, total_size); 97 | // debug info, better to retain: 98 | std::cout << "before free" << std::endl; 99 | std::cout << "AddResidual kernel passed" << std::endl; 100 | free(h_residual); 101 | free(h_decoder_out); 102 | free(CPUout); 103 | free(decoder_out); 104 | cudaFree(d_residual); 105 | cudaFree(d_decoder_out); 106 | } 107 | -------------------------------------------------------------------------------- /src/kernels/cublas_utils.cc: -------------------------------------------------------------------------------- 1 | #include "cublas_utils.h" 2 | #include 3 | // (RussWong) notes:cublas gemm和stridedbatchgemm调库的写法,比较固定 4 | cublasWrapper::cublasWrapper(cublasHandle_t cublas_handle, 5 | cublasLtHandle_t cublaslt_handle): 6 | cublas_handle_(cublas_handle), 7 | cublaslt_handle_(cublaslt_handle) 8 | { 9 | } 10 | 11 | cublasWrapper::~cublasWrapper() 12 | { 13 | } 14 | // invoked in model example main function after initialize cublas wrapper 15 | void cublasWrapper::setFP32GemmConfig() 16 | { 17 | Atype_ = CUDA_R_32F; 18 | Btype_ = CUDA_R_32F; 19 | Ctype_ = CUDA_R_32F; 20 | computeType_ = CUDA_R_32F; 21 | } 22 | 23 | void cublasWrapper::setFP16GemmConfig() 24 | { 25 | Atype_ = CUDA_R_16F; 26 | Btype_ = CUDA_R_16F; 27 | Ctype_ = CUDA_R_16F; 28 | computeType_ = CUDA_R_32F; 29 | } 30 | 31 | //fp32 gemm and fp16 gemm 32 | void cublasWrapper::Gemm(cublasOperation_t transa, 33 | cublasOperation_t transb, 34 | const int m, 35 | const int n, 36 | const int k, 37 | const void* A, 38 | const int lda, 39 | const void* B, 40 | const int ldb, 41 | void* C, 42 | const int ldc, 43 | float f_alpha = 1.0f, 44 | float f_beta = 0.0f) 45 | { 46 | half h_alpha = (half)(f_alpha); 47 | half h_beta = (half)(f_beta); 48 | int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; //之前是CUDA_R_16F 49 | const void* alpha = is_fp16_computeType ? reinterpret_cast(&(h_alpha)) : reinterpret_cast(&f_alpha); 50 | const void* beta = is_fp16_computeType ? reinterpret_cast(&(h_beta)) : reinterpret_cast(&f_beta); 51 | CHECK_CUBLAS(cublasGemmEx(cublas_handle_, 52 | transa, 53 | transb, 54 | m, 55 | n, 56 | k, 57 | alpha, 58 | A, 59 | Atype_, 60 | lda, 61 | B, 62 | Btype_, 63 | ldb, 64 | beta, 65 | C, 66 | Ctype_, 67 | ldc, 68 | computeType_, 69 | CUBLAS_GEMM_DEFAULT)); 70 | } 71 | 72 | void cublasWrapper::stridedBatchedGemm(cublasOperation_t transa, 73 | cublasOperation_t transb, 74 | const int m, 75 | const int n, 76 | const int k, 77 | const void* A, 78 | const int lda, 79 | const int64_t strideA, 80 | const void* B, 81 | const int ldb, 82 | const int64_t strideB, 83 | void* C, 84 | const int ldc, 85 | const int64_t strideC, 86 | const int batchCount, 87 | float f_alpha = 1.0f, 88 | float f_beta = 0.0f) 89 | { 90 | int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; 91 | const void* alpha = 92 | is_fp16_computeType ? reinterpret_cast(&(f_alpha)) : reinterpret_cast(&f_alpha); 93 | const void* beta = is_fp16_computeType ? 
reinterpret_cast(&(f_beta)) : reinterpret_cast(&f_beta); 94 | CHECK_CUBLAS(cublasGemmStridedBatchedEx(cublas_handle_, 95 | transa, 96 | transb, 97 | m, 98 | n, 99 | k, 100 | alpha, 101 | A, 102 | Atype_, 103 | lda, 104 | strideA, 105 | B, 106 | Btype_, 107 | ldb, 108 | strideB, 109 | beta, 110 | C, 111 | Ctype_, 112 | ldc, 113 | strideC, 114 | batchCount, 115 | computeType_, 116 | CUBLAS_GEMM_DEFAULT)); 117 | } 118 | -------------------------------------------------------------------------------- /tests/unittests/test_concat_kv.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/concat_past_kv.h" 10 | // (RussWong)note: 11 | // there is no concat kv cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_concat_kv` to test fp32 GPU kernel 14 | int main() 15 | { 16 | const int batch_size = 1; 17 | const int max_q_len = 16; 18 | const int max_seq_len = 32; 19 | const int head_size = 8; 20 | const int kv_head_num = 2; 21 | const int kv_size = 1 * batch_size * max_q_len * kv_head_num * head_size; 22 | const int layer_offset = 1 * batch_size * max_seq_len * kv_head_num * head_size; 23 | const int kvcache_size = layer_offset; 24 | // (RussWong)note: we plan to place layer id on CPU 25 | // const int layer_id = 0; 26 | 27 | float *h_k_src; 28 | float *d_k_src; 29 | h_k_src = (float *)malloc(sizeof(float) * kv_size); 30 | cudaMalloc((void **)&d_k_src, sizeof(float) * kv_size); 31 | 32 | float *h_v_src; 33 | float *d_v_src; 34 | h_v_src = (float *)malloc(sizeof(float) * kv_size); 35 | cudaMalloc((void **)&d_v_src, sizeof(float) * kv_size); 36 | 37 | int *cur_query_length = (int *)malloc(sizeof(int) * batch_size); 38 | int *history_length = (int *)malloc(sizeof(int) * batch_size); 39 | int *dcur_query_length; 40 | int *dhistory_length; 41 | cudaMalloc((void **)&dcur_query_length, sizeof(int) * batch_size); 42 | cudaMalloc((void **)&dhistory_length, sizeof(int) * batch_size); 43 | 44 | float *h_k_dst = (float *)malloc(sizeof(float) * kvcache_size); 45 | float *h_v_dst = (float *)malloc(sizeof(float) * kvcache_size); 46 | float *d_k_dst; 47 | float *d_v_dst; 48 | cudaMalloc((void **)&d_k_dst, sizeof(float) * kvcache_size); 49 | cudaMalloc((void **)&d_v_dst, sizeof(float) * kvcache_size); 50 | float *kv_scale; 51 | cudaMalloc((void **)&kv_scale, sizeof(float)); 52 | int *h_layer_id = (int *)malloc(sizeof(int) * batch_size); 53 | // (RussWong)note: we plan to place layer id on CPU 54 | // int *d_layer_id; 55 | // cudaMalloc((void **)&d_layer_id, sizeof(int) * batch_size); 56 | 57 | for (int i = 0; i < kv_size; i++) 58 | { 59 | h_k_src[i] = 1.0f; 60 | h_v_src[i] = 1.0f; 61 | } 62 | for (int i = 0; i < batch_size; i++) 63 | { 64 | cur_query_length[i] = 16; 65 | history_length[i] = 1; 66 | h_layer_id[i] = 0; 67 | } 68 | cudaMemcpy(d_v_src, h_v_src, sizeof(float) * kv_size, cudaMemcpyHostToDevice); 69 | cudaMemcpy(d_k_src, h_k_src, sizeof(float) * kv_size, cudaMemcpyHostToDevice); 70 | cudaMemcpy(dcur_query_length, cur_query_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 71 | cudaMemcpy(dhistory_length, history_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 72 | // cudaMemcpy(d_layer_id, h_layer_id, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 73 | 74 | DataType type = getTensorType(); 
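// note (assumption, for checking the printed values by hand): with the
// [batch_size, kv_head_num, max_seq_len, head_size] layout declared for out_kdst/out_vdst below,
// the element written for batch b, kv head h, new token s and dim d is expected at offset
//   ((b * kv_head_num + h) * max_seq_len + history_length[b] + s) * head_size + d,
// which is what the offset2index/index2offset helper mentioned near the print loop would encode.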
75 | DataType type_int = getTensorType(); 76 | TensorWrapper *in_ksrc = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_q_len, head_size}, d_k_src); 77 | TensorWrapper *in_vsrc = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_q_len, head_size}, d_v_src); 78 | TensorWrapper *layer_id = new TensorWrapper(Device::CPU, type_int, {batch_size}, h_layer_id); 79 | TensorWrapper *cur_q_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, dcur_query_length); 80 | TensorWrapper *history_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, dhistory_length); 81 | TensorWrapper *out_kdst = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_seq_len, head_size}, d_k_dst); 82 | TensorWrapper *out_vdst = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_seq_len, head_size}, d_v_dst); 83 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 84 | launchConcatKVCache(in_ksrc, in_vsrc, layer_id, cur_q_len, history_len, out_kdst, out_vdst); 85 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 86 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 87 | cudaMemcpy(h_v_dst, d_v_dst, sizeof(float) * kvcache_size, cudaMemcpyDeviceToHost); 88 | cudaMemcpy(h_k_dst, d_k_dst, sizeof(float) * kvcache_size, cudaMemcpyDeviceToHost); 89 | // debug info, better to retain: std::cout << "cuda memcpy device to host" << std::endl; 90 | // note: need to add offset2index and index2offset API to help us program and check result 91 | for (int i = batch_size * (1) * kv_head_num * head_size; i < batch_size * max_seq_len * kv_head_num * head_size; i++) 92 | { 93 | printf("index = %d\n", i); 94 | printf("res k = %f\n", h_k_dst[i]); 95 | // debug info, better to retain: printf("topK id = %d\n", id); 96 | printf("res v = %f\n", h_v_dst[i]); 97 | printf("===============\n"); 98 | // debug info, better to retain: printf("topK val =%f\n", val); 99 | } 100 | // debug info, better to retain: std::cout << "before free" << std::endl; 101 | free(h_k_src); 102 | free(h_v_src); 103 | free(h_k_dst); 104 | free(h_v_dst); 105 | free(cur_query_length); 106 | free(history_length); 107 | free(h_layer_id); 108 | cudaFree(d_k_src); 109 | cudaFree(d_v_src); 110 | cudaFree(d_k_dst); 111 | cudaFree(d_v_dst); 112 | cudaFree(dcur_query_length); 113 | cudaFree(dhistory_length); 114 | cudaFree(kv_scale); 115 | } 116 | -------------------------------------------------------------------------------- /src/kernels/repeat_kv.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/repeat_kv.h" 2 | #include "src/utils/cuda_debug_utils.cuh" 3 | #include 4 | // if MQA or GQA, we should use this transpose to broadcast kv head num to q head num 5 | //[num layers, bs, kv head num, max_seq_len, head size]=>[bs, q head num, max_k_len, head size] 6 | // context_length.shape=[bs] 7 | // bugs1: when k_dst.shape = [1,32,13,128],现在这个k_dst以13*128为单位循环第一个13*128的值 8 | // solu1: launcher函数里面获取kv cache的shape出错,需要仔细核对各个TensorWrapper的shape再通过正确索引获取 9 | template 10 | __global__ void repeat_value_cache(T *v_dst, 11 | const T *v_src, 12 | const size_t layer_offset, 13 | const int head_num, 14 | const int q_head_per_kv, 15 | const int head_size, 16 | const int *context_length, 17 | const int max_k_len, 18 | const int max_seq_len) 19 | { 20 | const int batch_id = blockIdx.y; 21 | const int head_id = 
blockIdx.z; 22 | 23 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 24 | 25 | const auto val_src = v_src + layer_offset; 26 | const auto val_dst = v_dst; 27 | 28 | const auto seq_len = context_length[batch_id]; 29 | 30 | const int v_head_size_id = idx % head_size; 31 | const int v_seq_len_id = idx / head_size; 32 | // only fetch context_length( 49 | void launchRepeatKVCache(TensorWrapper *k_cache_src, //{num_layers, batch_size, kv_head_num, max_seq_len, head_size} 50 | TensorWrapper *v_cache_src, //{num_layers, batch_size, kv_head_num, max_seq_len, head_size} 51 | TensorWrapper *context_length, 52 | TensorWrapper *layer_id, 53 | TensorWrapper *k_cache_dst, //{batch_size, head_num, max_k_len, head_size} 54 | TensorWrapper *v_cache_dst) 55 | { 56 | int batch_size = context_length->shape[0]; 57 | int kv_head_num = k_cache_src->shape[2]; // (RussWong)note: we should carefully access the shape value, corresponding to the place where tensorwapper is defined 58 | int max_seq_len = k_cache_src->shape[3]; 59 | int head_num = k_cache_dst->shape[1]; 60 | 61 | int max_k_len = k_cache_dst->shape[2]; 62 | int head_size = k_cache_dst->shape[3]; 63 | int layer = layer_id->getVal(); 64 | // (RussWong)note: if layer id is on GPU, here MUSTN'T use layer_id->getVal(), because we cant access GPU memory directly by [] if data is on GPU 65 | // (RussWong)note: so we can make layer data locate on CPU, so that we can access data by [] 66 | size_t layer_offset = layer * batch_size * kv_head_num * max_seq_len * head_size; 67 | int q_head_per_kv = head_num / kv_head_num; 68 | int blockSize = 128; 69 | dim3 block(blockSize); 70 | dim3 grid((max_k_len * head_size + blockSize - 1) / blockSize, batch_size, head_num); 71 | repeat_value_cache<<>>(v_cache_dst->data, 72 | v_cache_src->data, 73 | layer_offset, 74 | head_num, 75 | q_head_per_kv, 76 | head_size, 77 | context_length->data, 78 | max_k_len, 79 | max_seq_len); 80 | 81 | repeat_value_cache<<>>(k_cache_dst->data, 82 | k_cache_src->data, 83 | layer_offset, 84 | head_num, 85 | q_head_per_kv, 86 | head_size, 87 | context_length->data, 88 | max_k_len, 89 | max_seq_len); 90 | #ifdef PRINT_DATA 91 | printf("repeat kv kernel top2 result:\n"); 92 | print_data<<<1, 1>>>(k_cache_dst->data); 93 | #else 94 | #endif 95 | } 96 | 97 | template void launchRepeatKVCache(TensorWrapper *k_cache_src, 98 | TensorWrapper *v_cache_src, 99 | TensorWrapper *context_length, 100 | TensorWrapper *layer_id, 101 | TensorWrapper *k_cache_dst, 102 | TensorWrapper *v_cache_dst); 103 | template void launchRepeatKVCache(TensorWrapper *k_cache_src, 104 | TensorWrapper *v_cache_src, 105 | TensorWrapper *context_length, 106 | TensorWrapper *layer_id, 107 | TensorWrapper *k_cache_dst, 108 | TensorWrapper *v_cache_dst); 109 | -------------------------------------------------------------------------------- /tests/unittests/test_bmm.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include "src/utils/macro.h" 9 | #include "src/kernels/linear.h" 10 | #include "src/weights/base_weights.h" 11 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 
12 | // But when you are implementing LLMs inference on CPU, I dont recommend to reuse the CPU kernel, because its performance is bad 13 | void CPUlinear(float* input, float* weight, float* output, 14 | int m, int k, int n, int batch) { 15 | for(int b = 0; b < batch; b++) { 16 | for(int i = 0; i < m; i++) { 17 | for(int j = 0; j < n; j++) { 18 | for(int l = 0; l < k; l++) { 19 | output[b * m * n + i * n + j] += input[b * m * k + i * k + l] * weight[b * k * n + l * n + j]; 20 | } 21 | } 22 | } 23 | } 24 | } 25 | 26 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 27 | for(int i = 0; i < output_size; i++) { 28 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 29 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 30 | return false; 31 | } 32 | } 33 | return true; 34 | } 35 | // (RussWong)note: 36 | // `./bmm 1` to test fp32 GPU batch matmul with trans_b = true 37 | // `./bmm` to test fp32 GPU batch matmul with trans_b = false 38 | int main(int argc, char *argv[]) { 39 | const int batch_size = 1; 40 | const int seqlen_in = 16; 41 | const int seqlen_w = 16; 42 | const int hidden_units = 4096; 43 | const int head_num = 32; 44 | const int head_size = 128; 45 | int in_size = 0; 46 | int w_size = 0; 47 | int output_size = 0; 48 | if (argv[1]) {// enable trans_b for test lmhead linear 49 | in_size = batch_size * head_num * seqlen_in * head_size; // q 50 | w_size = batch_size * head_num * seqlen_w * head_size; // k 51 | output_size = batch_size * head_num * seqlen_in * seqlen_w; //q k 52 | } else { 53 | in_size = batch_size * head_num * seqlen_in * seqlen_w; //qk 54 | w_size = batch_size * head_num * seqlen_w * head_size; // v 55 | output_size = batch_size * head_num * seqlen_in * head_size; 56 | } 57 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 58 | float* h_w; 59 | float* d_w; 60 | h_w = (float*)malloc(sizeof(float) * w_size); 61 | cudaMalloc((void**)&d_w, sizeof(float) * w_size); 62 | for(int i = 0; i < w_size; i++) { 63 | h_w[i] = (float)(i % 2 + 1); 64 | //h_w[i] = 1.0f; // simple data 65 | } 66 | 67 | float* h_in = (float*) malloc(sizeof(float) * in_size); 68 | float* d_in; 69 | cudaMalloc((void**)&d_in, sizeof(float) * in_size); 70 | for(int i = 0; i < in_size; i++) { 71 | h_in[i] = (float)(i % 2 + 1); 72 | //h_in[i] = 1.0f; // simple data 73 | } 74 | 75 | float* h_out = (float*) malloc(sizeof(float) * output_size); 76 | float* d_out; 77 | cudaMalloc((void**)&d_out, sizeof(float) * output_size); 78 | 79 | CHECK(cudaMemcpy(d_in, h_in, sizeof(float) * in_size, cudaMemcpyHostToDevice)); 80 | CHECK(cudaMemcpy(d_w, h_w, sizeof(float) * w_size, cudaMemcpyHostToDevice)); 81 | DataType type = getTensorType(); 82 | WeightType wtype = getWeightType(); 83 | TensorWrapper* in; 84 | if (argv[1]) {// enable trans_b for test qk*v 85 | in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, head_size}, d_in); 86 | } else {// disable trans_b for test q*k 87 | in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, seqlen_w}, d_in); 88 | } 89 | TensorWrapper* weight = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_w, head_size}, d_w); 90 | TensorWrapper* out; 91 | if (argv[1]) {// enable trans_b for test qk*v 92 | out = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, seqlen_w}, d_out); 93 | } else {// disable trans_b for test q*k 94 | out = new TensorWrapper(Device::GPU, type, 
{batch_size, head_num, seqlen_in, head_size}, d_out); 95 | } 96 | cublasHandle_t cublas_handle; 97 | cublasLtHandle_t cublaslt_handle; 98 | cublasCreate(&cublas_handle); 99 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 100 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 101 | cublas_wrapper->setFP32GemmConfig(); 102 | // debug info, better to retain: 103 | std::cout << "before launch kernel" << std::endl; 104 | if (argv[1]) {// enable trans_b for test qk*v 105 | launchLinearStridedBatchGemm(in, weight, out, cublas_wrapper, false, true); 106 | } else {// disable trans_b for test q*k 107 | launchLinearStridedBatchGemm(in, weight, out, cublas_wrapper); 108 | } 109 | // debug info, better to retain: 110 | std::cout << "after launch kernel" << std::endl; 111 | // debug info, better to retain: 112 | std::cout << "cuda memcpy device to host" << std::endl; 113 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 114 | CHECK(cudaMemcpy(h_out, d_out, sizeof(float) * output_size, cudaMemcpyDeviceToHost)); 115 | float* CPUout = (float*) malloc(sizeof(float) * output_size); 116 | if (argv[1]) {// enable trans_b for ttest qk*v 117 | CPUlinear(h_in, h_w, CPUout, seqlen_in, head_size, seqlen_w, batch_size * head_num); 118 | } else {// disable trans_b for test q*k 119 | CPUlinear(h_in, h_w, CPUout, seqlen_in, seqlen_w, head_size, batch_size * head_num); 120 | } 121 | 122 | bool is_right = CheckResult(CPUout, h_out, output_size); 123 | // debug info, better to retain: 124 | std::cout << "before free" << std::endl; 125 | std::cout << "linear passed" << std::endl; 126 | free(h_in); 127 | free(h_w); 128 | free(h_out); 129 | free(CPUout); 130 | cudaFree(d_in); 131 | cudaFree(d_w); 132 | cudaFree(d_out); 133 | } 134 | -------------------------------------------------------------------------------- /src/utils/weight_utils.cu: -------------------------------------------------------------------------------- 1 | #include "src/utils/weight_utils.h" 2 | 3 | template 4 | inline __device__ T_OUT type_cast(T_IN val) { 5 | return val; 6 | } 7 | template<> 8 | inline __device__ float type_cast(half val) { 9 | return __half2float(val); 10 | } 11 | 12 | template<> 13 | inline __device__ half type_cast(float val) { 14 | return __float2half(val); 15 | } 16 | 17 | template 18 | void GPUMalloc(T** ptr, size_t size) 19 | { 20 | LLM_CHECK_WITH_INFO(size >= ((size_t)0), "Ask cudaMalloc size " + std::to_string(size) + "< 0 is invalid."); 21 | CHECK(cudaMalloc((void**)(ptr), sizeof(T) * size)); 22 | } 23 | template void GPUMalloc(float** ptr, size_t size); 24 | template void GPUMalloc(half** ptr, size_t size); 25 | 26 | template 27 | void GPUFree(T* ptr) 28 | { 29 | if (ptr != NULL) { 30 | CHECK(cudaFree(ptr)); 31 | ptr = NULL; 32 | } 33 | } 34 | template void GPUFree(float* ptr); 35 | template void GPUFree(half* ptr); 36 | 37 | template 38 | void cudaH2Dcpy(T* tgt, const T* src, const size_t size) 39 | { 40 | CHECK(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); 41 | } 42 | 43 | template void cudaH2Dcpy(float* tgt, const float* src, const size_t size); 44 | template void cudaH2Dcpy(half* tgt, const half* src, const size_t size); 45 | 46 | template 47 | __global__ void type_conversion(T_OUT* dst, const T_IN* src, const int size) 48 | { 49 | int gtid = threadIdx.x + blockIdx.x * blockDim.x; 50 | int total_thread_nums = blockDim.x * gridDim.x; 51 | for (int index = gtid; index < size; index 
+= total_thread_nums) { 52 | dst[index] = type_cast(src[index]); 53 | } 54 | } 55 | 56 | template 57 | void cuda_type_conversion(T_OUT* dst, const T_IN* src, const int size) 58 | { 59 | dim3 grid(128); 60 | dim3 block(128); 61 | type_conversion<<>>(dst, src, size); 62 | } 63 | 64 | template void cuda_type_conversion(float* dst, const half* src, const int size); 65 | template void cuda_type_conversion(half* dst, const float* src, const int size); 66 | 67 | // from FT code 68 | // loads data from binary file. If it succeeds, returns a non-empty (shape size) vector. If loading fails or 69 | // the product of the elements in shape is 0, this function will return an empty vector. 70 | template 71 | std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) 72 | { 73 | if (shape.size() > 2) { 74 | printf("[ERROR] shape should have less than two dims \n"); 75 | return std::vector(); 76 | } 77 | size_t dim0 = shape[0], dim1 = 1; 78 | if (shape.size() == 2) { 79 | dim1 = shape[1]; 80 | } 81 | size_t size = dim0 * dim1; 82 | if (size == 0) { 83 | std::cout << "shape is zero, skip loading weight from file: " << filename << std::endl; 84 | return std::vector(); 85 | } 86 | 87 | std::vector host_array(size); 88 | std::ifstream in(filename, std::ios::in | std::ios::binary); 89 | if (!in.is_open()) { 90 | std::cout << "file" << filename << "cannot be opened, loading model fails!" << std::endl; 91 | return std::vector(); 92 | } 93 | 94 | size_t loaded_data_size = sizeof(T) * size; 95 | in.seekg(0, in.end); 96 | in.seekg(0, in.beg); 97 | 98 | std::cout << "Read " << std::to_string(loaded_data_size) << " bytes from " << filename << std::endl; 99 | in.read((char*)host_array.data(), loaded_data_size); 100 | 101 | size_t in_get_size = in.gcount(); 102 | if (in_get_size != loaded_data_size) { 103 | return std::vector(); 104 | } 105 | in.close(); 106 | // If we succeed, return an array with values. 
107 | return host_array; 108 | } 109 | 110 | template 111 | struct loadWeightFromBin 112 | { 113 | public: 114 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename) { 115 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 116 | if (host_array.empty()) { 117 | return; 118 | } 119 | 120 | cudaH2Dcpy(ptr, host_array.data(), host_array.size()); 121 | return; 122 | } 123 | }; 124 | 125 | template 126 | struct loadWeightFromBin 127 | { 128 | public: 129 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename) { 130 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 131 | if (host_array.empty()) { 132 | return; 133 | } 134 | 135 | T_FILE* ptr_tmp; 136 | GPUMalloc(&ptr_tmp, host_array.size()); 137 | cudaH2Dcpy(ptr_tmp, host_array.data(), host_array.size()); 138 | cuda_type_conversion(ptr, ptr_tmp, host_array.size()); 139 | GPUFree(ptr_tmp); 140 | return; 141 | } 142 | }; 143 | 144 | // !!(wrong case)C++委员会规定:函数模板不支持模板偏特化 145 | // template 146 | // typename std::enable_if::value, int>::type loadWeightFromBin(T_OUT* ptr, std::vector shape, std::string filename) 147 | // { 148 | // std::vector host_array = loadWeightFromBinHelper(shape, filename); 149 | 150 | // if (host_array.empty()) { 151 | // return 0; 152 | // } 153 | 154 | // cudaH2Dcpy(ptr, host_array.data(), host_array.size()); 155 | // return 0; 156 | // } 157 | 158 | // template 159 | // typename std::enable_if::value, int>::type loadWeightFromBin(T_OUT* ptr, std::vector shape, std::string filename) 160 | // { 161 | // std::vector host_array = loadWeightFromBinHelper(shape, filename); 162 | 163 | // if (host_array.empty()) { 164 | // return 0; 165 | // } 166 | 167 | 168 | // T_FILE* ptr_tmp; 169 | // GPUMalloc(&ptr_tmp, host_array.size()); 170 | // cudaH2Dcpy(ptr_tmp, host_array.data(), host_array.size()); 171 | // cuda_type_conversion(ptr, ptr_tmp, host_array.size()); 172 | // GPUFree(ptr_tmp); 173 | // return 0; 174 | // } 175 | 176 | template struct loadWeightFromBin; 177 | template struct loadWeightFromBin; 178 | template struct loadWeightFromBin; 179 | template struct loadWeightFromBin; 180 | -------------------------------------------------------------------------------- /tests/unittests/test_mask_softmax.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/attn_softmax_kernel.h" 10 | // (RussWong)note: 11 | // there is no cpu kernel implementation now, and if you bought my CUDA lesson, you can find CPU softmax kernel. 
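// (hedged reference, not taken from the GPU kernel) a CPU check, if added, would compute for each
// (batch b, head h, query row i) a numerically stable masked softmax over the k_length axis, e.g.
//   x_j = mask[b][i][j] > 0 ? scale * qk[b][h][i][j] : -large
//   score[b][h][i][j] = exp(x_j - max_j x_j) / sum_j exp(x_j - max_j x_j)
// the exact masking convention (multiplicative vs. additive -inf) should follow attn_softmax_kernel.cu.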
12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_mask_softmax 1` to test half GPU kernel 14 | // `./test_mask_softmax` to test fp32 GPU kernel 15 | #define TEST_MASKED_SOFTMAX(dtype) \ 16 | dtype *h_qk; \ 17 | dtype *d_qk; \ 18 | h_qk = (dtype *)malloc(sizeof(dtype) * qk_size); \ 19 | cudaMalloc((void **)&d_qk, sizeof(dtype) * qk_size); \ 20 | dtype *h_score; \ 21 | dtype *d_score; \ 22 | h_score = (dtype *)malloc(sizeof(dtype) * qk_size); \ 23 | cudaMalloc((void **)&d_score, sizeof(dtype) * qk_size); \ 24 | dtype *h_mask; \ 25 | dtype *d_mask; \ 26 | h_mask = (dtype *)malloc(sizeof(dtype) * batch_size * q_length * k_length); \ 27 | cudaMalloc((void **)&d_mask, sizeof(dtype) * batch_size * q_length * k_length); \ 28 | for (int i = 0; i < qk_size; i++) \ 29 | { \ 30 | h_qk[i] = i % 8; \ 31 | } \ 32 | for (int i = 0; i < batch_size * q_length * k_length; i++) \ 33 | { \ 34 | h_mask[i] = (dtype)(1); \ 35 | } \ 36 | cudaMemcpy(d_qk, h_qk, sizeof(dtype) * qk_size, cudaMemcpyHostToDevice); \ 37 | cudaMemcpy(d_mask, h_mask, sizeof(dtype) * batch_size * q_length * k_length, cudaMemcpyHostToDevice); \ 38 | DataType type = getTensorType(); \ 39 | TensorWrapper *qk = new TensorWrapper(Device::GPU, type, {batch_size, head_num, q_length, k_length}, d_qk); \ 40 | TensorWrapper *mask = new TensorWrapper(Device::GPU, type, {batch_size, q_length, k_length}, d_mask); \ 41 | TensorWrapper *score = new TensorWrapper(Device::GPU, type, {batch_size, head_num, q_length, k_length}, d_score); \ 42 | std::cout << "before launch softmax kernel" << std::endl; \ 43 | launchScaleMaskAndSoftmax(qk, mask, score, scale); \ 44 | std::cout << "after launch softmax kernel" << std::endl; \ 45 | std::cout << "cuda memcpy device to host" << std::endl; \ 46 | cudaMemcpy(h_score, score->data, sizeof(dtype) * qk_size, cudaMemcpyDeviceToHost); \ 47 | for (int i = 0; i < qk_size; i++) \ 48 | { \ 49 | printf("attn score[%d] = %f\n", i, (float)h_score[i]); \ 50 | } \ 51 | free(h_qk); \ 52 | free(h_score); \ 53 | free(h_mask); \ 54 | cudaFree(d_qk); \ 55 | cudaFree(d_score); \ 56 | cudaFree(d_mask); 57 | 58 | int main(int argc, char *argv[]) 59 | { 60 | const int batch_size = 1; 61 | const int head_num = 2; 62 | const int q_length = 8; 63 | const int k_length = 8; 64 | const int head_size = 4; 65 | float scale = rsqrtf(float(head_size)); 66 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 67 | const int qk_size = batch_size * head_num * q_length * k_length; 68 | if (argv[1]) 69 | { 70 | TEST_MASKED_SOFTMAX(half); 71 | } 72 | else 73 | { 74 | TEST_MASKED_SOFTMAX(float); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/kernels/topK.cu: -------------------------------------------------------------------------------- 1 | #include //FLT_MIN 2 | #include 3 | #include 4 | #include "src/kernels/topK.h" 5 | #include 6 | 7 | // Note: a b两个topK reduce输出一个topK 8 | template 9 | __device__ topK reduce_functor(const topK& a, const topK& b) { 10 | topK res = a; 11 | for(int i = 0; i < K; i++){ 12 | res.insertHeap(b.val[i], b.id[i]); 13 | } 14 | return res; 15 | } 16 | // gridsize:bs * beamwidth * BlockPerBeam 17 | // blocksize: 256 18 | // shape infer: [bs, beamwidth, vocab size] => [bs, beamwidth, BlockPerBeam, K] 19 | template 20 | __global__ void topK_kernel_round1(const T* probs, const int vocab_size, 21 | int* topK_ids, T* topK_vals) 22 | { 23 | typedef cub::BlockReduce, 
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage temp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int row_id = bid / BlockPerBeam;
    int block_lane = bid % BlockPerBeam;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for(int data_id = tid + block_lane * blockSize; data_id < vocab_size; data_id += BlockPerBeam * blockSize){
        int data_offset = data_id + row_id * vocab_size;
        T data = probs[data_offset];
        // thread_topK.insertHeap(data, data_offset); // bug: the id should be local within each bs*bw row;
        // with this line and bsxbm = 2, the 2nd row's topK ids would come out as global offsets such as
        // 59999, 59998, ..., whereas within that row they should be local ids such as 29999, 29998, ...
        thread_topK.insertHeap(data, data_id);
    }
    // block local reduce
    topK<T, K> block_topK = blockreduce(temp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if(tid == 0){
        for(int k_offset = 0; k_offset < K; k_offset++) {
            // topK_vals[row_id * vocab_size + block_lane * blockSize + k_offset] = block_topK.val[k_offset]; // bug
            topK_vals[row_id * BlockPerBeam * K + block_lane * K + k_offset] = block_topK.val[k_offset];
            topK_ids[row_id * BlockPerBeam * K + block_lane * K + k_offset] = block_topK.id[k_offset]; // the output offset must be computed from the output buffer's shape
        }
    }
}
// shape infer: [bs, beamwidth, BlockPerBeam, K] => [bs, beamwidth, K]
// the ids are global word ids within beam width * vocab size
// gridSize = bs
// blockSize = 256
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round2(const int* topK_ids, const T* topK_vals,
                                   int* final_topK_ids, T* final_topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage temp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int row_id = bid;
    topK<T, K> thread_topK;
    thread_topK.init(); // initialize before inserting, as in round 1
    // thread local reduce
    for(int i = tid; i < BlockPerBeam * K; i += blockDim.x) {
        int data_offset = bid * BlockPerBeam * K + i;
        thread_topK.insertHeap(topK_vals[data_offset], topK_ids[data_offset]); // read the id with the same row offset as the value
    }
    // block reduce
    topK<T, K> block_topK = blockreduce(temp_storage).Reduce(thread_topK, reduce_functor<T, K>);
    if(tid == 0){
        for(int k_offset = 0; k_offset < K; k_offset++) {
            // topK_vals[row_id * vocab_size + block_lane * blockSize + k_offset] = block_topK.val[k_offset]; // bug
            final_topK_vals[bid * K + k_offset] = block_topK.val[k_offset];
            final_topK_ids[bid * K + k_offset] = block_topK.id[k_offset];
        }
    }
}

template <typename T>
void launchTopKforBeamSearch(TensorWrapper<T> *probs,
                             TensorWrapper<int> *topk_ids,
                             TensorWrapper<T> *topk_vals,
                             TensorWrapper<int> *final_topk_ids,
                             TensorWrapper<T> *final_topk_vals)
{
    // support both beamsearch and sampling topK by folding beamwidth into batchsize; with bsxbm = bs * bw, the probs shape is [bs*bw, vocabsize]
    int bsxbm = probs->shape[0];
    int vocab_size = probs->shape[1];
    constexpr int BlockPerBeam = 8;
    constexpr int beamwidth = 1;
    constexpr int K = 5;
    // buffer size
    int topK_val_buf_size = bsxbm * BlockPerBeam * K;
    int topK_ids_buf_size = bsxbm * BlockPerBeam * K;
    int final_topK_val_buf_size = bsxbm * K;

    T* topK_vals = topk_vals->data;
    int* topK_ids = topk_ids->data;
    T* final_topK_vals = final_topk_vals->data;
    int* final_topK_ids = final_topk_ids->data;
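    // The intermediate buffers hold the [bsxbm, BlockPerBeam, K] candidates produced by round 1;
    // round 2 then reduces each row's BlockPerBeam * K candidates down to the final [bsxbm, K] ids/vals.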
    // prepare launch
    // TODO: add a GPUConfig API to easily query the GPU config, e.g. the max block num
    // GPUConfig config;
    // int maxBlockNums = config.getMaxBlockNums();
    // TODO: allocate block nums more flexibly according to the shape
    // constexpr int BlockPerBeam = 8;
    int maxBlockNums = 1024;
    int BlockNums1 = std::min(bsxbm * BlockPerBeam, maxBlockNums);
    int BlockNums2 = std::min(bsxbm, maxBlockNums);
    dim3 grid_round1(BlockNums1);
    dim3 block_round1(256);
    dim3 grid_round2(BlockNums2);
    dim3 block_round2(256);
    // debug info, better to retain: std::cout << "in cu file, before launch" << std::endl;
    topK_kernel_round1<T, K, 256, BlockPerBeam>
        <<<grid_round1, block_round1>>>(probs->data, vocab_size, topK_ids, topK_vals);
    topK_kernel_round2<T, K, 256, BlockPerBeam>
        <<<grid_round2, block_round2>>>(topK_ids, topK_vals, final_topK_ids, final_topK_vals);
    // debug info, better to retain: std::cout << "in cu file, after launch" << std::endl;
}

template void launchTopKforBeamSearch(TensorWrapper<float> *probs,
                                      TensorWrapper<int> *topk_ids,
                                      TensorWrapper<float> *topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<float> *final_topk_vals);

template void launchTopKforBeamSearch(TensorWrapper<half> *probs,
                                      TensorWrapper<int> *topk_ids,
                                      TensorWrapper<half> *topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<half> *final_topk_vals);
--------------------------------------------------------------------------------
/src/kernels/topK_bk.cu:
--------------------------------------------------------------------------------
#include <float.h> // FLT_MIN
#include <cuda.h>
#include "src/kernels/topK.h"
#include <cub/cub.cuh>

// Note: reduce two topK structs (a and b) into a single topK
template <typename T, int K>
__device__ topK<T, K> reduce_functor(const topK<T, K> &a, const topK<T, K> &b)
{
    topK<T, K> res = a;
    for (int i = 0; i < K; i++) {
        res.insertHeap(b.val[i], b.id[i]);
    }
    return res;
}
// gridsize: bs * beam_width * BlockPerBeam
// blocksize: 256
// shape infer: [bs, beam_width, vocab size] => [bs, beam_width, BlockPerBeam, K]; out of the vocab_size entries, each of the BlockPerBeam blocks selects its own top K
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round1(const T *probs, const int vocab_size,
                                   int *topK_ids, T *topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage tmp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int row_id = bid / BlockPerBeam;
    int block_lane = bid % BlockPerBeam;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for (int data_id = tid + block_lane * blockSize; data_id < vocab_size; data_id += BlockPerBeam * blockSize) {
        int data_offset = data_id + row_id * vocab_size;
        T data = probs[data_offset];
        thread_topK.insertHeap(data, data_offset);
        // if (bid == 1 && data_id < 10) {
        //     printf("ROUND1, 1st block, top1 vals = %f, top1 id = %d\n", data, data_offset);
        // }
    }
    // typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    // __shared__ typename blockreduce::TempStorage tmp_storage;
    topK<T, K> block_topk = blockreduce(tmp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if (tid == 0) {
        for (int k_offset = 0; k_offset < K; k_offset++) {
            int dst_offset = row_id * BlockPerBeam * K + block_lane * K + k_offset;
            topK_vals[dst_offset] = block_topk.val[k_offset];
            topK_ids[dst_offset] = block_topk.id[k_offset];
        }
    }

}
// shape infer: [bs, beam_width, BlockPerBeam, K] => [bs, beam_width, K]; this is the sampling topK (=> [bs, beam_width, K] would be the beamsearch topK); note: write a dedicated beamsearch topK later
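// e.g. with bs * beam_width = 2, BlockPerBeam = 8 and K = 5, round 1 leaves 8 * 5 = 40 candidate
// (val, id) pairs per row, and each round-2 block below reduces those 40 pairs to the final 5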
// gridSize = bs
// blockSize = 256
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round2(const int *topK_ids, const T *topK_vals,
                                   int *final_topK_ids, T *final_topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage tmp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int row_id = bid;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for (int data_id = tid; data_id < BlockPerBeam * K; data_id += blockSize) {
        int data_offset = data_id + bid * BlockPerBeam * K;

        thread_topK.insertHeap(topK_vals[data_offset], topK_ids[data_offset]);
        // if (bid == 0 && data_id == 0) {
        //     printf("ROUND2, 1st block, top1 vals = %f, top1 id = %d\n", topK_vals[data_offset], topK_ids[data_offset]);
        // }
    }

    // typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    // __shared__ typename blockreduce::TempStorage tmp_storage;
    topK<T, K> block_topk = blockreduce(tmp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if (tid == 0) {
        // int beam_id = (blockDim.x * blockIdx.x + tid) / BlockPerBeam / K;
        for (int k_offset = 0; k_offset < K; k_offset++) {
            int dst_offset = bid * K + k_offset;
            final_topK_vals[dst_offset] = block_topk.val[k_offset];
            final_topK_ids[dst_offset] = block_topk.id[k_offset];
        }
    }
}

template <typename T>
void launchTopKforBeamSearch(TensorWrapper<T> *probs,
                             // TensorWrapper<T>* topk_workspace
                             TensorWrapper<int> *tmp_topk_ids,
                             TensorWrapper<T> *tmp_topk_vals,
                             TensorWrapper<int> *final_topk_ids,
                             TensorWrapper<T> *final_topk_vals)
{
    int batch_size = probs->shape[0];
    int vocab_size = probs->shape[1];
    constexpr int BlockPerBeam = 8;
    constexpr int beam_width = 1;
    constexpr int K = 5;
    // buffer size
    // int topK_val_buf_size = batch_size * beam_width * BlockPerBeam * beam_width;
    // int topK_ids_buf_size = batch_size * beam_width * BlockPerBeam * beam_width;
    // int final_topK_val_buf_size = batch_size * beam_width; // sampling topK buf size; beamsearch topK size = [batch_size * beam_width * beam_width]
    // memory plan
    T *topK_vals = tmp_topk_vals->data;         // topK_val_buf_size
    int *topK_ids = tmp_topk_ids->data;         // topK_ids_buf_size
    T *final_topK_vals = final_topk_vals->data; // final_topK_val_buf_size
    int *final_topK_ids = final_topk_ids->data; // final_topK_val_buf_size
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int maxBlockNums = deviceProp.maxGridSize[0];
    int BlockNums1 = std::min(batch_size * beam_width * BlockPerBeam, maxBlockNums);
    int BlockNums2 = std::min(batch_size * beam_width, maxBlockNums);
    dim3 grid_round1(BlockNums1);
    dim3 block_round1(256);
    dim3 grid_round2(BlockNums2);
    dim3 block_round2(256);
    // debug info, better to retain: std::cout << "in cu file, before launch" << std::endl;
    topK_kernel_round1<T, K, 256, BlockPerBeam>
        <<<grid_round1, block_round1>>>(probs->data, vocab_size, topK_ids, topK_vals);
    topK_kernel_round2<T, K, 256, BlockPerBeam>
        <<<grid_round2, block_round2>>>(topK_ids, topK_vals, final_topK_ids, final_topK_vals);
    // debug info, better to retain: std::cout << "in cu file, after launch" << std::endl;
}
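// Hypothetical usage sketch (not code from this file): assuming probs is a TensorWrapper<float> of
// shape [bs * beam_width, vocab_size] and the four output wrappers were allocated with the buffer
// sizes noted above, a caller would simply do
//     launchTopKforBeamSearch(probs, tmp_topk_ids, tmp_topk_vals, final_topk_ids, final_topk_vals);
// The explicit instantiations below are what make that call link for float and half.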
template void launchTopKforBeamSearch(TensorWrapper<float> *probs,
                                      TensorWrapper<int> *tmp_topk_ids,
                                      TensorWrapper<float> *tmp_topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<float> *final_topk_vals);
template void launchTopKforBeamSearch(TensorWrapper<half> *probs,
                                      TensorWrapper<int> *tmp_topk_ids,
                                      TensorWrapper<half> *tmp_topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<half> *final_topk_vals);
--------------------------------------------------------------------------------
/src/kernels/rmsnorm_kernel.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "src/utils/cuda_debug_utils.cuh"
#include "src/kernels/rmsnorm_kernel.h"
// bugs1: the 2nd warpReduceSum returned 0, because with blockDim.x < 32, blockDim.x / 32 = 0
// bugs2: the output buffer values were the same as before the call, because we never actually wrote to the output address
// bugs3: only the output buffer's first 32 values were right; with vectorized access a row holds hidden_units / vec_size elements, so the row stride used to move the pointer must be chosen carefully
// bugs4: remember to add __syncthreads() in the fp32/fp16 kernels, otherwise the result is wrong, e.g. some outputs come back as 0
template <typename T>
__device__ T warpReduceSum(T val){
    for(int i = 32 / 2; i > 0; i >>= 1){
        val += __shfl_xor_sync(0xffffffff, val, i);
    }
    return val; // with the xor butterfly pattern every lane of the warp ends up holding the full sum
}
// note!!! when blocksize < 32, using blockDim.x / 32 for the warp count is wrong; we must round it up instead
template <typename T>
__device__ T blockReduceSum(T val){
    int tid = threadIdx.x;
    int wid = tid / 32;
    int laneid = tid % 32;
    int warpnum = (blockDim.x + 31) / 32;
    static __shared__ T warpsum[64];
    val = warpReduceSum<T>(val);
    if(laneid == 0){
        warpsum[wid] = val;
    }
    __syncthreads();
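    // only the first warpnum slots of warpsum are valid; threads past that contribute 0, so one more
    // warp-level reduction over these partial sums yields the whole block's total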
    T sum = tid < warpnum ? warpsum[tid] : (T)0;
    sum = warpReduceSum<T>(sum); // the 0th thread ends up owning the block sum; no extra shuffle/broadcast is needed here
    return sum;
}
// 1. this kernel is used at the beginning of every decoder layer and after the last of the 32 decoder layers
// 2. the thread count is chosen assuming the hidden size is divisible by 4 (fp32) and 2 (fp16)
template <typename T>
__global__ void RMSNorm(T* decoder_out, // [num tokens, q_hidden_units]
                        T* decoder_residual,
                        T* scale, // [q_hidden_units], RMSNorm weights
                        float eps, // RMSNorm eps
                        int num_tokens,
                        int hidden_units){
    int vec_size = Vec<T>::size;
    using Vec_t = typename Vec<T>::Type;
    float thread_sum = 0.0f;
    Vec_t* dout = reinterpret_cast<Vec_t*>(decoder_out + blockIdx.x * hidden_units);
    Vec_t* rsd;
    rsd = reinterpret_cast<Vec_t*>(decoder_residual + blockIdx.x * hidden_units);
    for (int idx = threadIdx.x; idx < hidden_units / vec_size; idx += blockDim.x) {
        Vec_t vec = dout[idx];
        rsd[idx] = vec;
        thread_sum += vec.x * vec.x;
        thread_sum += vec.y * vec.y;
        thread_sum += vec.z * vec.z;
        thread_sum += vec.w * vec.w;
    }
    thread_sum = blockReduceSum<float>(thread_sum);
    __shared__ float inv_mean;
    if (threadIdx.x == 0) {
        inv_mean = rsqrtf((float)thread_sum / hidden_units + eps);
    }
    __syncthreads();
    Vec_t* s = reinterpret_cast<Vec_t*>(scale);
    for (int idx = threadIdx.x; idx < hidden_units / vec_size; idx += blockDim.x) {
        Vec_t out = dout[idx]; // note: the pointer offset is already in units of vec_size elements
        dout[idx].x = out.x * inv_mean * s[idx].x;
        dout[idx].y = out.y * inv_mean * s[idx].y;
        dout[idx].z = out.z * inv_mean * s[idx].z;
        dout[idx].w = out.w * inv_mean * s[idx].w;
    }
}

template <>
__global__ void RMSNorm(half* decoder_out, // [num tokens, q_hidden_units]
                        half* decoder_residual,
                        half* scale, // [q_hidden_units], RMSNorm weights
                        float eps, // RMSNorm eps
                        int num_tokens,
                        int hidden_units){
    int vec_size = Vec<half>::size;
    using Vec_t = typename Vec<half>::Type;
    int batch_id = blockIdx.x;
    int tid = threadIdx.x;
    Vec_t* s;
    Vec_t* dout = reinterpret_cast<Vec_t*>(decoder_out + batch_id * hidden_units);
    Vec_t* rsd;
    if (decoder_residual != nullptr) {
        rsd = reinterpret_cast<Vec_t*>(decoder_residual + batch_id * hidden_units);
    }
    float thread_accm = 0.0f;
    for(int i = tid; i < hidden_units / vec_size; i += blockDim.x) {
        Vec_t out = dout[i]; // note: the pointer offset is already in units of vec_size elements
        if (decoder_residual != nullptr) {
            rsd[i] = out;
        }
        thread_accm += __half2float(out.x) * __half2float(out.x);
        thread_accm += __half2float(out.y) * __half2float(out.y);
    } // x^2

    // mean(x^2)
    float blocksum = blockReduceSum<float>(thread_accm);
    __shared__ float inv_fenmu;
    if(tid == 0){
        inv_fenmu = rsqrtf(float(blocksum / hidden_units) + eps);
    }
    __syncthreads();
    // rmsnorm
    s = reinterpret_cast<Vec_t*>(scale);
    for(int i = tid; i < hidden_units / vec_size; i += blockDim.x) {
        Vec_t dout_h2 = dout[i];
        dout[i].x = s[i].x * __float2half(__half2float(dout_h2.x) * inv_fenmu);
        dout[i].y = s[i].y * __float2half(__half2float(dout_h2.y) * inv_fenmu);
    }
}

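// The generic kernel above assumes Vec<T>::Type is a 4-wide vector with x/y/z/w members (float4 for
// fp32), while the half specialization assumes a 2-wide Vec<half>::Type (half2), which is why it only
// accumulates and writes the x and y components.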
template <typename T>
void launchRMSNorm(TensorWrapper<T>* decoder_out, // [num tokens, hidden_units]
                   TensorWrapper<T>* decoder_residual,
                   LayerNormWeight<T>& attn_norm_weight, // RMSNorm weights
                   float eps, // RMSNorm eps
                   bool is_last // for printing the last rmsnorm output when debugging
                   )
{
    int num_tokens = decoder_out->shape[0];
    int hidden_units = decoder_out->shape[1];
    int vec_size = Vec<T>::size;
    int num_threads = hidden_units / 4; // vec size; assumes the hidden size is divisible by 4 and 2
    T* rsd = decoder_residual->data;
    dim3 grid(num_tokens);
    dim3 block(num_threads);
    RMSNorm<T><<<grid, block>>>(decoder_out->data,
                                rsd,
                                attn_norm_weight.gamma,
                                eps,
                                num_tokens,
                                hidden_units);
#ifdef PRINT_DATA
    printf("rmsnorm kernel top2 result:\n");
    print_data<<<1, 1>>>(decoder_out->data);
#else
#endif
}

template void launchRMSNorm(TensorWrapper<float>* decoder_out, // [num tokens, hidden_units]
                            TensorWrapper<float>* decoder_residual,
                            LayerNormWeight<float>& attn_norm_weight, // RMSNorm weights
                            float eps, // RMSNorm eps
                            bool is_last);
template void launchRMSNorm(TensorWrapper<half>* decoder_out, // [num tokens, hidden_units]
                            TensorWrapper<half>* decoder_residual,
                            LayerNormWeight<half>& attn_norm_weight, // RMSNorm weights
                            float eps, // RMSNorm eps
                            bool is_last);
--------------------------------------------------------------------------------
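// Illustrative usage sketch (not a file from this repository): one way to drive launchRMSNorm from
// host code, mirroring how the unit tests above construct TensorWrapper objects. The function name
// rmsnorm_usage_sketch, the buffer names and the eps value 1e-6f are hypothetical, and it is assumed
// (as attn_norm_weight.gamma in the launcher suggests) that LayerNormWeight exposes a public gamma
// pointer. A real caller must fill the device buffers with actual activations and weights first.
#include <cuda_runtime.h>
#include "src/kernels/rmsnorm_kernel.h"

int rmsnorm_usage_sketch()
{
    const int num_tokens = 4;
    const int hidden_units = 4096;
    float *d_out, *d_rsd, *d_gamma;
    cudaMalloc((void**)&d_out, sizeof(float) * num_tokens * hidden_units);
    cudaMalloc((void**)&d_rsd, sizeof(float) * num_tokens * hidden_units);
    cudaMalloc((void**)&d_gamma, sizeof(float) * hidden_units);
    // ... copy real decoder activations into d_out and RMSNorm weights into d_gamma here ...
    DataType type = getTensorType<float>();
    TensorWrapper<float>* decoder_out = new TensorWrapper<float>(Device::GPU, type, {num_tokens, hidden_units}, d_out);
    TensorWrapper<float>* decoder_residual = new TensorWrapper<float>(Device::GPU, type, {num_tokens, hidden_units}, d_rsd);
    LayerNormWeight<float> norm_weight;
    norm_weight.gamma = d_gamma;
    // normalize in place: decoder_out is overwritten, the pre-norm input is kept in decoder_residual
    launchRMSNorm(decoder_out, decoder_residual, norm_weight, 1e-6f, /*is_last*/ false);
    cudaDeviceSynchronize();
    delete decoder_out;
    delete decoder_residual;
    cudaFree(d_out);
    cudaFree(d_rsd);
    cudaFree(d_gamma);
    return 0;
}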