├── src ├── models │ ├── common_params.h │ ├── CMakeLists.txt │ ├── llama │ │ └── llama_params.h │ └── basemodel.h ├── weights │ ├── CMakeLists.txt │ ├── llama │ │ ├── norm_weights.h │ │ ├── embedding_weights.h │ │ ├── CMakeLists.txt │ │ ├── ffn_weights.h │ │ ├── attention_weights.h │ │ ├── layer_weights.h │ │ ├── llama_weights.h │ │ └── llama_weights.cc │ ├── weight.h │ └── base_weights.h ├── layers │ ├── CMakeLists.txt │ ├── ffn │ │ ├── CMakeLists.txt │ │ ├── ffn.h │ │ └── ffn.cpp │ ├── attention │ │ ├── CMakeLists.txt │ │ ├── masked_self_attention.h │ │ ├── context_attention.h │ │ └── masked_self_attention.cpp │ └── decoder │ │ ├── CMakeLists.txt │ │ ├── self_decoder.h │ │ ├── context_decoder.h │ │ └── self_decoder.cpp ├── CMakeLists.txt ├── kernels │ ├── act_kernel.h │ ├── build_casual_mask.h │ ├── input_embedding.h │ ├── cal_paddingoffset.h │ ├── fused_transpose_and_remv_pad.h │ ├── attn_softmax_kernel.h │ ├── add_residual.h │ ├── repeat_kv.h │ ├── sampling.h │ ├── rmsnorm_kernel.h │ ├── fused_addresidual_norm.h │ ├── concat_past_kv.h │ ├── fused_decoder_self_attention.h │ ├── linear.h │ ├── topK_bk.h │ ├── qkv_bias_and_RoPE.h │ ├── topK.h │ ├── build_casual_mask.cu │ ├── cal_paddingoffset.cu │ ├── input_embedding.cu │ ├── act_kernel.cu │ ├── cublas_utils.h │ ├── add_residual.cu │ ├── fused_transpose_and_remv_pad.cu │ ├── sampling.cu │ ├── CMakeLists.txt │ ├── cublas_utils.cc │ ├── repeat_kv.cu │ ├── topK.cu │ ├── topK_bk.cu │ └── rmsnorm_kernel.cu ├── utils │ ├── CMakeLists.txt │ ├── params.h │ ├── weight_utils.h │ ├── vectorize_utils.h │ ├── cuda_debug_utils.cuh │ ├── string_utils.h │ ├── model_utils.h │ ├── debug_utils.h │ ├── macro.h │ └── weight_utils.cu └── memory │ └── allocator │ └── base_allocator.h ├── tests ├── CMakeLists.txt └── unittests │ ├── test_fused_trans_remv_pad.cu │ ├── test_cal_paddingoffset.cu │ ├── test_act.cu │ ├── CMakeLists.txt │ ├── test_repeat_kv.cu │ ├── test_data_compare.cu │ ├── test_linear.cu │ ├── test_casual_mask.cu │ ├── test_topk.cu │ ├── test_residual.cu │ ├── test_concat_kv.cu │ ├── test_bmm.cu │ └── test_mask_softmax.cu ├── examples ├── CMakeLists.txt ├── cpp │ ├── CMakeLists.txt │ ├── ffn │ │ ├── CMakeLists.txt │ │ └── ffn_example.cpp │ ├── decoder │ │ └── CMakeLists.txt │ └── attention │ │ └── CMakeLists.txt └── README.md ├── tools ├── 1.png ├── HF_llama_run_script.py └── README.md ├── llama2-7b-tokenizer.bin ├── README.md ├── user_entry.cpp └── CMakeLists.txt /src/models/common_params.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(unittests) -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(cpp) 2 | -------------------------------------------------------------------------------- /src/weights/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(llama) 2 | -------------------------------------------------------------------------------- /tools/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RussWong/LLM-engineering/HEAD/tools/1.png -------------------------------------------------------------------------------- 
/llama2-7b-tokenizer.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RussWong/LLM-engineering/HEAD/llama2-7b-tokenizer.bin -------------------------------------------------------------------------------- /src/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(ffn) 2 | add_subdirectory(attention) 3 | add_subdirectory(decoder) 4 | -------------------------------------------------------------------------------- /examples/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(attention) 2 | add_subdirectory(ffn) 3 | add_subdirectory(decoder) 4 | -------------------------------------------------------------------------------- /src/weights/llama/norm_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | template<typename T> 3 | struct LayerNormWeight { 4 | T* gamma; 5 | }; -------------------------------------------------------------------------------- /src/weights/weight.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <string> 3 | struct Weight { 4 | virtual void loadWeights(std::string weight_path) = 0; 5 | }; 6 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(weights) 2 | add_subdirectory(kernels) 3 | add_subdirectory(layers) 4 | add_subdirectory(utils) 5 | add_subdirectory(models) 6 | -------------------------------------------------------------------------------- /src/weights/llama/embedding_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template<typename T> 4 | struct EmbeddingWeight: public BaseWeight<T> { 5 | }; 6 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## note 2 | 1. the examples folder provides functionality (smoke) checks for the attention, FFN, decoder, and other layers; it does not verify numerical accuracy 3 | 2. the examples are not guaranteed to run as expected; compile them and give them a try
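3. every example driver generally follows the same boilerplate: create a CUDA stream, a `cublasWrapper`, and an allocator, construct the layer, then fill a `TensorMap` and call `forward`. A minimal sketch of that boilerplate for the FFN example is below; the `CudaAllocator` constructor, the llama2-7b-like sizes, and the omitted `TensorMap`/weight setup are illustrative assumptions, so treat `ffn_example.cpp` as the authoritative version.

```cpp
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cublasLt.h>
#include "src/layers/ffn/ffn.h"
#include "src/memory/allocator/cuda_allocator.h" // assumed to declare CudaAllocator

int main() {
    // runtime objects shared by every layer in this repo
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasHandle_t cublas_handle;
    cublasLtHandle_t cublaslt_handle;
    cublasCreate(&cublas_handle);
    cublasLtCreate(&cublaslt_handle);
    cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle);
    cublas_wrapper->setFP32GemmConfig();
    BaseAllocator* allocator = new CudaAllocator; // assumption: default-constructible

    // llama2-7b-like sizes, purely illustrative
    LLaMAFFNLayer<float> ffn(/*head_num*/32, /*head_size*/128, /*inter_size*/11008,
                             stream, cublas_wrapper, allocator);

    // the real example then allocates input/output buffers, wraps them in a
    // TensorMap, loads or randomizes LLaMAFFNWeights<float>, and calls
    // ffn.forward(inputs, outputs, weights, dyn_params)
    return 0;
}
```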
-------------------------------------------------------------------------------- /src/weights/llama/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(layerweights STATIC layer_weights.cc) 2 | target_link_libraries(layerweights PUBLIC -lcudart weightutils) 3 | add_library(llamaweights STATIC llama_weights.cc) 4 | target_link_libraries(llamaweights PUBLIC layerweights) -------------------------------------------------------------------------------- /src/weights/llama/ffn_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template 4 | struct LLaMAFFNWeights { 5 | BaseWeight gate; 6 | BaseWeight up; 7 | BaseWeight down; 8 | BaseWeight gateAndup; 9 | }; 10 | -------------------------------------------------------------------------------- /src/kernels/act_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | 8 | template 9 | void launchAct(TensorWrapper* input, TensorWrapper* out); -------------------------------------------------------------------------------- /src/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(weightutils STATIC weight_utils.cu) 2 | set_property(TARGET weightutils PROPERTY CUDA_SEPARABLE_COMPILATION ON) 3 | set_property(TARGET weightutils PROPERTY POSITION_INDEPENDENT_CODE ON) 4 | set_property(TARGET weightutils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -------------------------------------------------------------------------------- /src/utils/params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | // (RussWong) notes: some data structure to wrap many arguements of a function for simplicity 5 | using IntDict = std::unordered_map; 6 | using floatDict = std::unordered_map; -------------------------------------------------------------------------------- /src/weights/llama/attention_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/base_weights.h" 3 | template 4 | struct LLaMAattentionWeights { 5 | BaseWeight q; 6 | BaseWeight k; 7 | BaseWeight v; 8 | BaseWeight qkv; 9 | BaseWeight output; 10 | }; 11 | -------------------------------------------------------------------------------- /examples/cpp/ffn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(ffnExample ffn_example.cpp) 4 | set_property(TARGET ffnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET ffnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(ffnExample PUBLIC Llamaffn) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLMengine 2 | 具体运行步骤可见配套pdf文档中准备工作一栏 3 | 4 | 这里再重复一波 5 | 6 | # steps 7 | ``` 8 | 1.模型转换,见tools/README.md 9 | 10 | 2.将转换后模型的路径,替换到根目录下user_entry.cpp#L5的路径 11 | 12 | 3./path/to/LLM-engineering/llama2-7b-tokenizer.bin替换到user_entry.cpp#L6的路径 13 | 14 | 4. mkdir build && cd build && cmake .. 
&& make -j8 && ./bin/main 15 | ``` 16 | -------------------------------------------------------------------------------- /src/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | 3 | add_library(Llama STATIC llama/llama.cpp) 4 | set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(Llama PUBLIC LlamaCtxdecoder Llamaselfdecoder weightutils Llamaffn sample embeddingFunctor) 7 | -------------------------------------------------------------------------------- /src/kernels/build_casual_mask.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/macro.h" 7 | template 8 | void launchBuildCausalMasks(TensorWrapper* mask, 9 | TensorWrapper* q_lens, 10 | TensorWrapper* k_lens); -------------------------------------------------------------------------------- /src/kernels/input_embedding.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "src/utils/tensor.h" 5 | #include "src/weights/llama/embedding_weights.h" 6 | template 7 | void launchInputEmbedding(TensorWrapper* input_ids, 8 | TensorWrapper* output, 9 | EmbeddingWeight* embed_table); -------------------------------------------------------------------------------- /src/kernels/cal_paddingoffset.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/macro.h" 6 | #include "src/utils/tensor.h" 7 | 8 | void launchCalPaddingoffset(TensorWrapper* padding_offset, 9 | TensorWrapper* cum_seqlens, 10 | TensorWrapper* input_lengths //actual input lens 11 | ); -------------------------------------------------------------------------------- /src/kernels/fused_transpose_and_remv_pad.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchTransposeOutRemovePadding(TensorWrapper* qkv_buf_w_pad, 9 | TensorWrapper* padding_offset, 10 | TensorWrapper* qkv_buf_wo_pad_1); -------------------------------------------------------------------------------- /src/kernels/attn_softmax_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | template 8 | void launchScaleMaskAndSoftmax(TensorWrapper* qk, 9 | TensorWrapper* mask, 10 | TensorWrapper* attn_score, 11 | float scale); 12 | -------------------------------------------------------------------------------- /src/layers/ffn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(Llamaffn STATIC ffn.cpp) 4 | set_property(TARGET Llamaffn PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET Llamaffn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(Llamaffn PUBLIC 7 | -lcudart 8 | -lcudadevrt 9 | act 10 | linear) -------------------------------------------------------------------------------- /src/kernels/add_residual.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/utils/vectorize_utils.h" 7 | template 8 | void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 9 | TensorWrapper *residual, 10 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 11 | bool is_print=false 12 | ); 13 | -------------------------------------------------------------------------------- /src/models/llama/llama_params.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | struct LLaMAAttentionStaticParams { 3 | int rotary_embedding_dim; 4 | float rotary_embedding_base; 5 | int max_position_embeddings; 6 | bool use_dynamic_ntk; // for dyn scaling rope 7 | }; 8 | 9 | // (RussWong)note: llama类模型里面动态改变的变量, 注意非全部必需 10 | struct LLaMAAttentionDynParams { 11 | int batch_size; 12 | int num_tokens; 13 | int max_q_len; 14 | int max_k_len; 15 | int num_layers; 16 | bool is_ctx = false; 17 | }; 18 | -------------------------------------------------------------------------------- /src/kernels/repeat_kv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchRepeatKVCache(TensorWrapper *k_cache_src, 9 | TensorWrapper *v_cache_src, 10 | TensorWrapper *context_length, 11 | TensorWrapper *layer_id, 12 | TensorWrapper *k_cache_dst, 13 | TensorWrapper *v_cache_dst); 14 | -------------------------------------------------------------------------------- /src/kernels/sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "src/utils/tensor.h" 8 | #include "src/utils/params.h" 9 | 10 | template 11 | void launchSampling(TensorWrapper* topk_id, 12 | TensorWrapper* topk_val, 13 | TensorWrapper* seqlen, 14 | TensorWrapper* is_finished, 15 | TensorWrapper* output_id, 16 | IntDict& params); -------------------------------------------------------------------------------- /tools/HF_llama_run_script.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, LlamaForCausalLM 2 | # 注意,这个py用来debug的,作为本课程各个kernel的groudtruth,且此huggingface接口只接受Llama-2-7b-hf,不接受Llama-2-7b 3 | # 注意,此脚本我只在pip install transformers==4.38 sentencepiece accelerate的环境下验证了能跑通并拿来作为groudtruth 4 | model = LlamaForCausalLM.from_pretrained("/path/to/Llama-2-7b-hf") 5 | tokenizer = AutoTokenizer.from_pretrained("/path/to/Llama-2-7b-hf") 6 | prompt = "Hey, are you conscious? Can you talk to me?" 
7 | inputs = tokenizer(prompt, return_tensors="pt") 8 | generate_ids = model.generate(inputs.input_ids, max_length=30) 9 | -------------------------------------------------------------------------------- /src/utils/weight_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/utils/macro.h" 8 | 9 | template 10 | void GPUMalloc(T** ptr, size_t size); 11 | 12 | template 13 | void GPUFree(T* ptr); 14 | 15 | template ::value> struct loadWeightFromBin{ 16 | public: 17 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename); 18 | }; // 模板的泛化形式(原型) 19 | -------------------------------------------------------------------------------- /src/kernels/rmsnorm_kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/weights/llama/norm_weights.h" 7 | #include "src/utils/vectorize_utils.h" 8 | template 9 | void launchRMSNorm( TensorWrapper* decoder_out, // [num tokens, hidden_units] 10 | TensorWrapper* decoder_residual, 11 | LayerNormWeight& attn_norm_weight, //RMSNorm weights 12 | float eps, //RMSNorm eps 13 | bool is_last = false 14 | ); 15 | -------------------------------------------------------------------------------- /src/memory/allocator/base_allocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | class BaseAllocator 6 | { 7 | public: 8 | virtual ~BaseAllocator(){}; 9 | // unified interface for all derived allocator to alloc buffer 10 | template 11 | T* Malloc(T* ptr, size_t size, bool is_host){ 12 | return (T*)UnifyMalloc((void*)ptr, size, is_host); 13 | } 14 | virtual void* UnifyMalloc(void* ptr, size_t size, bool is_host = false) = 0; 15 | template 16 | void Free(T* ptr, bool is_host = false){ 17 | UnifyFree((void*)ptr, is_host); 18 | } 19 | virtual void UnifyFree(void* ptr, bool is_host = false) = 0; 20 | }; 21 | -------------------------------------------------------------------------------- /src/kernels/fused_addresidual_norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/weights/base_weights.h" 6 | #include "src/weights/llama/norm_weights.h" 7 | #include "src/utils/tensor.h" 8 | #include "src/utils/vectorize_utils.h" 9 | template 10 | void launchFusedAddBiasResidualRMSNorm( // residual.shape = [num tokens, hidden_units] 11 | TensorWrapper* residual, 12 | TensorWrapper* decoder_out, // [num tokens, hidden_units] 13 | BaseWeight& norm, 14 | T* scale, //RMSNorm weights 15 | float eps); 16 | -------------------------------------------------------------------------------- /examples/cpp/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(ctxDecoderExample context_decoder_example.cpp) 4 | set_property(TARGET ctxDecoderExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET ctxDecoderExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(ctxDecoderExample PUBLIC 7 | embeddingFunctor 8 | LlamaCtxdecoder) 9 | 10 | add_executable(selfDecoderExample self_decoder_example.cpp) 11 | set_property(TARGET selfDecoderExample PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | 
set_property(TARGET selfDecoderExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 13 | target_link_libraries(selfDecoderExample PUBLIC 14 | Llamaselfdecoder) -------------------------------------------------------------------------------- /src/kernels/concat_past_kv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | 7 | template 8 | void launchConcatKVCache(TensorWrapper *k_src, // from qkv bias and rope 9 | TensorWrapper *v_src, 10 | TensorWrapper *layer_id, // layer offset = layer_id * batchxbeam * max_seq_len * kv_head_num * head_size 11 | TensorWrapper *cur_query_length, // current epoch or local input length,[batchsize] 12 | TensorWrapper *history_length, 13 | TensorWrapper *k_dst, 14 | TensorWrapper *v_dst); // (RussWong)note: 少写一个;都会发生很多奇怪的错误 15 | -------------------------------------------------------------------------------- /src/kernels/fused_decoder_self_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/utils/tensor.h" 6 | #include "src/models/llama/llama_params.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/vectorize_utils.h" 9 | template 10 | void launchDecoderMaskedMHA(TensorWrapper* qkv_buf, 11 | BaseWeight& qkv, 12 | TensorWrapper* layer_id, 13 | TensorWrapper* k_cache, 14 | TensorWrapper* v_cache, 15 | TensorWrapper* finished, 16 | TensorWrapper* step, 17 | TensorWrapper* mha_output, 18 | LLaMAAttentionStaticParams& static_params); 19 | -------------------------------------------------------------------------------- /examples/cpp/attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_executable(CtxAttnExample context_attn_example.cpp) 4 | set_property(TARGET CtxAttnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET CtxAttnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(CtxAttnExample PUBLIC 7 | -lcudart 8 | -lcudadevrt 9 | LlamaCtxAttn) 10 | add_executable(selfAttnExample self_attn_example.cpp) 11 | set_property(TARGET selfAttnExample PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | set_property(TARGET selfAttnExample PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 13 | target_link_libraries(selfAttnExample PUBLIC 14 | -lcudart 15 | -lcudadevrt 16 | LlamaselfAttn 17 | linear) 18 | -------------------------------------------------------------------------------- /src/weights/base_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | enum class WeightType 6 | { 7 | FP32_W, 8 | FP16_W, 9 | INT8_W, 10 | UNSUPPORTED_W 11 | }; 12 | 13 | template 14 | inline WeightType getWeightType() 15 | { 16 | if (std::is_same::value || std::is_same::value) { 17 | return WeightType::FP32_W; 18 | } 19 | else if (std::is_same::value || std::is_same::value) { 20 | return WeightType::FP16_W; 21 | } 22 | else if (std::is_same::value || std::is_same::value) { 23 | return WeightType::INT8_W; 24 | } 25 | else { 26 | return WeightType::UNSUPPORTED_W; 27 | } 28 | } 29 | template 30 | struct BaseWeight { 31 | std::vector shape; 32 | T* data; 33 | WeightType type; 34 | T* bias; 35 | }; 36 | -------------------------------------------------------------------------------- /src/kernels/linear.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/kernels/cublas_utils.h" 7 | #include "src/utils/tensor.h" 8 | #include "src/weights/base_weights.h" 9 | #include "src/utils/macro.h" 10 | //TODO: when enable int8/int4 weight only, we can add a new type param T2 to represent weight type 11 | template 12 | void launchLinearGemm(TensorWrapper* input, 13 | BaseWeight& weight, 14 | TensorWrapper* output, 15 | cublasWrapper* cublas_wrapper, 16 | bool trans_a = false, 17 | bool trans_b = false); 18 | template 19 | void launchLinearStridedBatchGemm(TensorWrapper* input1, 20 | TensorWrapper* input2, 21 | TensorWrapper* output, 22 | cublasWrapper* cublas_wrapper, 23 | bool trans_a = false, 24 | bool trans_b = false); 25 | -------------------------------------------------------------------------------- /src/weights/llama/layer_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/norm_weights.h" 3 | #include "src/weights/llama/attention_weights.h" 4 | #include "src/weights/llama/ffn_weights.h" 5 | #include "src/utils/weight_utils.h" 6 | template 7 | class LlamaLayerWeight { 8 | private: 9 | int head_num; 10 | int kv_head_num; 11 | int head_size; 12 | int hidden_units; 13 | int inter_size; 14 | WeightType weight_type; 15 | int bit_size; 16 | bool attn_bias; 17 | 18 | public: 19 | LlamaLayerWeight() = delete; 20 | LlamaLayerWeight(int head_num, 21 | int kv_head_num, 22 | int head_size, 23 | int inter_size, 24 | WeightType weight_type, 25 | bool attn_bias); 26 | ~LlamaLayerWeight(); 27 | 28 | void loadWeights(std::string weight_path, WeightType weight_type); 29 | 30 | void loadWeights(); 31 | 32 | LayerNormWeight attn_norm_weight; 33 | LayerNormWeight ffn_norm_weight; 34 | LLaMAattentionWeights self_attn_weight; 35 | LLaMAFFNWeights ffn_weight; 36 | }; 37 | -------------------------------------------------------------------------------- /src/kernels/topK_bk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | template 8 | struct topK 9 | { 10 | T val[K]; 11 | int id[K]; 12 | 13 | __device__ void init() { 14 | for (int i = 0; i < K; i++) { 15 | id[i] = -1; 16 | val[i] = FLT_MIN; 17 | } 18 | } 19 | 20 | __device__ void insertHeap(T data, int data_id) { 21 | if (id[K-1] == -1 || val[K-1] < data) { 22 | id[K-1] = data_id; 23 | val[K-1] = data; 24 | } 25 | for (int i = K - 2; i >= 0; i--) { 26 | if (val[i + 1] > val[i] || id[i] == -1) { 27 | T tmp = val[i]; 28 | val[i] = val[i + 1]; 29 | val[i + 1] = tmp; 30 | int tmp_id = id[i]; 31 | id[i] = id[i + 1]; 32 | id[i + 1] = tmp_id; 33 | } 34 | } 35 | } 36 | }; 37 | 38 | template 39 | void launchTopKforBeamSearch(TensorWrapper *probs, 40 | TensorWrapper *tmp_topk_ids, 41 | TensorWrapper *tmp_topk_vals, 42 | TensorWrapper *final_topk_ids, 43 | TensorWrapper *final_topk_vals); 44 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | **steps to download HF llama weights and convert to bin file** 2 | 3 | 1. download weight from https://huggingface.co/meta-llama/Llama-2-7b/tree/main, note that maybe apply for access first 4 | 5 | 2. 
run `python convert_downloaded_llama_weights.py --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path` and then, we can get some output files 6 | 7 | 3. based on files got by step 2, we run `python weights_convert.py -i path/to/step2's_output_dir -o path/to/final_bin_file` 8 | 9 | then we can get the weights like below pic: 10 | 11 | ![image-20240208212828680](1.png) 12 | 13 | 4. at this time, the weight is ready, and replace the weight path in user_entry.cpp by your step3's output path 14 | 15 | **note** 16 | 1. 下载Llama-2-7b-hf这个模型也是可以的,但是它不需要做以上的step2,直接做step3即可 17 | 2. 下载Llama-2-7b-chat-hf这个模型是无法复现出课程结果的,这个模型的weight和Llama-2-7b-hf的weight不一样 18 | 3. 针对之前有同学反馈的无法复现出课程视频所示结果,修改代码weights_convert.py的97和115行的np.hstack为np.vstack即可,拿qkv linear来举例,我们想要计算的是y=x * (w^T) (这个等价于torch.nn.linear),但是hstack后,x.shape = [num tokens, hidden units], w.shape=[4096, 4096 * 3],很明显x*(w^T)不成立,搞人的地方在于cublas接收这种shape的数据时居然没有报错。如果是vstack,w.shape=[4096 * 3, 4096],恰好x*(w^T)符合矩阵乘法维度规则 19 | -------------------------------------------------------------------------------- /src/weights/llama/llama_weights.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "src/weights/weight.h" 4 | #include "src/weights/base_weights.h" 5 | #include "src/weights/llama/embedding_weights.h" 6 | #include "src/weights/llama/layer_weights.h" 7 | template 8 | struct LlamaWeight : public Weight { 9 | private: 10 | int hidden_units; 11 | int inter_size; 12 | int vocab_size; 13 | int vocab_size_padded; 14 | int num_layer; 15 | WeightType weight_type; 16 | 17 | public: 18 | std::vector*> llama_layer_weight; 19 | LayerNormWeight out_rmsnorm_weight; 20 | EmbeddingWeight post_decoder_embedding_weight; 21 | EmbeddingWeight pre_decoder_embedding_weight; 22 | 23 | LlamaWeight() = default; 24 | LlamaWeight( 25 | int head_num, 26 | int kv_head_num, 27 | int head_size, 28 | int inter_size, 29 | int vocab_size, 30 | int num_layer, 31 | bool attn_bias, 32 | WeightType weight_type 33 | ); 34 | ~LlamaWeight(); 35 | void loadWeights(std::string weight_path); 36 | void loadWeightsFromDummy(); 37 | }; -------------------------------------------------------------------------------- /src/layers/attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(LlamaCtxAttn STATIC context_attention.cpp) 4 | set_property(TARGET LlamaCtxAttn PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET LlamaCtxAttn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(LlamaCtxAttn PUBLIC 7 | # -lcudart 8 | -lcudadevrt 9 | qkv_bias_and_rope 10 | concat_kv 11 | # cublasWrapper 12 | linear 13 | fused_transpose_and_remv_pad 14 | repeat_kv 15 | mask_softmax) 16 | 17 | add_library(LlamaselfAttn STATIC masked_self_attention.cpp) 18 | 19 | set_property(TARGET LlamaselfAttn PROPERTY POSITION_INDEPENDENT_CODE ON) 20 | set_property(TARGET LlamaselfAttn PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 21 | target_link_libraries(LlamaselfAttn PUBLIC 22 | -lcudart 23 | -lcudadevrt 24 | fused_decoder_self_attention 25 | qkv_bias_and_rope 26 | ) 27 | -------------------------------------------------------------------------------- /src/kernels/qkv_bias_and_RoPE.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/models/llama/llama_params.h" 6 | #include 
"src/utils/tensor.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/vectorize_utils.h" 9 | 10 | template 11 | void launchAddFusedQKVBiasTransposeAndRoPE(TensorWrapper* q_buf, 12 | TensorWrapper* k_buf, 13 | TensorWrapper* v_buf, 14 | TensorWrapper* QKV, 15 | BaseWeight& qkv, 16 | //Tensor* qkv_bias, 17 | TensorWrapper* padding_offset, 18 | TensorWrapper* history_length, 19 | TensorWrapper* input_length, 20 | LLaMAAttentionStaticParams& params); 21 | 22 | template 23 | void launchRoPE(TensorWrapper* qkv_buf, 24 | TensorWrapper* step, 25 | LLaMAAttentionStaticParams& static_params); -------------------------------------------------------------------------------- /src/layers/decoder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_library(LlamaCtxdecoder STATIC context_decoder.cpp) 4 | set_property(TARGET LlamaCtxdecoder PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET LlamaCtxdecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | target_link_libraries(LlamaCtxdecoder PUBLIC 7 | LlamaCtxAttn 8 | Llamaffn 9 | llamaweights 10 | cal_paddingoffset 11 | build_casual_mask 12 | rmsnorm 13 | fused_addresidual_norm 14 | add_residual 15 | ) 16 | 17 | add_library(Llamaselfdecoder STATIC self_decoder.cpp) 18 | set_property(TARGET Llamaselfdecoder PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET Llamaselfdecoder PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | target_link_libraries(Llamaselfdecoder PUBLIC 21 | LlamaselfAttn 22 | Llamaffn 23 | llamaweights 24 | rmsnorm 25 | fused_addresidual_norm 26 | add_residual 27 | ) 28 | -------------------------------------------------------------------------------- /src/utils/vectorize_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | //(RussWong)note: below 5 overloaded function can convert different scalar type data to specified vector type data. 
6 | template 7 | inline __device__ T_OUT scalar_cast_vec(T_IN val) 8 | { 9 | return val; 10 | } 11 | 12 | template<> 13 | inline __device__ half2 scalar_cast_vec(float val) 14 | { 15 | return __float2half2_rn(val); 16 | } 17 | 18 | template<> 19 | inline __device__ float4 scalar_cast_vec(float val) 20 | { 21 | return make_float4(val, val, val, val); 22 | } 23 | 24 | template<> 25 | inline __device__ float2 scalar_cast_vec(float val) 26 | { 27 | return make_float2(val, val); 28 | } 29 | 30 | template<> 31 | inline __device__ half2 scalar_cast_vec(half val) 32 | { 33 | //(RussWong)note: __half2half2 cant be parsed by my nvcc compiler, so I give it up 34 | //return __half2half2(val); 35 | half2 res; 36 | res.x = val; 37 | res.y = val; 38 | return res; 39 | } 40 | 41 | template 42 | struct Vec { 43 | using Type = T; 44 | static constexpr int size = 0; 45 | }; 46 | template<> 47 | struct Vec { 48 | using Type = half2; 49 | static constexpr int size = 2; 50 | }; 51 | template<> 52 | struct Vec { 53 | using Type = float4; 54 | static constexpr int size = 4; 55 | }; 56 | //(RussWong)note: temply dont know which LLM use two continuous elements do RoPE 57 | struct TwoFloat2{ 58 | float2 x; 59 | float2 y; 60 | }; 61 | -------------------------------------------------------------------------------- /src/utils/cuda_debug_utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | // usage: print_data<<<1, 1>>>() 6 | // notes: you can self define the print info using your actual case. 7 | template 8 | __global__ void print_data(T* src1, bool is_target=false) { 9 | int tid = threadIdx.x; 10 | if(tid == 0) { 11 | printf("%dth = %f\n", tid, src1[tid]); 12 | printf("%dth = %f\n", tid + 1, src1[tid + 1]); 13 | // is_target is used to print the info for specified function, to avoid too much print info in screen. 
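// the offsets 128..131 and 1024 below are arbitrary probe positions deep inside the
// tensor; change them to whichever region you actually want to inspect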
14 | if (is_target){ 15 | printf("%dth = %f\n", tid + 128, src1[tid + 128]); 16 | printf("%dth = %f\n", tid + 129, src1[tid + 129]); 17 | printf("%dth = %f\n", tid + 130, src1[tid + 130]); 18 | printf("%dth = %f\n", tid + 131, src1[tid + 131]); 19 | printf("%dth = %f\n", tid + 1024, src1[tid + 1024]); 20 | } 21 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid, src3[tid]); 22 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 1, src3[tid+1]); 23 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 128, src3[tid+128]); 24 | // printf("from_tensor/outlinearin data[%d] = %f\n", tid + 129, src3[tid+129]); 25 | 26 | // printf("qkvweight/outweight data[%d] = %f\n", tid, src2[tid]); 27 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 1, src2[tid+1]); 28 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 128, src2[tid+128]); 29 | // printf("qkvweight/outweight data[%d] = %f\n", tid + 129, src2[tid +129]); 30 | // printf("linear done\n"); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/layers/ffn/ffn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/weights/llama/ffn_weights.h" 4 | #include "src/memory/allocator/cuda_allocator.h" 5 | #include "src/kernels/linear.h" 6 | #include "src/utils/tensor.h" 7 | #include "src/kernels/cublas_utils.h" 8 | #include "src/models/llama/llama_params.h" 9 | #include "src/kernels/act_kernel.h" 10 | #include "src/utils/macro.h" 11 | template 12 | class LLaMAFFNLayer { 13 | private: 14 | // this params are shared across all LLMs 15 | const int head_num; 16 | const int head_size; 17 | const int inter_size; 18 | const int hidden_units; 19 | int count = -1; // used to record layer index currently 20 | 21 | cudaStream_t stream; 22 | BaseAllocator* allocator; 23 | // for linear proj 24 | cublasWrapper* cublas_wrapper; 25 | 26 | // buffer 27 | // [2, num tokens, intersize] 28 | TensorWrapper* SwiGLU_input = nullptr; //gate proj and up proj output buf 29 | // [num tokens, intersize] 30 | TensorWrapper* down_proj_input = nullptr; 31 | 32 | 33 | public: 34 | LLaMAFFNLayer(int head_num, 35 | int head_size, 36 | int inter_size, 37 | cudaStream_t stream, 38 | cublasWrapper* cublas_wrapper, 39 | BaseAllocator* allocator); 40 | 41 | void allocForForward(LLaMAAttentionDynParams& params); 42 | void allocForForward(int batch_size); 43 | void freeBuf(); 44 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAFFNWeights& weights, LLaMAAttentionDynParams& params); 45 | }; 46 | -------------------------------------------------------------------------------- /src/models/basemodel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "src/utils/tensor.h" 5 | #include "src/models/common_params.h" 6 | #include "src/memory/allocator/base_allocator.h" 7 | #include "src/kernels/cublas_utils.h" 8 | // (RussWong)note: 回调函数, 用于打印当前轮次对话的LLM生成内容 9 | using CallBack = std::function; 10 | 11 | class BaseModel{ 12 | public: 13 | std::string model_name; 14 | // (RussWong)note: 必需的且所有模型子类都共有的4个数据成员 15 | cudaStream_t stream; 16 | cublasWrapper* cublas_wrapper; 17 | BaseAllocator* allocator; 18 | cudaDeviceProp* cuda_device_prop; 19 | BaseModel(cudaStream_t stream, 20 | cublasWrapper* cublas_wrapper, 21 | BaseAllocator* allocator, 22 | cudaDeviceProp* cuda_device_prop = nullptr): 23 | 
stream(stream), 24 | cublas_wrapper(cublas_wrapper), 25 | allocator(allocator), 26 | cuda_device_prop(cuda_device_prop){}; 27 | // (RussWong)note: 3个纯虚函数API, 每个具体模型子类需要实现 28 | virtual void loadTokenizer(std::string file) = 0; 29 | virtual void loadWeights(std::string file) = 0; 30 | virtual void loadWeightsFromDummy() = 0; 31 | // (RussWong)note: 3个纯虚函数API, 用于定义每轮对话的输入、历史记录和回复API, 每个具体模型子类需要实现 32 | // 根据历史信息和当前输入生成当前轮次的prompt 33 | virtual std::vector MakeInput(const std::string &history, int round, const std::string &input) = 0; 34 | // 根据当前轮次回复更新到history string 35 | virtual std::string MakeHistory(const std::string &history, int round, const std::string &input, const std::string &output) = 0; 36 | // 回复内容的返回接口 37 | virtual std::string Response(const std::vector& input, CallBack PrintRes) = 0; 38 | }; 39 | -------------------------------------------------------------------------------- /user_entry.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/model_utils.h" 3 | 4 | struct ConvertedModel { 5 | std::string model_path = "/home/llamaweight/"; // 模型文件路径 6 | std::string tokenizer_path = "/home/llama2-7b-tokenizer.bin"; // tokenizer文件路径 7 | }; 8 | 9 | int main(int argc, char **argv) { 10 | int round = 0; 11 | std::string history = ""; 12 | ConvertedModel model; 13 | // auto model = llm::CreateDummyLLMModel(model.tokenizer_file);//load dummy weight + load tokenizer 14 | auto llm_model = llm::CreateRealLLMModel(model.model_path, model.tokenizer_path);//load real weight + load tokenizer 15 | std::string model_name = llm_model->model_name; 16 | // exist when generate end token or reach max seq 17 | while (true) { 18 | printf("please input the question: "); 19 | std::string input; 20 | std::getline(std::cin, input); 21 | if (input == "s") {//停止对话 22 | break; 23 | } 24 | // (RussWong)notes: index = 生成的第几个token,从0开始 25 | std::string retString = llm_model->Response(llm_model->MakeInput(history, round, input), [model_name](int index, const char* content) { 26 | if (index == 0) { 27 | printf(":%s", content); 28 | fflush(stdout); 29 | } 30 | if (index > 0) { 31 | printf("%s", content); 32 | fflush(stdout); 33 | } 34 | if (index == -1) { 35 | printf("\n"); 36 | } 37 | }); 38 | //(RussWong)notes: 多轮对话保留history,和当前轮次input制作成新的上下文context 39 | history = llm_model->MakeHistory(history, round, input, retString); 40 | round++; 41 | } 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /src/utils/string_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include // std::make_unique 3 | #include // std::stringstream 4 | #include 5 | #include 6 | //(RussWong)note: this function allow us can self define print string 7 | template 8 | inline std::string fmtstr(const std::string& format, Args... args) 9 | { 10 | // This function came from a code snippet in stackoverflow under cc-by-1.0 11 | // https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf 12 | 13 | // Disable format-security warning in this function. 14 | int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ 1; // Extra space for '\0' 15 | if (size_s <= 0) { 16 | throw std::runtime_error("Error during formatting."); 17 | } 18 | auto size = static_cast(size_s); 19 | std::unique_ptr buf(new char[size]); 20 | std::snprintf(buf.get(), size, format.c_str(), args...); 21 | return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside 22 | } 23 | //(RussWong)note: below two functions allow us can convert elements in vector or pointer to string 24 | template 25 | inline std::string vec2str(std::vector vec) 26 | { 27 | std::stringstream ss; 28 | ss << "("; 29 | if (!vec.empty()) { 30 | for (size_t i = 0; i < vec.size() - 1; ++i) { 31 | ss << vec[i] << ", "; 32 | } 33 | ss << vec.back(); 34 | } 35 | ss << ")"; 36 | return ss.str(); 37 | } 38 | 39 | template 40 | inline std::string arr2str(T* arr, size_t size) 41 | { 42 | std::stringstream ss; 43 | ss << "("; 44 | for (size_t i = 0; i < size - 1; ++i) { 45 | ss << arr[i] << ", "; 46 | } 47 | if (size > 0) { 48 | ss << arr[size - 1]; 49 | } 50 | ss << ")"; 51 | return ss.str(); 52 | } 53 | -------------------------------------------------------------------------------- /src/kernels/topK.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | template 8 | struct topK 9 | { 10 | T val[K]; 11 | int id[K]; 12 | 13 | __device__ void init(){ 14 | for (int i = 0; i < K; i++) { 15 | id[i] = -1; 16 | val[i] = -1e-20; 17 | } 18 | } 19 | 20 | __device__ void insertHeap(T data, int data_id){ 21 | float v = (float)val[K-1]; 22 | if(id[K-1] == -1 || v < (float)data){ 23 | id[K-1] = data_id; 24 | val[K-1] = data; 25 | } 26 | //Note: 仅需一轮冒泡排序(插入新元素的重排),因为此时除了最后一个新元素,其它都是有序 27 | for (int i = K - 2; i >= 0; i--){ 28 | if(val[i + 1] > val[i] || id[i] == -1) { 29 | T tmp = val[i]; 30 | val[i] = val[i + 1]; 31 | val[i + 1] = tmp; 32 | int tmp_id = id[i]; 33 | id[i] = id[i + 1]; 34 | id[i + 1] = tmp_id; 35 | } 36 | } 37 | } 38 | }; 39 | 40 | 41 | template 42 | void launchTopKforBeamSearch(TensorWrapper *probs, 43 | TensorWrapper *topk_ids, 44 | TensorWrapper *topk_vals, 45 | TensorWrapper *final_topk_ids, 46 | TensorWrapper *final_topk_vals); 47 | // template 48 | // void launchTopKforBeamSearch(const T* probs, 49 | // const int batch_size, 50 | // const int vocab_size, 51 | // int* topk_ids, 52 | // T* topk_vals, 53 | // int* final_topk_ids, 54 | // T* final_topk_vals); 55 | -------------------------------------------------------------------------------- /src/kernels/build_casual_mask.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/build_casual_mask.h" 2 | // mask shape = [bs, max_q_len, max_k_len] 3 | template 4 | __global__ void BuildCausalMasksConsideringContextPastKV(T* mask, 5 | const int* q_lens, //input lens, shape=[batch size] 6 | const int* k_lens, //context lens, shape=[batch size] 7 | int max_q_len, 8 | int max_k_len){ 9 | int tid = threadIdx.x; 10 | int qlen = q_lens[blockIdx.x]; 11 | int klen = k_lens[blockIdx.x]; 12 | mask += blockIdx.x * max_q_len * max_k_len; 13 | int offset = threadIdx.x; 14 | // note: this judgement confirms we dont exceed data boundry 15 | while (offset < max_q_len * max_k_len){ 16 | int q = offset / max_k_len; 17 | int k = offset % max_k_len; 18 | bool is_one = q < qlen && k < klen && k <= q + (klen - qlen) && k >= klen - qlen; 19 | mask[offset] = static_cast(is_one); 20 | 21 | offset += blockDim.x; 22 | } 23 | } 24 | 
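// Parallelization of the kernel above: one block per batch entry (hence the
// mask += blockIdx.x * max_q_len * max_k_len offset), with the block's threads
// striding over the flattened [max_q_len, max_k_len] tile. A query at local row q
// sits at absolute position q + (klen - qlen) once the cached context is counted,
// which is where the k <= q + (klen - qlen) bound in the condition comes from.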
25 | template 26 | void launchBuildCausalMasks(TensorWrapper* mask, 27 | TensorWrapper* q_lens, 28 | TensorWrapper* k_lens) 29 | { 30 | int batch_size = mask->shape[0]; 31 | int max_q_len = mask->shape[1]; 32 | int max_k_len = mask->shape[2]; 33 | BuildCausalMasksConsideringContextPastKV<<>>(mask->data, q_lens->data, k_lens->data, max_q_len, max_k_len); 34 | } 35 | 36 | template void launchBuildCausalMasks(TensorWrapper* mask, 37 | TensorWrapper* q_lens, 38 | TensorWrapper* k_lens); 39 | 40 | template void launchBuildCausalMasks(TensorWrapper* mask, 41 | TensorWrapper* q_lens, 42 | TensorWrapper* k_lens); 43 | -------------------------------------------------------------------------------- /src/kernels/cal_paddingoffset.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/cal_paddingoffset.h" 2 | // shape: 3 | //seq_lengths:[batch size] 4 | //cum_seqlens:[batch size + 1],first ele is 0 5 | //padding_offset:[batch size * max q len] 6 | // note: the point is to calc padding offset and cum offset 7 | // TODO: we first use serial algo, then can enhance to CUDA scan algo 8 | 9 | __global__ void CalPaddingoffset(int* padding_offset, 10 | int* cum_seqlens, 11 | const int* input_lengths, //actual input lens 12 | const int batch_size, 13 | const int max_q_len) { 14 | int ind = 0; 15 | int cum_offset = 0; 16 | int total_seqlen = 0; 17 | for(int b = 0; b < batch_size; b++) { 18 | int seqlen = input_lengths[b]; 19 | 20 | cum_seqlens[b] = total_seqlen; 21 | // each token in one seq has same cum offset 22 | for (int i = 0; i < seqlen; i++) { 23 | padding_offset[ind] = cum_offset; 24 | ind++; 25 | } 26 | cum_offset += max_q_len - seqlen; 27 | total_seqlen += seqlen; 28 | } 29 | cum_seqlens[batch_size] = total_seqlen; 30 | } 31 | 32 | void launchCalPaddingoffset(TensorWrapper* padding_offset, 33 | TensorWrapper* cum_seqlens, 34 | TensorWrapper* input_lengths)//actual input lens 35 | { 36 | const int batch_size = padding_offset->shape[0]; 37 | const int max_q_len = padding_offset->shape[1]; 38 | LLM_CHECK_WITH_INFO(batch_size == input_lengths->shape[0], "input lenghts numbers should equal to padding offset bs dim!") ; 39 | LLM_CHECK_WITH_INFO(batch_size == cum_seqlens->shape[0] - 1, "cum seqlen numbers should equal to padding offset bs dim + 1!") ; 40 | CalPaddingoffset<<<1, 1>>>( 41 | padding_offset->data, cum_seqlens->data, input_lengths->data, batch_size, max_q_len 42 | ); 43 | } -------------------------------------------------------------------------------- /src/kernels/input_embedding.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/input_embedding.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | template 5 | __global__ void embeddingFunctor(const int* input_ids, 6 | T* output, 7 | const T* embed_table, 8 | const int max_context_token_num, 9 | const int hidden_size) 10 | { 11 | int index = blockIdx.x * blockDim.x + threadIdx.x; 12 | while (index < max_context_token_num * hidden_size) { 13 | int id = input_ids[index / hidden_size]; 14 | output[index] = embed_table[id * hidden_size + index % hidden_size]; 15 | index += blockDim.x * gridDim.x; 16 | } 17 | } 18 | 19 | template 20 | void launchInputEmbedding(TensorWrapper* input_ids, // INT [token num] 21 | TensorWrapper* output, // FP32 [token num, hidden_size] = [token num, 4096] 22 | EmbeddingWeight* embed_table// FP32 [vocal_size, hidden_size] 23 | ) { 24 | const int blockSize = 256; 25 | const int 
max_context_token_num = output->shape[0]; // token num 26 | const int hidden_size = output->shape[1]; 27 | const int gridSize = 2048; 28 | LLM_CHECK_WITH_INFO(max_context_token_num == input_ids->shape[0], "input ids 1st shape should equal to 1st shape of output"); 29 | embeddingFunctor<<>>(input_ids->data, 30 | output->data, 31 | embed_table->data, 32 | max_context_token_num, 33 | hidden_size); 34 | #ifdef PRINT_DATA 35 | print_data<<<1, 1>>>(output->data); 36 | #else 37 | #endif 38 | } 39 | 40 | template void launchInputEmbedding(TensorWrapper* input_ids, 41 | TensorWrapper* output, 42 | EmbeddingWeight* embed_table); 43 | template void launchInputEmbedding(TensorWrapper* input_ids, 44 | TensorWrapper* output, 45 | EmbeddingWeight* embed_table); 46 | -------------------------------------------------------------------------------- /src/layers/attention/masked_self_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/memory/allocator/cuda_allocator.h" 4 | #include "src/kernels/linear.h" //1st/4th kernel of masked self attention, qkv gemm 5 | #include "src/kernels/attn_softmax_kernel.h" 6 | #include "src/kernels/qkv_bias_and_RoPE.h" //2nd rope 7 | #include "src/kernels/fused_decoder_self_attention.h" //3rd kernel 8 | #include "src/utils/tensor.h" 9 | #include "src/kernels/cublas_utils.h" 10 | #include "src/models/llama/llama_params.h" 11 | #include "src/utils/macro.h" 12 | 13 | // (RussWong)note: 这里面的数据成员都是只存在于attention layer,而不像finished,seq lengths这种贯穿整个过程 14 | template 15 | class LLaMASelfAttentionLayer { 16 | private: 17 | // this params are shared across all LLMs 18 | const int head_num; 19 | const int head_size; 20 | const int hidden_units; 21 | const int q_head_per_kv; //for GQA and MQA 22 | const int kv_head_num; 23 | float scale; 24 | // this params are only saw in llama and are unchanged 25 | LLaMAAttentionStaticParams attn_static_params; 26 | cudaStream_t stream; 27 | BaseAllocator* allocator; 28 | // for linear and batchgemm 29 | cublasWrapper* cublas_wrapper; 30 | 31 | // intermedia buffer 32 | TensorWrapper* qkv_buf = nullptr; // for qkv linear output and rope input/output 33 | TensorWrapper* mha_output = nullptr; // mha output, then invoke a linear to attention output 34 | 35 | public: 36 | LLaMASelfAttentionLayer(int head_num, 37 | int kv_head_num, 38 | int head_size, 39 | LLaMAAttentionStaticParams attn_params, 40 | cudaStream_t stream, 41 | cublasWrapper* cublas_wrapper, 42 | BaseAllocator* allocator); 43 | // (RussWong)note: private data member can only be accessed by member function 44 | LLaMAAttentionStaticParams& GetAttnStaticParams(){ 45 | return attn_static_params; 46 | } 47 | void allocForForward(LLaMAAttentionDynParams& params); 48 | void freeBuf(); 49 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params); 50 | }; 51 | -------------------------------------------------------------------------------- /src/kernels/act_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/act_kernel.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | #include "src/utils/macro.h" 5 | template 6 | __device__ __forceinline__ T silu(const T& in) { 7 | // x * sigmoid(x) 8 | return (T) (((float) in) / (1.0f + expf((float) -in))); 9 | } 10 | 11 | template<> 12 | __device__ __forceinline__ half2 silu(const half2& in) { 13 | 
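// evaluate silu per lane in fp32 via the scalar template above, then repack the
// two results into a half2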
return make_half2(__float2half(silu((float)(in.x))), __float2half(silu((float)(in.y)))); 14 | } 15 | 16 | //代码逻辑:第一个intermediate 去做silu,结果与第二个intermediate mul 17 | template 18 | __global__ void silu_and_mul_kernel( 19 | T* out, // [bs, intermedia size] 20 | const T* input, // [bs, 2, intermedia size] 21 | const int intermedia_size) { 22 | const int batch_idx = blockIdx.x; 23 | for (int idx = threadIdx.x; idx < intermedia_size; idx += blockDim.x) { 24 | const T x = input[batch_idx * 2 * intermedia_size + idx]; 25 | const T y = input[batch_idx * 2 * intermedia_size + intermedia_size + idx]; 26 | out[batch_idx * intermedia_size + idx] = silu(x) * y; 27 | } 28 | } 29 | 30 | template<> 31 | __global__ void silu_and_mul_kernel( 32 | half* out, // [bs, intermedia size] 33 | const half* input, // [bs, 2, intermedia size] 34 | const int intermedia_size) { 35 | const int batch_idx = blockIdx.x; 36 | int vec_size = Vec::size; 37 | using Vec_t = typename Vec::Type; 38 | for (int idx = threadIdx.x * vec_size; idx < intermedia_size; idx += blockDim.x) { 39 | const Vec_t x = *reinterpret_cast(const_cast(&input[batch_idx * 2 * intermedia_size + idx])); 40 | const Vec_t y = *reinterpret_cast(const_cast(&input[batch_idx * 2 * intermedia_size + intermedia_size + idx])); 41 | *reinterpret_cast(&out[batch_idx * intermedia_size + idx]) = __hmul2(silu(x), y); 42 | } 43 | 44 | } 45 | 46 | template 47 | void launchAct(TensorWrapper* input, TensorWrapper* out) { 48 | int batch_size = input->shape[0]; 49 | LLM_CHECK(input->shape[1] == 2); 50 | int intermedia_size = input->shape[2]; 51 | dim3 grid(batch_size); 52 | dim3 block(256); 53 | silu_and_mul_kernel<<>>(out->data, input->data, intermedia_size); 54 | #ifdef PRINT_DATA 55 | printf("act kernel top2 result:\n"); 56 | print_data<<<1, 1>>>(out->data); 57 | #else 58 | #endif 59 | } 60 | // We must instancite the template, if not, will report linking issue 61 | template void launchAct(TensorWrapper* input, TensorWrapper* output); 62 | template void launchAct(TensorWrapper* input, TensorWrapper* output); 63 | -------------------------------------------------------------------------------- /src/layers/attention/context_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/weights/llama/attention_weights.h" 3 | #include "src/memory/allocator/cuda_allocator.h" 4 | #include "src/kernels/linear.h" 5 | #include "src/kernels/attn_softmax_kernel.h" 6 | #include "src/kernels/qkv_bias_and_RoPE.h" 7 | #include "src/kernels/fused_transpose_and_remv_pad.h" 8 | #include "src/kernels/concat_past_kv.h" 9 | #include "src/kernels/repeat_kv.h" 10 | #include "src/utils/tensor.h" 11 | #include "src/kernels/cublas_utils.h" 12 | #include "src/models/llama/llama_params.h" 13 | template 14 | class LLaMAContextAttentionLayer { 15 | private: 16 | // this params are shared across all LLMs 17 | const int head_num; 18 | const int head_size; 19 | const int hidden_units; 20 | const int q_head_per_kv; //for GQA and MQA 21 | const int kv_head_num; 22 | float scale; 23 | // this params are only saw in llama and are unchanged 24 | LLaMAAttentionStaticParams attn_static_params; 25 | cudaStream_t stream; 26 | BaseAllocator* allocator; 27 | // for linear and batchgemm 28 | cublasWrapper* cublas_wrapper; 29 | 30 | TensorWrapper* qkv_buf_wo_pad = nullptr; 31 | TensorWrapper* q_buf_w_pad = nullptr; 32 | TensorWrapper* k_buf_w_pad = nullptr; 33 | TensorWrapper* v_buf_w_pad = nullptr; 34 | TensorWrapper* k_cache_buf = nullptr; 35 | 
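// k_cache_buf / v_cache_buf (above/below) hold this layer's KV after
// launchRepeatKVCache broadcasts the kv heads up to the q head count for GQA/MQA,
// so they can serve as operands of the batched GEMMs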
TensorWrapper* v_cache_buf = nullptr; 36 | TensorWrapper* qk_buf = nullptr; 37 | TensorWrapper* qkv_buf_w_pad = nullptr; 38 | TensorWrapper* qkv_buf_wo_pad_1 = nullptr; 39 | 40 | public: 41 | LLaMAContextAttentionLayer(int head_num, 42 | int kv_head_num, 43 | int head_size, 44 | LLaMAAttentionStaticParams attn_params, 45 | cudaStream_t stream, 46 | cublasWrapper* cublas_wrapper, 47 | BaseAllocator* allocator); 48 | LLaMAAttentionStaticParams& GetAttnStaticParams(){ 49 | return attn_static_params; 50 | } 51 | 52 | void allocForForward(LLaMAAttentionDynParams& params); 53 | void freeBuf(); 54 | void forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params, LLaMAAttentionStaticParams& static_params); 55 | // whats the diff across these 3 max len: 56 | // max_seq_len is the max kv len considering context, ep. multiple epochs chat 57 | // max_q_len is the current max q len after padding in this batch 58 | // all kv cache is max seq len to save all kv cache in all epochs, but in context attention, all kv cache should be broadcast to adapt q as kv cache buf whose shape is max k len 59 | // so max k len is the max context len in cur batch 60 | // void flashAttn(); 61 | }; 62 | -------------------------------------------------------------------------------- /src/kernels/cublas_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "src/utils/macro.h" 8 | //1.cublas API: must allocate the required matrices in the GPU memory space, 9 | // fill them with data, call the sequence of desired cuBLAS functions, and then upload the results back to the host. 10 | //2.cublasXt API: have the data on the Host 11 | //3.cuBLASLt API: lightweight library dedicated to GEMM with a new flexible API. 
12 | // adds flexibility in matrix data layouts, input types, compute types, and also in choosing the algorithmic implementations and heuristics through parameter programmability 13 | class cublasWrapper { 14 | private: 15 | cublasHandle_t cublas_handle_; 16 | cublasLtHandle_t cublaslt_handle_; 17 | 18 | cudaDataType_t Atype_; 19 | cudaDataType_t Btype_; 20 | cudaDataType_t Ctype_; 21 | cudaDataType_t computeType_; 22 | 23 | public: 24 | cublasWrapper(cublasHandle_t cublas_handle_, 25 | cublasLtHandle_t cublaslt_handle_); 26 | // BaseAllocator* allocator); enable it when we use cublasLt API 27 | 28 | ~cublasWrapper(); 29 | void setFP32GemmConfig(); 30 | void setFP16GemmConfig(); 31 | //for proj matmul 32 | void Gemm(cublasOperation_t transa, 33 | cublasOperation_t transb, 34 | const int m, 35 | const int n, 36 | const int k, 37 | const void* A, 38 | const int lda, 39 | const void* B, 40 | const int ldb, 41 | void* C, 42 | const int ldc, 43 | float alpha, 44 | float beta); 45 | // for qk*v and q*k 46 | void stridedBatchedGemm(cublasOperation_t transa, 47 | cublasOperation_t transb, 48 | const int m, 49 | const int n, 50 | const int k, 51 | const void* A, 52 | const int lda, 53 | const int64_t strideA, 54 | const void* B, 55 | const int ldb, 56 | const int64_t strideB, 57 | void* C, 58 | const int ldc, 59 | const int64_t strideC, 60 | const int batchCount, 61 | float f_alpha, 62 | float f_beta); 63 | }; 64 | -------------------------------------------------------------------------------- /src/kernels/add_residual.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/add_residual.h" 3 | #include "src/utils/cuda_debug_utils.cuh" 4 | 5 | // (RussWong)note: this kernel is used at the end of FFN in every decoder layer 6 | template 7 | __global__ void AddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 8 | T *residual, 9 | T *decoder_out, // [num tokens, hidden_units] 10 | int num_tokens, 11 | int hidden_units) 12 | { 13 | int vec_size = Vec::size; 14 | using Vec_t = typename Vec::Type; 15 | int batch_id = blockIdx.x; 16 | int tid = threadIdx.x; 17 | Vec_t *dout = reinterpret_cast(decoder_out + batch_id * hidden_units); 18 | Vec_t *rsd = reinterpret_cast(residual + batch_id * hidden_units); 19 | for (int i = tid; i < hidden_units / vec_size; i += blockDim.x) 20 | { 21 | dout[i].x += rsd[i].x; 22 | dout[i].y += rsd[i].y; 23 | dout[i].z += rsd[i].z; 24 | dout[i].w += rsd[i].w; 25 | } // addresidual 26 | } 27 | 28 | template <> 29 | __global__ void AddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 30 | half *residual, 31 | half *decoder_out, // [num tokens, hidden_units] 32 | int num_tokens, 33 | int hidden_units) 34 | { 35 | int vec_size = Vec::size; 36 | using Vec_t = typename Vec::Type; 37 | int batch_id = blockIdx.x; 38 | int tid = threadIdx.x; 39 | Vec_t *dout = reinterpret_cast(decoder_out + batch_id * hidden_units); 40 | Vec_t *rsd = reinterpret_cast(residual + batch_id * hidden_units); 41 | for (int i = tid; i < hidden_units / vec_size; i += blockDim.x) 42 | { 43 | dout[i] = __hadd2(dout[i], rsd[i]); 44 | } // addresidual 45 | } 46 | 47 | template 48 | void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, 256 threads travrse hiddenunits eles recursely 49 | TensorWrapper *residual, 50 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 51 | bool is_print 52 | ) 
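// launch configuration (set just below): one block per token row and 256 threads,
// each thread handling Vec<T>::size elements per iteration, so hidden_units is
// expected to be divisible by the vector width (4 floats or 2 halves)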
53 | { 54 | int batch_size = decoder_out->shape[0]; 55 | int hidden_units = decoder_out->shape[1]; 56 | int vec_size = Vec::size; 57 | dim3 grid(batch_size); 58 | dim3 block(256); 59 | AddResidual<<>>(residual->data, 60 | decoder_out->data, 61 | batch_size, 62 | hidden_units); 63 | #ifdef PRINT_DATA 64 | if (is_print){ 65 | print_data<<<1, 1>>>(decoder_out->data); 66 | } 67 | #else 68 | #endif 69 | } 70 | template void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 71 | TensorWrapper *residual, 72 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 73 | bool is_print 74 | ); 75 | template void launchAddResidual( // residual.shape = [num tokens, hidden_units], batch_size = num tokens, n_dims = hidden_units 76 | TensorWrapper *residual, 77 | TensorWrapper *decoder_out, // [num tokens, hidden_units] 78 | bool is_print 79 | ); 80 | -------------------------------------------------------------------------------- /tests/unittests/test_fused_trans_remv_pad.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/fused_transpose_and_remv_pad.h" 2 | #include 3 | // [b,h,s,d]=>[b,s,h,d]=>[num tokens,h,d] 4 | // padding_offset.shape = [num_tokens] 5 | // (RussWong)note: this kernel is only supporting fp32 type UT 6 | // we compare the kernel correctnesss by eyes and result print infos 7 | // `./test_fused_trans_remv_pad` to test fp32 kernel 8 | int main() { 9 | const int batch_size = 2; 10 | const int head_num = 2; 11 | const int max_seq_len = 4; 12 | const int head_size = 2; 13 | const int num_tokens = 5; 14 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 15 | const int in_size = batch_size * head_num * max_seq_len * head_size; 16 | const int out_size = num_tokens * head_num * head_size; 17 | float* h_in; 18 | float* d_in; 19 | h_in = (float*)malloc(sizeof(float) * in_size); 20 | cudaMalloc((void**)&d_in, sizeof(float) * in_size); 21 | float* h_out; 22 | float* d_out; 23 | h_out = (float*)malloc(sizeof(float) * out_size); 24 | cudaMalloc((void**)&d_out, sizeof(float) * out_size); 25 | int* h_padding_offset; 26 | int* d_padding_offset; 27 | h_padding_offset = (int*)malloc(sizeof(int) * num_tokens); 28 | cudaMalloc((void**)&d_padding_offset, sizeof(int) * num_tokens); 29 | 30 | //1st seqlen: 2, due to 1st seq, so its padding offset are all 0 31 | //2nd seqlen: 3, so its padding offset are all 4-2=2 32 | for(int i = 0; i < in_size; i++) { 33 | h_in[i] = i; 34 | } 35 | for(int i = 0; i < 2; i++) { 36 | h_padding_offset[i] = 0; 37 | } 38 | h_padding_offset[2] = 2; 39 | h_padding_offset[3] = 2; 40 | h_padding_offset[4] = 2; 41 | 42 | cudaMemcpy(d_in, h_in, sizeof(float) * in_size, cudaMemcpyHostToDevice); 43 | cudaMemcpy(d_padding_offset, h_padding_offset, sizeof(int) * num_tokens, cudaMemcpyHostToDevice); 44 | 45 | DataType type = getTensorType(); 46 | DataType type_pad = getTensorType(); 47 | TensorWrapper* in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_seq_len, head_size}, d_in); 48 | TensorWrapper* in_pad = new TensorWrapper(Device::GPU, type_pad, {num_tokens}, d_padding_offset); 49 | TensorWrapper* out = new TensorWrapper(Device::GPU, type, {num_tokens, head_num, head_size}, d_out); 50 | std::cout << "before launch softmax kernel" << std::endl; 51 | launchTransposeOutRemovePadding(in, in_pad, out); 52 | std::cout << "after launch softmax kernel" << std::endl; 53 | std::cout << 
"cuda memcpy device to host" << std::endl; 54 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 55 | cudaMemcpy(h_out, out->data, sizeof(float) * out_size, cudaMemcpyDeviceToHost); 56 | for(int i = 0; i < out_size; i++) { 57 | printf("after trans and remv pad, out[%d] = %f\n", i, h_out[i]); 58 | } 59 | // debug info, better to retain: std::cout << "before free" << std::endl; 60 | free(h_in); 61 | free(h_out); 62 | free(h_padding_offset); 63 | cudaFree(d_in); 64 | cudaFree(d_out); 65 | cudaFree(d_padding_offset); 66 | } -------------------------------------------------------------------------------- /tests/unittests/test_cal_paddingoffset.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include "src/kernels/cal_paddingoffset.h" 9 | // (RussWong)note: this kernel is only int type input and output, not fp32 or half 10 | // we compare the kernel correctnesss by eyes and result print infos 11 | // `./paddingoffset` to run 12 | int main() { 13 | const int batch_size = 3; 14 | const int max_q_len = 5; 15 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 16 | int* h_seq_lens; 17 | int *d_seq_lens; 18 | h_seq_lens = (int*)malloc(sizeof(int) * batch_size); 19 | cudaMalloc((void**)&d_seq_lens, sizeof(int) * batch_size); 20 | 21 | int* h_cum_seqlens; 22 | int* d_cum_seqlens; 23 | h_cum_seqlens = (int*)malloc(sizeof(int) * (batch_size + 1)); 24 | cudaMalloc((void**)&d_cum_seqlens, sizeof(int) * (batch_size + 1)); 25 | 26 | int* h_padding_offset; 27 | int* d_padding_offset; 28 | h_padding_offset = (int*)malloc(sizeof(int) * batch_size * max_q_len); 29 | cudaMalloc((void**)&d_padding_offset, sizeof(int) * batch_size * max_q_len); 30 | 31 | for(int i = 0; i < batch_size; i++) { // 3 32 | h_seq_lens[i] = batch_size; 33 | } 34 | cudaMemcpy(d_seq_lens, h_seq_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 35 | DataType type_int = getTensorType(); 36 | TensorWrapper* padding_offset = new TensorWrapper(Device::GPU, type_int, {batch_size, max_q_len}, d_padding_offset); 37 | TensorWrapper* cum_seqlens = new TensorWrapper(Device::GPU, type_int, {batch_size + 1}, d_cum_seqlens); 38 | TensorWrapper* input_lengths = new TensorWrapper(Device::GPU, type_int, {batch_size}, d_seq_lens); 39 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 40 | launchCalPaddingoffset(padding_offset, 41 | cum_seqlens, 42 | input_lengths); 43 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 44 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 45 | cudaMemcpy(h_padding_offset, d_padding_offset, sizeof(int) * batch_size * max_q_len, cudaMemcpyDeviceToHost); 46 | cudaMemcpy(h_cum_seqlens, d_cum_seqlens, sizeof(int) * (batch_size + 1), cudaMemcpyDeviceToHost); 47 | // debug info, better to retain: std::cout << "cuda memcpy device to host" << std::endl; 48 | for(int i = 0; i < batch_size * max_q_len; i++) { 49 | printf("padding_offset = %d\n", h_padding_offset[i]); 50 | } 51 | for(int i = 0; i < batch_size + 1; i++){ 52 | printf("cum_seqlens =%d\n", h_cum_seqlens[i]); 53 | } 54 | //expected result is: 55 | // padding_offset: 
0,0,0,2,2,2,4,4,4,0.... shape = [batchsize, max_q_len] 56 | // cum_seqlens: 0,3,6,9. shape=[batchsize+1] 57 | // debug info, better to retain: std::cout << "before free" << std::endl; 58 | free(h_seq_lens); 59 | free(h_padding_offset); 60 | free(h_cum_seqlens); 61 | cudaFree(d_seq_lens); 62 | cudaFree(d_padding_offset); 63 | cudaFree(d_cum_seqlens); 64 | } 65 | -------------------------------------------------------------------------------- /src/layers/decoder/self_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/kernels/fused_decoder_self_attention.h" 3 | #include "src/kernels/fused_addresidual_norm.h" 4 | #include "src/kernels/rmsnorm_kernel.h" 5 | #include "src/kernels/add_residual.h" 6 | #include "src/layers/attention/masked_self_attention.h" 7 | #include "src/layers/ffn/ffn.h" 8 | #include "src/weights/llama/llama_weights.h" 9 | #include "src/utils/tensor.h" 10 | 11 | // layer weights is ready at the model_utils.h by loadweights in onellm.cpp, outside of the decoder 12 | template 13 | class LlamaSelfDecoder 14 | { 15 | private: 16 | int head_num; 17 | int kv_head_num; 18 | int head_size; 19 | int inter_size; 20 | int num_layer; 21 | int hidden_units; 22 | float rmsnorm_eps; 23 | 24 | cudaStream_t stream; 25 | cublasWrapper *cublas_wrapper; 26 | BaseAllocator *allocator; 27 | 28 | TensorWrapper *decoder_residual; 29 | 30 | LLaMASelfAttentionLayer *selfAttn; 31 | LLaMAFFNLayer *ffn; 32 | DataType data_type; 33 | 34 | public: 35 | LlamaSelfDecoder(int head_num, 36 | int kv_head_num, 37 | int head_size, 38 | int inter_size, 39 | int num_layer, 40 | const LLaMAAttentionStaticParams &attn_params, 41 | float rmsnorm_eps, 42 | cudaStream_t stream, 43 | cublasWrapper *cublas_wrapper, 44 | BaseAllocator *allocator) : head_num(head_num), 45 | head_size(head_size), 46 | inter_size(inter_size), 47 | hidden_units(head_num * head_size), 48 | num_layer(num_layer), 49 | rmsnorm_eps(rmsnorm_eps), 50 | data_type(getTensorType()), 51 | stream(stream), 52 | cublas_wrapper(cublas_wrapper), 53 | allocator(allocator) 54 | { 55 | selfAttn = new LLaMASelfAttentionLayer(head_num, 56 | kv_head_num, 57 | head_size, 58 | attn_params, 59 | stream, 60 | cublas_wrapper, 61 | allocator); 62 | 63 | ffn = new LLaMAFFNLayer(head_num, 64 | head_size, 65 | inter_size, 66 | stream, 67 | cublas_wrapper, 68 | allocator); 69 | }; 70 | void allocForForward(LLaMAAttentionDynParams &dyn_params); 71 | void freeBuf(); 72 | void forward(TensorMap &input_tensors, const std::vector *> &layerWeights, TensorMap &output_tensors, LLaMAAttentionDynParams &dyn_params); 73 | }; 74 | -------------------------------------------------------------------------------- /src/layers/decoder/context_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "src/kernels/build_casual_mask.h" 3 | #include "src/kernels/cal_paddingoffset.h" 4 | #include "src/kernels/fused_addresidual_norm.h" 5 | #include "src/kernels/add_residual.h" 6 | #include "src/kernels/rmsnorm_kernel.h" 7 | #include "src/layers/attention/context_attention.h" 8 | #include "src/layers/ffn/ffn.h" 9 | #include "src/weights/llama/llama_weights.h" 10 | #include "src/utils/tensor.h" 11 | 12 | // layer weights is ready at the model_utils.h 13 | template 14 | class LlamaContextDecoder 15 | { 16 | private: 17 | int head_num; 18 | int kv_head_num; 19 | int head_size; 20 | int inter_size; 21 | int num_layer; 22 | int hidden_units; 23 | float rmsnorm_eps; 
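// note: the members below are the per-forward intermediate buffers of the context (prefill) decoder,
// as the kernel headers included above suggest: attention_mask is the causal mask built by the
// build_casual_mask kernel, padding_offset and cum_seqlens are produced by cal_paddingoffset
// (the cumulative sequence lengths carry batch_size + 1 entries), and decoder_residual keeps the
// pre-rmsnorm hidden states that feed the later residual adds.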
24 | TensorWrapper *attention_mask; 25 | TensorWrapper *padding_offset; 26 | TensorWrapper *cum_seqlens; 27 | TensorWrapper *decoder_residual; 28 | cudaStream_t stream; 29 | cublasWrapper *cublas_wrapper; 30 | BaseAllocator *allocator; 31 | 32 | LLaMAContextAttentionLayer *ctxAttn; 33 | LLaMAFFNLayer *ffn; 34 | DataType data_type; 35 | 36 | public: 37 | LlamaContextDecoder(int head_num, 38 | int kv_head_num, 39 | int head_size, 40 | int inter_size, 41 | int num_layer, 42 | const LLaMAAttentionStaticParams &attn_params, 43 | float rmsnorm_eps, 44 | cudaStream_t stream, 45 | cublasWrapper *cublas_wrapper, 46 | BaseAllocator *allocator) : head_num(head_num), 47 | head_size(head_size), 48 | inter_size(inter_size), 49 | hidden_units(head_num * head_size), 50 | num_layer(num_layer), 51 | rmsnorm_eps(rmsnorm_eps), 52 | data_type(getTensorType()), 53 | stream(stream), 54 | cublas_wrapper(cublas_wrapper), 55 | allocator(allocator) 56 | { 57 | ctxAttn = new LLaMAContextAttentionLayer(head_num, 58 | kv_head_num, 59 | head_size, 60 | attn_params, 61 | stream, 62 | cublas_wrapper, 63 | allocator); 64 | 65 | ffn = new LLaMAFFNLayer(head_num, 66 | head_size, 67 | inter_size, 68 | stream, 69 | cublas_wrapper, 70 | allocator); 71 | }; 72 | void allocForForward(LLaMAAttentionDynParams &dyn_params); 73 | void freeBuf(); 74 | void forward(TensorMap &input_tensors, const std::vector *> &layerWeights, TensorMap &output_tensors, LLaMAAttentionDynParams &dyn_params); 75 | }; 76 | -------------------------------------------------------------------------------- /src/kernels/fused_transpose_and_remv_pad.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/cuda_debug_utils.cuh" 3 | #include "src/kernels/fused_transpose_and_remv_pad.h" 4 | // [bs,head nums,seqlen,head size]=>[bs,seqlen,head nums,head size]=>[num tokens,head nums,head size] 5 | // padding_offset.shape = [num_tokens] 6 | template 7 | __global__ void fused_transpose_reshape_remv_pad(T *src, 8 | T *dst, 9 | const int num_tokens, 10 | const int batch_size, 11 | const int seq_len, 12 | const int head_num, 13 | const int head_size, 14 | const int *padding_offset /*for remove padding*/) 15 | { 16 | int token_id = blockIdx.x; 17 | // map to input id 18 | int batch_id = (blockIdx.x + padding_offset[token_id]) / seq_len; 19 | int seq_id = (blockIdx.x + padding_offset[token_id]) % seq_len; 20 | // compute the offset of transpose and remove padding before or after 21 | int src_offset = batch_id * head_num * seq_len * head_size + seq_id * head_size; 22 | int dst_offset = token_id * head_num * head_size; 23 | 24 | for (int i = threadIdx.x; i < head_num * head_size; i += blockDim.x) 25 | { 26 | int head_id = i / head_size; 27 | int head_size_id = i % head_size; 28 | dst[dst_offset + i] = src[src_offset + head_id * seq_len * head_size + head_size_id]; 29 | } 30 | } 31 | template 32 | void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 33 | TensorWrapper *padding_offset, 34 | TensorWrapper *qkv_buf_wo_pad_1) 35 | { 36 | int batch_size = qkv_buf_w_pad->shape[0]; 37 | int head_num = qkv_buf_w_pad->shape[1]; 38 | int seq_len = qkv_buf_w_pad->shape[2]; 39 | int head_size = qkv_buf_w_pad->shape[3]; 40 | int num_tokens = qkv_buf_wo_pad_1->shape[0]; 41 | dim3 grid(num_tokens); 42 | dim3 block(std::min(head_num * head_size, 1024)); 43 | fused_transpose_reshape_remv_pad<<>>(qkv_buf_w_pad->data, 44 | qkv_buf_wo_pad_1->data, 45 | num_tokens, 46 | batch_size, 47 | seq_len, 48 | head_num, 49 | 
head_size, 50 | padding_offset->data); 51 | #ifdef PRINT_DATA 52 | print_data<<<1, 1>>>(qkv_buf_wo_pad_1->data); 53 | #else 54 | #endif 55 | } 56 | 57 | template void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 58 | TensorWrapper *padding_offset, 59 | TensorWrapper *qkv_buf_wo_pad_1); 60 | template void launchTransposeOutRemovePadding(TensorWrapper *qkv_buf_w_pad, 61 | TensorWrapper *padding_offset, 62 | TensorWrapper *qkv_buf_wo_pad_1); 63 | -------------------------------------------------------------------------------- /src/utils/model_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "src/models/basemodel.h" 6 | #include "src/models/llama/llama.h" 7 | #include "src/utils/macro.h" 8 | #include "src/memory/allocator/cuda_allocator.h" 9 | #include "src/models/llama/llama_params.h" 10 | // (RussWong) note: all LLM models are created in the header file, and I provided two ways, one is real weight model, the other is dummy weight model for functionality 11 | namespace llm { 12 | template 13 | BaseModel *CreateModelWithName(const std::string& model_name) { 14 | LLM_CHECK_WITH_INFO(model_name == "llama", "dont support other models except llama yet!"); 15 | int head_num = 32; 16 | int kv_head_num = 32; 17 | int head_size = 128; 18 | int inter_size = 11008; 19 | int num_layers = 32; 20 | int max_seq_len = 64; 21 | int vocab_size = 32000; 22 | int hidden_units = (head_num + 2 * kv_head_num) * head_size; 23 | int q_hidden_units = head_num * head_size; 24 | bool attn_bias = false; 25 | LLaMAAttentionStaticParams attn_static_params; 26 | attn_static_params.rotary_embedding_dim = 128; 27 | attn_static_params.rotary_embedding_base = 10000; 28 | attn_static_params.max_position_embeddings = 4096; 29 | attn_static_params.use_dynamic_ntk = false; // true is for dyn scaling rope 30 | cublasHandle_t cublas_handle; 31 | cublasLtHandle_t cublaslt_handle; 32 | cudaStream_t stream; 33 | cublasCreate(&cublas_handle); 34 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 35 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 36 | cublas_wrapper->setFP32GemmConfig(); 37 | BaseAllocator* allocator = new CudaAllocator; 38 | cudaDeviceProp deviceProp; 39 | cudaGetDeviceProperties(&deviceProp, 0); 40 | BaseModel *model = new Llama(head_num, 41 | kv_head_num, 42 | head_size, 43 | inter_size, 44 | num_layers, 45 | vocab_size, 46 | attn_static_params, 47 | max_seq_len, 48 | stream, 49 | cublas_wrapper, 50 | allocator, 51 | &deviceProp); 52 | return model; 53 | } 54 | 55 | template 56 | std::unique_ptr CreateDummyLLMModel(std::string tokenizer_file){ 57 | BaseModel *model = CreateModelWithName("llama"); 58 | model->loadTokenizer(tokenizer_file); 59 | model->loadWeightsFromDummy(); 60 | return std::unique_ptr (model); 61 | } 62 | 63 | template 64 | std::unique_ptr CreateRealLLMModel(std::string model_dir, std::string tokenizer_file){ 65 | BaseModel *model = CreateModelWithName("llama"); 66 | std::cout << "start creating model..." << "\n"; 67 | model->loadTokenizer(tokenizer_file); 68 | model->loadWeights(model_dir); 69 | std::cout << "finish creating model..." 
<< "\n"; 70 | return std::unique_ptr (model); 71 | } 72 | } // namespace llm 73 | 74 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(oneLLM LANGUAGES CXX CUDA) 3 | 4 | find_package(CUDA 10.0 REQUIRED) 5 | 6 | set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) 7 | 8 | 9 | list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) 10 | find_package(CUDA REQUIRED) 11 | 12 | # setting compiler flags 13 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 15 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall") 16 | 17 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ 18 | -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ 19 | -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ 20 | -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ 21 | -gencode=arch=compute_86,code=\\\"sm_86,compute_86\\\" \ 22 | ") 23 | # -rdc=true") # not sure the effect of this option, retain it temply 24 | 25 | set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) 26 | message("-- Assign GPU architecture (sm=70 75 80 86)") 27 | 28 | set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") 29 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") 30 | set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") 31 | 32 | message(STATUS "CMAKE_CXX_FLAGS" ${CMAKE_CXX_FLAGS}) 33 | 34 | set(CMAKE_CXX_STANDARD 11) 35 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 36 | 37 | if(CMAKE_CXX_STANDARD STREQUAL "11") 38 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") 39 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") 40 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") 41 | endif() 42 | 43 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") 44 | set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3") 45 | 46 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 47 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 48 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 49 | 50 | set(COMMON_HEADER_DIRS 51 | ${PROJECT_SOURCE_DIR} 52 | ${CUDA_PATH}/include 53 | ) 54 | 55 | set(COMMON_LIB_DIRS 56 | ${CUDA_PATH}/lib64 57 | ) 58 | 59 | include_directories( 60 | ${COMMON_HEADER_DIRS} 61 | ) 62 | 63 | link_directories( 64 | ${COMMON_LIB_DIRS} 65 | ) 66 | option (PERF 67 | "measure model inference performance" 68 | OFF 69 | ) 70 | option (PRINT_DATA 71 | "print kernel output to debug" 72 | OFF 73 | ) 74 | option (SAVE_DATA 75 | "save kernel output to debug" 76 | OFF 77 | ) 78 | if (PERF) 79 | add_compile_options(-DPERF) 80 | endif() 81 | if (PRINT_DATA) 82 | add_compile_options(-DPRINT_DATA) 83 | endif() 84 | if (SAVE_DATA) 85 | add_compile_options(-DSAVE_DATA) 86 | endif() 87 | #cmake .. -DPRINT_DATA=ON && make 88 | #cmake .. -DPRINT_DATA=ON -DSAVE_DATA=ON && make 89 | #cmake .. -DPERF=ON && make 90 | #cmake .. 
&& make 91 | file(GLOB_RECURSE LLM_CXX_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cc) 92 | file(GLOB_RECURSE LLM_CUDA_SOURCES ${PROJECT_SOURCE_DIR}/src/*.cu) 93 | 94 | add_library(llmengine OBJECT 95 | ${LLM_CXX_SOURCES} 96 | ${LLM_CUDA_SOURCES} 97 | ) 98 | 99 | add_subdirectory(src) 100 | add_subdirectory(tests) 101 | # add_subdirectory(examples) 102 | 103 | add_executable(main user_entry.cpp) 104 | target_link_libraries(main PUBLIC -lcublas -lcudart -lcudadevrt llmengine) 105 | -------------------------------------------------------------------------------- /src/utils/debug_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "src/utils/tensor.h" 7 | #include "src/weights/base_weights.h" 8 | #include "src/utils/macro.h" 9 | // (RussWong)note: overloaded 3 different function for saving intermediate output tensor to debug 10 | // because LLMs have many layers, so I provide some overloaded function to specify layer id to print specify layer output tensor to debug 11 | // after you save tensor into specified file ,you can turn to tests/unitests/test_data_compare.cu to specify file path to compare res with HF. 12 | template 13 | void save_tensor(TensorWrapper* input, std::string filename){ 14 | int Bm = 0; 15 | int Bk = 0; 16 | if (input->shape.size() == 4){ 17 | Bm = input->shape[0] * input->shape[1]; 18 | Bk = input->shape[3] * input->shape[2]; 19 | } else if (input->shape.size() == 3){ 20 | Bm = input->shape[0]; 21 | Bk = input->shape[1] * input->shape[2]; 22 | } else if (input->shape.size() == 2){ 23 | Bm = input->shape[0]; 24 | Bk = input->shape[1]; 25 | } 26 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 27 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 28 | std::ofstream F; 29 | std::cout << "saving intermediate tensor in " << filename << "\n"; 30 | F.open("/home/data/"+ filename, std::ofstream::binary); 31 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 32 | F.close(); 33 | } 34 | 35 | template 36 | void save_tensor(TensorWrapper* input, std::string filename, TensorWrapper* layer_id){ 37 | int id = layer_id->getVal(); 38 | if (id > 2) { 39 | return; 40 | } 41 | int Bm = 0; 42 | int Bk = 0; 43 | if (input->shape.size() == 4){ 44 | Bm = input->shape[0] * input->shape[1]; 45 | Bk = input->shape[3] * input->shape[2]; 46 | } else if (input->shape.size() == 3){ 47 | Bm = input->shape[0]; 48 | Bk = input->shape[1] * input->shape[2]; 49 | } else if (input->shape.size() == 2){ 50 | Bm = input->shape[0]; 51 | Bk = input->shape[1]; 52 | } 53 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 54 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 55 | std::ofstream F; 56 | std::cout << "saving intermediate tensor in " << filename << "\n"; 57 | F.open("/home/data/" + std::to_string(id) + "_" + filename, std::ofstream::binary); 58 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 59 | F.close(); 60 | } 61 | 62 | template 63 | void save_tensor(TensorWrapper* input, std::string filename, int layer_id){ 64 | int id = layer_id; 65 | if (id > 2) { 66 | return; 67 | } 68 | int Bm = 0; 69 | int Bk = 0; 70 | if (input->shape.size() == 4){ 71 | Bm = input->shape[0] * input->shape[1]; 72 | Bk = input->shape[3] * input->shape[2]; 73 | } else if (input->shape.size() == 3){ 74 | Bm = input->shape[0]; 75 | Bk = input->shape[1] * input->shape[2]; 76 | } else if (input->shape.size() == 2){ 77 | Bm = input->shape[0]; 78 
| Bk = input->shape[1]; 79 | } 80 | T* icpu = (T*)malloc(sizeof(T) * Bm * Bk); 81 | cudaMemcpy(icpu, input->data, sizeof(T) * Bm * Bk, cudaMemcpyDeviceToHost); 82 | std::ofstream F; 83 | std::cout << "saving intermediate tensor in " << filename << "\n"; 84 | F.open("/home/data/" + std::to_string(id) + "_" + filename, std::ofstream::binary); 85 | F.write(reinterpret_cast(icpu), sizeof(T)*Bm*Bk); 86 | F.close(); 87 | } 88 | -------------------------------------------------------------------------------- /tests/unittests/test_act.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include "src/kernels/act_kernel.h" 8 | // (RussWong)note: not sure CPU implementation is absolutely right and the GPU kernel is right compared with HF. 9 | // when you are implementing LLMs inference on CPU, you can reuse the CPU kernel and test its correctness 10 | // (RussWong)note: 11 | // `./test_act 1` to test half GPU kernel 12 | // `./test_act` to test fp32 GPU kernel 13 | template 14 | void CPUSwiGLU(T* input, T* output, int batch_size, int intermedia_size){ 15 | float silu_out = 0.0f; 16 | for(int batch_id = 0; batch_id < batch_size; batch_id++){ 17 | for(int i = 0; i < intermedia_size; i++) { 18 | int offset1 = batch_id * 2 * intermedia_size + i; 19 | int offset2 = batch_id * 2 * intermedia_size + i + intermedia_size; 20 | int out_offset = batch_id * intermedia_size + i; 21 | silu_out = (float)input[offset1] / (1.0f + expf(-1 * (float)input[offset1])); 22 | output[out_offset] = static_cast(silu_out * (float)input[offset2]); 23 | } 24 | } 25 | } 26 | template 27 | bool CheckResult(T* CPUoutput, T* GPUoutput, int output_size) { 28 | for(int i = 0; i < output_size; i++) { 29 | if(fabs((float)CPUoutput[i] - (float)GPUoutput[i]) > 1e-6){ 30 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, (float)CPUoutput[i], (float)GPUoutput[i]); 31 | } 32 | } 33 | return true; 34 | } 35 | 36 | template 37 | void test_act(int batch_size, int intermedia_size, int input_size , int output_size) { 38 | T* h_input; 39 | T* d_input; 40 | h_input = (T*)malloc(sizeof(T) * input_size); 41 | cudaMalloc((void**)&d_input, sizeof(T) * input_size); 42 | T* h_output; 43 | T* d_output; 44 | h_output = (T*)malloc(sizeof(T) * output_size); 45 | cudaMalloc((void**)&d_output, sizeof(T) * output_size); 46 | for(int i = 0; i < input_size; i++) { // initialize host data 47 | h_input[i] = (T)1; 48 | } 49 | cudaMemcpy(d_input, h_input, sizeof(T) * input_size, cudaMemcpyHostToDevice); 50 | DataType type = getTensorType(); 51 | TensorWrapper* input_tensor = new TensorWrapper(GPU, type, {batch_size, 2, intermedia_size}, d_input); 52 | TensorWrapper* output_tensor = new TensorWrapper(GPU, type, {batch_size, intermedia_size}, d_output); 53 | launchAct(input_tensor, output_tensor); 54 | cudaMemcpy(h_output, d_output, sizeof(T) * output_size, cudaMemcpyDeviceToHost); 55 | T* CPU_output = (T*)malloc(sizeof(T) * output_size); 56 | CPUSwiGLU(h_input, CPU_output, batch_size, intermedia_size); 57 | bool is_true = CheckResult(CPU_output, h_output, output_size); 58 | if(is_true){ 59 | printf("test passed"); 60 | } else { 61 | printf("test failed"); 62 | } 63 | 64 | free(h_input); 65 | free(h_output); 66 | free(CPU_output); 67 | cudaFree(d_input); 68 | cudaFree(d_output); 69 | } 70 | 71 | int main(int argc, char** argv) { 72 | constexpr int batch_size = 
16; 73 | constexpr int intermedia_size = 11008; 74 | constexpr int input_size = batch_size * intermedia_size * 2; 75 | constexpr int output_size = batch_size * intermedia_size; 76 | if (argv[1]){ 77 | test_act(batch_size, intermedia_size, input_size, output_size); 78 | } else { 79 | test_act(batch_size, intermedia_size, input_size, output_size); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /src/kernels/sampling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/kernels/sampling.h" 3 | // mini-softmax + curand_sample 4 | // input: [bs, K] from topK output 5 | // output: [bs] 6 | // (Russwong)note: beamsearch不存在sampling,所以bsxbm = bs 7 | template 8 | __global__ void SamplingKernel(int* topk_id, 9 | T* topk_val, //[bs, K] from topK 10 | int* output_id, //[bs] 11 | int* seqlen, //cumulated seq len,[bs] 12 | bool* is_finished, //[bs] 13 | int K, 14 | int rand_num, // step 15 | int end_id, // when initialize llama model, we will init it, and this is a fixed val 16 | int vocab_size) 17 | { 18 | int batch_id = blockIdx.x; 19 | int bid = batch_id; 20 | int tid = threadIdx.x; 21 | int offset = batch_id * K + tid; 22 | T max_val = topk_val[batch_id * K]; // max val is the top of the buffer, because topK 23 | topk_val[offset] = (T)(expf((float)topk_val[offset] - (float)max_val)); 24 | __shared__ float thredhold, sum; 25 | if(tid == 0) { 26 | sum = 0.0f; 27 | for(int i = 0; i < K; i++) { 28 | sum += (float)topk_val[batch_id * K + i]; 29 | } 30 | curandState_t state; 31 | // (Russwong)note: curand_init API only support ulonglong data type 32 | curand_init((unsigned long long)rand_num,(unsigned long long)bid, (unsigned long long)0, &state); 33 | thredhold = (float)curand_uniform(&state) * sum; // for a block 34 | output_id[bid] = topk_id[bid * K] % vocab_size; 35 | for(int i = 0; i < K; i++) { 36 | thredhold = thredhold - (float)topk_val[batch_id * K + i]; 37 | if(thredhold < 0) { 38 | output_id[bid] = topk_id[batch_id * K + i] % vocab_size; 39 | break; 40 | } 41 | } 42 | seqlen[bid] = is_finished[bid] ? seqlen[bid] : seqlen[bid] + 1; 43 | is_finished[bid] = output_id[bid] == end_id ? 
1 : 0; 44 | } 45 | } 46 | 47 | template 48 | void launchSampling(TensorWrapper* topk_id, 49 | TensorWrapper* topk_val, 50 | TensorWrapper* seqlen, 51 | TensorWrapper* is_finished, 52 | TensorWrapper* output_id, 53 | IntDict& params) { 54 | int batch_size = topk_id->shape[0]; 55 | int K = topk_id->shape[1]; 56 | int vocab_size = params["vocab_size"]; 57 | int step = params["step"]; 58 | int end_id = params["end_id"]; 59 | 60 | dim3 grid(batch_size); 61 | dim3 block(K); // K is small, so directly allocate K threads is enough 62 | SamplingKernel<<>>( 63 | topk_id->data, 64 | topk_val->data, 65 | output_id->data, 66 | seqlen->data, 67 | is_finished->data, 68 | K, 69 | step, 70 | end_id, 71 | vocab_size 72 | ); 73 | } 74 | 75 | template void launchSampling(TensorWrapper* topk_id, 76 | TensorWrapper* topk_val, 77 | TensorWrapper* seqlen, 78 | TensorWrapper* is_finished, 79 | TensorWrapper* output_id, 80 | IntDict& params); 81 | 82 | template void launchSampling(TensorWrapper* topk_id, 83 | TensorWrapper* topk_val, 84 | TensorWrapper* seqlen, 85 | TensorWrapper* is_finished, 86 | TensorWrapper* output_id, 87 | IntDict& params); 88 | -------------------------------------------------------------------------------- /tests/unittests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(embedding 2 | test_input_embedding.cu 3 | ) 4 | target_link_libraries( 5 | embedding PUBLIC 6 | -lcudart 7 | -lcudadevrt 8 | embeddingFunctor) 9 | 10 | add_executable(rms_norm 11 | test_rmsnorm.cu 12 | ) 13 | target_link_libraries( 14 | rms_norm PUBLIC 15 | -lcudart 16 | -lcudadevrt 17 | rmsnorm) 18 | 19 | add_executable(paddingoffset 20 | test_cal_paddingoffset.cu 21 | ) 22 | target_link_libraries( 23 | paddingoffset PUBLIC 24 | -lcudart 25 | -lcudadevrt 26 | cal_paddingoffset) 27 | 28 | add_executable(causalmask 29 | test_casual_mask.cu 30 | ) 31 | target_link_libraries( # Libs for test_build_casual_mask 32 | causalmask PUBLIC 33 | -lcudart 34 | -lcudadevrt 35 | build_casual_mask) 36 | 37 | add_executable(testlinear 38 | test_linear.cu 39 | ) 40 | target_link_libraries( # Libs for test_build_casual_mask 41 | testlinear PUBLIC 42 | -lcudart 43 | -lcudadevrt 44 | linear) 45 | 46 | add_executable(debug 47 | test_data_compare.cu 48 | ) 49 | target_link_libraries( # Libs for test_build_casual_mask 50 | debug PUBLIC 51 | -lcudart 52 | -lcudadevrt) 53 | 54 | add_executable(bmm 55 | test_bmm.cu 56 | ) 57 | target_link_libraries( # Libs for test_build_casual_mask 58 | bmm PUBLIC 59 | -lcudart 60 | -lcudadevrt 61 | linear) 62 | 63 | add_executable(biasRope 64 | test_bias_and_RoPE.cu 65 | ) 66 | target_link_libraries( # Libs for test_qkv_bias_and_rope 67 | biasRope PUBLIC 68 | -lcudart 69 | -lcudadevrt 70 | qkv_bias_and_rope) 71 | 72 | add_executable(test_concat_kv 73 | test_concat_kv.cu 74 | ) 75 | target_link_libraries( # Libs for test_qkv_bias_and_rope 76 | test_concat_kv PUBLIC 77 | -lcudart 78 | -lcudadevrt 79 | concat_kv) 80 | 81 | add_executable(test_repeat_kv 82 | test_repeat_kv.cu 83 | ) 84 | target_link_libraries( 85 | test_repeat_kv PUBLIC 86 | -lcudart 87 | -lcudadevrt 88 | repeat_kv) 89 | 90 | add_executable(test_mask_softmax 91 | test_mask_softmax.cu 92 | ) 93 | target_link_libraries( 94 | test_mask_softmax PUBLIC 95 | -lcudart 96 | -lcudadevrt 97 | mask_softmax) 98 | 99 | add_executable(test_fused_trans_remv_pad 100 | test_fused_trans_remv_pad.cu 101 | ) 102 | target_link_libraries( 103 | test_fused_trans_remv_pad PUBLIC 104 | -lcudart 105 | 
-lcudadevrt 106 | fused_transpose_and_remv_pad) 107 | 108 | add_executable(test_fused_addresidual_norm 109 | test_fused_addresidual_norm.cu 110 | ) 111 | target_link_libraries( 112 | test_fused_addresidual_norm PUBLIC 113 | -lcudart 114 | -lcudadevrt 115 | fused_addresidual_norm) 116 | 117 | add_executable(test_act 118 | test_act.cu 119 | ) 120 | target_link_libraries( 121 | test_act PUBLIC 122 | -lcudart 123 | -lcudadevrt 124 | act) 125 | 126 | add_executable(test_topk 127 | test_topk.cu 128 | ) 129 | target_link_libraries( 130 | test_topk PUBLIC 131 | -lcudart 132 | -lcudadevrt 133 | topk) 134 | 135 | add_executable(test_fused_decoder_attention 136 | test_fused_decoder_attention.cu 137 | ) 138 | target_link_libraries( 139 | test_fused_decoder_attention PUBLIC 140 | -lcudart 141 | -lcudadevrt 142 | fused_decoder_self_attention) 143 | 144 | add_executable(test_sampling 145 | test_sampling.cu 146 | ) 147 | target_link_libraries( 148 | test_sampling PUBLIC 149 | -lcudart 150 | -lcudadevrt 151 | sampling) 152 | 153 | add_executable(test_residual 154 | test_residual.cu 155 | ) 156 | target_link_libraries( 157 | test_residual PUBLIC 158 | -lcudart 159 | -lcudadevrt 160 | add_residual) 161 | -------------------------------------------------------------------------------- /src/layers/ffn/ffn.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/layers/ffn/ffn.h" 3 | #include "src/utils/debug_utils.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | LLaMAFFNLayer::LLaMAFFNLayer(int head_num, 7 | int head_size, 8 | int inter_size, 9 | cudaStream_t stream, 10 | cublasWrapper* cublas_wrapper, 11 | BaseAllocator* allocator): 12 | head_num(head_num), 13 | head_size(head_size), 14 | inter_size(inter_size), 15 | stream(stream), 16 | cublas_wrapper(cublas_wrapper), 17 | allocator(allocator), 18 | hidden_units(head_num * head_size) {} 19 | 20 | template 21 | void LLaMAFFNLayer::allocForForward(LLaMAAttentionDynParams& params){ 22 | int num_tokens = params.num_tokens; 23 | DataType type = getTensorType(); 24 | SwiGLU_input = new TensorWrapper(Device::GPU, type, {num_tokens, 2, inter_size}); 25 | down_proj_input = new TensorWrapper(Device::GPU, type, {num_tokens, inter_size}); 26 | SwiGLU_input->data = allocator->Malloc(SwiGLU_input->data, sizeof(T) * num_tokens * 2 * inter_size, false); 27 | down_proj_input->data = allocator->Malloc(down_proj_input->data, sizeof(T) * num_tokens * inter_size, false); 28 | } 29 | template 30 | void LLaMAFFNLayer::allocForForward(int batch_size){ 31 | DataType type = getTensorType(); 32 | SwiGLU_input = new TensorWrapper(Device::GPU, type, {batch_size, 2, inter_size}); 33 | down_proj_input = new TensorWrapper(Device::GPU, type, {batch_size, inter_size}); 34 | SwiGLU_input->data = allocator->Malloc(SwiGLU_input->data, sizeof(T) * batch_size * 2 * inter_size, false); 35 | down_proj_input->data = allocator->Malloc(down_proj_input->data, sizeof(T) * batch_size * inter_size, false); 36 | } 37 | template 38 | void LLaMAFFNLayer::freeBuf(){ 39 | allocator->Free(SwiGLU_input->data); 40 | DeviceSyncAndCheckCudaError(); 41 | allocator->Free(down_proj_input->data); 42 | DeviceSyncAndCheckCudaError(); 43 | } 44 | template 45 | void LLaMAFFNLayer::forward(TensorMap& inputs, TensorMap& outputs, LLaMAFFNWeights& weights, LLaMAAttentionDynParams& params){ 46 | if (params.num_tokens > 0) { 47 | allocForForward(params); 48 | } else { 49 | 
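// note: this branch is taken in the self-decoder path (num_tokens is not set, one generated token
// per request per step), so the SwiGLU/down-proj intermediate buffers are sized by batch_size via
// the allocForForward(int batch_size) overload above, rather than by num_tokens as in prefill.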
allocForForward(params.batch_size); 50 | } 51 | Tensor* ffn_input = inputs["ffn_input"]; 52 | Tensor* ffn_output = outputs["ffn_output"]; 53 | count += 1; 54 | bool is_ctx = params.is_ctx; 55 | #ifdef SAVE_DATA 56 | save_tensor(ffn_input->as(), "ffn_input.bin", count); 57 | #else 58 | #endif 59 | // 1.fusedGateUp proj 60 | launchLinearGemm(ffn_input->as(), weights.gateAndup, SwiGLU_input, cublas_wrapper, false, true); 61 | DeviceSyncAndCheckCudaError(); 62 | // single up proj linear, deprecated due to fuse gate and up into fusedGateAndup 63 | // launchLinearGemm(ffn_input->as(), weights.up, SwiGLU_input, cublas_wrapper, false, false, true); 64 | #ifdef SAVE_DATA 65 | save_tensor(SwiGLU_input ,"swiglu_input.bin", count); 66 | #else 67 | #endif 68 | // 2.swiGLU 69 | launchAct(SwiGLU_input, down_proj_input);// down_proj_input maybe can reuse swiglu_input buf, will validate it later 70 | DeviceSyncAndCheckCudaError(); 71 | #ifdef SAVE_DATA 72 | save_tensor(down_proj_input ,"down_proj_input.bin", count); 73 | #else 74 | #endif 75 | // 3.down proj 76 | launchLinearGemm(down_proj_input, weights.down, ffn_output->as(), cublas_wrapper, false, true); 77 | DeviceSyncAndCheckCudaError(); 78 | this->freeBuf(); 79 | }; 80 | 81 | template class LLaMAFFNLayer; 82 | template class LLaMAFFNLayer; 83 | -------------------------------------------------------------------------------- /tests/unittests/test_repeat_kv.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/repeat_kv.h" 10 | // (RussWong)note: 11 | // there is no repeat kv cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes 13 | // `./test_repeat_kv` to test fp32 GPU kernel 14 | int main() { 15 | const int batch_size = 1; 16 | const int head_num = 2; 17 | const int kv_head_num = 2; 18 | const int max_seq_len = 4; 19 | const int max_k_len = 2; 20 | const int head_size = 2; 21 | const int num_layers = 2; 22 | const int k_size = num_layers * batch_size * kv_head_num * max_seq_len * head_size; 23 | const int out_k_size = batch_size * head_num * max_k_len * head_size; 24 | float* h_k; 25 | float* d_k; 26 | h_k = (float*)malloc(sizeof(float) * k_size); 27 | cudaMalloc((void**)&d_k, sizeof(float) * k_size); 28 | float* h_v; 29 | float* d_v; 30 | h_v = (float*)malloc(sizeof(float) * k_size); 31 | cudaMalloc((void**)&d_v, sizeof(float) * k_size); 32 | int* h_ctx_len; 33 | int* d_ctx_len; 34 | h_ctx_len = (int*)malloc(sizeof(int) * batch_size); 35 | cudaMalloc((void**)&d_ctx_len, sizeof(int) * batch_size); 36 | float* h_trans_k; 37 | float* d_trans_k; 38 | h_trans_k = (float*)malloc(sizeof(float) * out_k_size); 39 | cudaMalloc((void**)&d_trans_k, sizeof(float) * out_k_size); 40 | float* h_trans_v; 41 | float* d_trans_v; 42 | h_trans_v = (float*)malloc(sizeof(float) * out_k_size); 43 | cudaMalloc((void**)&d_trans_v, sizeof(float) * out_k_size); 44 | 45 | for(int i = 0; i < k_size; i++) { 46 | h_v[i] = i; 47 | h_k[i] = i; 48 | } 49 | int* h_layer_id = (int*)malloc(sizeof(int)*batch_size); 50 | 51 | for(int i = 0; i < batch_size; i++) { 52 | h_ctx_len[i] = 2; 53 | h_layer_id[i] = 0; 54 | } 55 | 56 | cudaMemcpy(d_k, h_k, sizeof(float) * k_size, cudaMemcpyHostToDevice); 57 | cudaMemcpy(d_v, h_v, sizeof(float) * k_size, cudaMemcpyHostToDevice); 58 | cudaMemcpy(d_ctx_len, h_ctx_len, sizeof(int) * batch_size, 
cudaMemcpyHostToDevice); 59 | DataType type = getTensorType(); 60 | DataType type_int = getTensorType(); 61 | TensorWrapper* in_k = new TensorWrapper(Device::GPU, type, {num_layers, batch_size, kv_head_num, max_seq_len, head_size}, d_k); 62 | TensorWrapper* in_v = new TensorWrapper(Device::GPU, type, {num_layers, batch_size, kv_head_num, max_seq_len, head_size}, d_v); 63 | TensorWrapper* ctx_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, d_ctx_len); 64 | TensorWrapper* out_k = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_k_len, head_size}, d_trans_k); 65 | TensorWrapper* out_v = new TensorWrapper(Device::GPU, type, {batch_size, head_num, max_k_len, head_size}, d_trans_v); 66 | TensorWrapper* layer_id = new TensorWrapper(Device::CPU, type_int, {batch_size}, h_layer_id); 67 | 68 | std::cout << "before launch repeat kv kernel" << std::endl; 69 | launchRepeatKVCache(in_k, in_v, ctx_len, layer_id, out_k, out_v); 70 | std::cout << "after launch repeat kv kernel" << std::endl; 71 | std::cout << "cuda memcpy device to host" << std::endl; 72 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 73 | cudaMemcpy(h_trans_k, out_k->data, sizeof(float) * out_k_size, cudaMemcpyDeviceToHost); 74 | for(int i = 0; i < out_k_size; i++) { 75 | printf("k trans[%d] = %f\n", i, h_trans_k[i]); 76 | } 77 | // debug info, better to retain: std::cout << "before free" << std::endl; 78 | free(h_k); 79 | free(h_v); 80 | free(h_ctx_len); 81 | free(h_trans_k); 82 | free(h_trans_v); 83 | free(h_layer_id); 84 | cudaFree(d_k); 85 | cudaFree(d_v); 86 | cudaFree(d_ctx_len); 87 | cudaFree(d_trans_k); 88 | cudaFree(d_trans_v); 89 | } 90 | -------------------------------------------------------------------------------- /src/layers/attention/masked_self_attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/debug_utils.h" 3 | #include "src/layers/attention/masked_self_attention.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | LLaMASelfAttentionLayer::LLaMASelfAttentionLayer( 7 | int head_num, 8 | int kv_head_num, 9 | int head_size, 10 | LLaMAAttentionStaticParams attn_params, 11 | cudaStream_t stream, 12 | cublasWrapper* cublas_wrapper, 13 | BaseAllocator* allocator): 14 | head_num(head_num), 15 | kv_head_num(kv_head_num), 16 | head_size(head_size), 17 | stream(stream), 18 | cublas_wrapper(cublas_wrapper), 19 | allocator(allocator), 20 | hidden_units(head_num * head_size), 21 | attn_static_params(attn_params), 22 | // TODO: check kv_head_num is divided by haed_num 23 | q_head_per_kv(head_num / kv_head_num), 24 | scale(float(1 / sqrt(head_size))){} 25 | 26 | template 27 | void LLaMASelfAttentionLayer::allocForForward(LLaMAAttentionDynParams& params) { 28 | int batch_size = params.batch_size; 29 | int num_tokens = params.num_tokens; 30 | int max_q_len = params.max_q_len; 31 | int max_k_len = params.max_k_len; 32 | DataType type = getTensorType(); 33 | const int qkv_head_num = head_num + 2 * kv_head_num; 34 | // (RussWong) note: 当前step的q k v的shape里面step或seqlen都是1,之前step的kv在做gemv的时候直接从kv cache拿 35 | qkv_buf = new TensorWrapper(Device::GPU, type, {batch_size, qkv_head_num, head_size}); 36 | mha_output = new TensorWrapper(Device::GPU, type, {batch_size, hidden_units}); 37 | 38 | qkv_buf->data = allocator->Malloc(qkv_buf->data, sizeof(T) * batch_size * qkv_head_num * 
head_size, false); 39 | mha_output->data = allocator->Malloc( 40 | mha_output->data, sizeof(T) * batch_size * hidden_units, false); 41 | } 42 | template 43 | void LLaMASelfAttentionLayer::freeBuf(){ 44 | allocator->Free(qkv_buf->data); 45 | DeviceSyncAndCheckCudaError(); 46 | allocator->Free(mha_output->data); 47 | DeviceSyncAndCheckCudaError(); 48 | } 49 | // (RussWong) note: params order of launcher function in LaMAContextAttentionLayer::forward: (input[Tensor], input[Tensor],...,weight[Weight], output[*]) 50 | template 51 | void LLaMASelfAttentionLayer::forward(TensorMap& inputs, TensorMap& outputs, LLaMAattentionWeights& weights, LLaMAAttentionDynParams& params) 52 | { 53 | // (RussWong) note: allocate intermediate buf of the layer forward 54 | allocForForward(params); 55 | //1. qkv linear 56 | //shape:[bs,1,q_hidden_units] * [q_hidden_units, hidden_units] = [bs,1,hidden_units] 57 | Tensor* attention_input = inputs["attention_input"]; 58 | launchLinearGemm(attention_input->as(), weights.qkv, qkv_buf, cublas_wrapper, false, true); 59 | DeviceSyncAndCheckCudaError(); 60 | //2. biasRope 61 | Tensor* attention_output = outputs["attention_output"]; 62 | // kv cache shape = [bs, kv head num, max seq len head size] 63 | Tensor* key_cache = outputs["all_k_cache"]; 64 | Tensor* value_cache = outputs["all_v_cache"]; 65 | Tensor* finished = inputs["finished"]; 66 | Tensor* step = inputs["step"];//[1] onCPU 67 | Tensor* layer_id = inputs["layer_id"];//[1] onCPU 68 | launchRoPE(qkv_buf, step->as(), attn_static_params); 69 | DeviceSyncAndCheckCudaError(); 70 | // 3. fused masked mha 71 | launchDecoderMaskedMHA(qkv_buf, weights.qkv, layer_id->as(), key_cache->as(), value_cache->as(), finished->as(), step->as(), mha_output, attn_static_params); 72 | DeviceSyncAndCheckCudaError(); 73 | #ifdef SAVE_DATA 74 | save_tensor(mha_output ,"self_decoder_qk_v_after_bmm.bin", layer_id->as()); 75 | #else 76 | #endif 77 | // 4. attention output linear 78 | launchLinearGemm(mha_output, weights.output, attention_output->as(), cublas_wrapper, false, true); 79 | DeviceSyncAndCheckCudaError(); 80 | #ifdef SAVE_DATA 81 | save_tensor(mha_output ,"self_decoder_outlinear_out.bin", layer_id->as()); 82 | #else 83 | #endif 84 | this->freeBuf(); 85 | } 86 | 87 | template class LLaMASelfAttentionLayer; 88 | template class LLaMASelfAttentionLayer; 89 | -------------------------------------------------------------------------------- /tests/unittests/test_data_compare.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include 9 | #include "src/utils/macro.h" 10 | #include "src/utils/debug_utils.h" 11 | // (RussWong)note: 12 | // this test is for debug, to compare intermediate tensor and HF intermediate tensor 13 | // and the intermediate tensor will be saved in file when you compile the proj by `cmake .. 
-DSAVE_DATA=ON && make -j8` 14 | // before run, you should change the path to your local right dir 15 | // `./debug` to compare 16 | 17 | std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) 18 | { 19 | size_t dim0 = 1, dim1 = 1; 20 | if (shape.size() > 2) { 21 | dim0 = shape[0] * shape[1]; 22 | dim1 = shape[2]; 23 | } 24 | 25 | if (shape.size() == 2) { 26 | dim0 = shape[0]; 27 | dim1 = shape[1]; 28 | } 29 | size_t size = dim0 * dim1; 30 | if (size == 0) { 31 | std::cout << "shape is zero, skip loading weight from file: " << filename << std::endl; 32 | return std::vector(); 33 | } 34 | 35 | std::vector host_array(size); 36 | std::ifstream in(filename, std::ios::in | std::ios::binary); 37 | if (!in.is_open()) { 38 | std::cout << "file" << filename << "cannot be opened, loading model fails!" << std::endl; 39 | return std::vector(); 40 | } 41 | 42 | size_t loaded_data_size = sizeof(float) * size; 43 | in.seekg(0, in.end); 44 | in.seekg(0, in.beg); 45 | 46 | std::cout << "Read " << std::to_string(loaded_data_size) << " bytes from " << filename << std::endl; 47 | in.read((char*)host_array.data(), loaded_data_size); 48 | 49 | size_t in_get_size = in.gcount(); 50 | if (in_get_size != loaded_data_size) { 51 | return std::vector(); 52 | } 53 | in.close(); 54 | // If we succeed, return an array with values. 55 | return host_array; 56 | } 57 | void internalFunc(float* ptr, std::vector shape, std::string filename) { 58 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 59 | if (host_array.empty()) { 60 | std::cout << "[warning] data from file is empty!!" << "\n"; 61 | return; 62 | } 63 | // copy host_array to our defined ptr 64 | memcpy(ptr, host_array.data(), host_array.size()); 65 | return; 66 | } 67 | void loadWeights(float* ptr1, std::string weight_path, int shape0, int shape1) // weighttype参数比较多余 68 | { 69 | // load attn output 70 | internalFunc(ptr1, {(size_t)shape0, (size_t)shape1}, weight_path); 71 | 72 | } 73 | void loadWeights_trans(float* ptr1, std::string weight_path, int shape0, int shape1) // weighttype参数比较多余 74 | { 75 | // load attn output 76 | internalFunc(ptr1, {(size_t)shape0, (size_t)shape1}, weight_path); 77 | 78 | } 79 | 80 | bool CheckResult(float* CPUoutput, float* GPUoutput, int in_size) { 81 | for(int i = 0; i < in_size; i++) { 82 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 83 | printf("the %dth res is wrong, onellm = %f, trans = %f\n", i, CPUoutput[i], GPUoutput[i]); 84 | } 85 | } 86 | return true; 87 | } 88 | // 1.for example: the path of two data files is below, and you should replace L101&L102 with the two 89 | // /home/data/trans/q_buf_after_rope_trans.bin 90 | // /home/data/onellm/q_buf_after_rope.bin 91 | // 2.And you should change the L93&L94 to the right data size according to your tensor shape of the data file 92 | int main(int argc, char *argv[]) { 93 | int shape0 = 1; // TO MODIFY before run 94 | int shape1 = 4096; // TO MODIFY before run 95 | 96 | int in_size = shape0 * shape1; 97 | 98 | float* d_in = (float*) malloc(sizeof(float) * in_size); 99 | float* d_in_trans = (float*) malloc(sizeof(float) * in_size); 100 | 101 | loadWeights(d_in, "/home/data/onellm/0_self_decoder_qk_v_after_bmm.bin", shape0, shape1); // TO MODIFY 102 | loadWeights_trans(d_in_trans, "/home/data/trans/self_decoder_qk_v_buf_after_bmm_trans.bin", shape0, shape1); // TO MODIFY 103 | std::cout << "====intermediate tensor comparison result====" << "\n"; 104 | CheckResult(d_in, d_in_trans, shape0 * shape1); 105 | 106 | free(d_in); 107 | 
free(d_in_trans); 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/layers/decoder/self_decoder.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/utils/macro.h" 3 | #include "src/layers/decoder/self_decoder.h" 4 | //(RussWong) note: layers文件夹下,很多操作后面我都加了`DeviceSyncAndCheckCudaError();`,大家可手动删除或者按照lesson30所示添加条件编译代码 5 | template 6 | void LlamaSelfDecoder::allocForForward(LLaMAAttentionDynParams& params) 7 | { 8 | DataType type = getTensorType(); 9 | int batch_size = params.batch_size; 10 | decoder_residual = new TensorWrapper(Device::GPU, type, {batch_size, hidden_units}); 11 | decoder_residual->data = allocator->Malloc(decoder_residual->data, sizeof(T) * batch_size * hidden_units, false); 12 | 13 | } 14 | template 15 | void LlamaSelfDecoder::freeBuf() 16 | { 17 | allocator->Free(decoder_residual->data); 18 | } 19 | template 20 | void LlamaSelfDecoder::forward(TensorMap& input_tensors, const std::vector*>& layerWeights, TensorMap& output_tensors, LLaMAAttentionDynParams& dyn_params) 21 | { 22 | allocForForward(dyn_params); 23 | Tensor* decoder_input = input_tensors["decoder_input"]; 24 | Tensor* step = input_tensors["step"]; 25 | Tensor* finished = input_tensors["finished"]; 26 | Tensor* decoder_output = output_tensors["decoder_output"]; 27 | Tensor* all_k_cache = output_tensors["all_k_cache"]; 28 | Tensor* all_v_cache = output_tensors["all_v_cache"]; 29 | Tensor* layer_id = input_tensors["layer_id"]; 30 | DataType type_int = getTensorType(); 31 | LLM_CHECK_WITH_INFO(decoder_input->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 32 | LLM_CHECK_WITH_INFO(step->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 33 | LLM_CHECK_WITH_INFO(finished->as()->data != nullptr, "the data ptr of tensor inserted into TensorMap is nullptr!"); 34 | 35 | TensorMap self_attn_inputs{ 36 | {"attention_input", decoder_input}, 37 | {"layer_id", layer_id}, 38 | {"step", step}, 39 | {"finished", finished} 40 | }; 41 | TensorMap self_attn_outputs{ 42 | {"attention_output", decoder_output}, 43 | {"all_k_cache", all_k_cache}, 44 | {"all_v_cache", all_v_cache} 45 | }; 46 | 47 | for(int layer_id = 0; layer_id < num_layer; layer_id++) { 48 | //std::cout << "=============in layer " << layer_id << "==============" << "\n"; 49 | if (layer_id > 0){ 50 | TensorWrapper* layer = new TensorWrapper(Device::CPU, type_int, {1}, &layer_id); 51 | self_attn_inputs.insert("layer_id", layer); 52 | } 53 | decoder_input = self_attn_inputs["attention_input"]; 54 | launchRMSNorm(decoder_input->as(), //in&out, [bs, q_hidden_units] 55 | decoder_residual, // = rmsnorm input hidden states, as input of next add residual 56 | layerWeights[layer_id]->attn_norm_weight,//rmsnorm weights, [q_hidden_units] 57 | rmsnorm_eps); 58 | DeviceSyncAndCheckCudaError(); 59 | selfAttn->forward(self_attn_inputs, self_attn_outputs, layerWeights[layer_id]->self_attn_weight, dyn_params); 60 | launchFusedAddBiasResidualRMSNorm(decoder_residual, //in residual from tensor before rmsnorm and return decoder_residual + decoder_output, [bs, q hidden_units] 61 | decoder_output->as(), //in&out from attention output, [bs, q hidden_units] 62 | layerWeights[layer_id]->self_attn_weight.output, //bias 63 | layerWeights[layer_id]->ffn_norm_weight.gamma,//rmsnorm weights, [q hidden_units] 64 | rmsnorm_eps); 65 | DeviceSyncAndCheckCudaError(); 66 | TensorMap ffn_inputs{ 67 | {"ffn_input", 
decoder_output} 68 | }; 69 | TensorMap ffn_outputs{ 70 | {"ffn_output", decoder_output} 71 | }; 72 | ffn->forward(ffn_inputs, ffn_outputs, layerWeights[layer_id]->ffn_weight, dyn_params); 73 | launchAddResidual(decoder_residual, //in, [bs, hidden_units] 74 | decoder_output->as(), //in&out, [bs, hidden_units] 75 | true); 76 | 77 | DeviceSyncAndCheckCudaError(); 78 | self_attn_inputs.insert("attention_input", decoder_output); // for next iter 79 | } 80 | // no intermedia buffer to free, so ignore call free 81 | } 82 | 83 | template class LlamaSelfDecoder; 84 | template class LlamaSelfDecoder; 85 | -------------------------------------------------------------------------------- /tests/unittests/test_linear.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include 9 | #include "src/utils/macro.h" 10 | #include "src/kernels/linear.h" 11 | #include "src/weights/base_weights.h" 12 | 13 | void CPUlinear(float* input, float* weight, float* output, 14 | int m, int k, int n) { 15 | for(int i = 0; i < m; i++) { 16 | for(int j = 0; j < n; j++) { 17 | for(int l = 0; l < k; l++) { 18 | output[i * n + j] += input[i * k + l] * weight[l * n + j]; 19 | } 20 | } 21 | } 22 | } 23 | 24 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 25 | for(int i = 0; i < output_size; i++) { 26 | if (i < 5) { 27 | printf("0th res, CPUoutput = %f, GPUoutput = %f\n", CPUoutput[i], GPUoutput[i]); 28 | } 29 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 30 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 31 | return false; 32 | } 33 | 34 | } 35 | return true; 36 | } 37 | 38 | int main(int argc, char *argv[]) { 39 | const int seqlen = 13; 40 | const int hidden_units = 4096; 41 | const int vocab_size = 32; 42 | const int inter_size = 10; 43 | int hidden_units_2 = 0; 44 | int output_size = 0; 45 | 46 | hidden_units_2 = hidden_units * hidden_units; 47 | output_size = seqlen * hidden_units; 48 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 49 | float* h_w; 50 | float* d_w; 51 | h_w = (float*)malloc(sizeof(float) * hidden_units_2); 52 | cudaMalloc((void**)&d_w, sizeof(float) * hidden_units_2); 53 | for(int i = 0; i < hidden_units_2; i++) { 54 | h_w[i] = (float)(i % 3); // 1 2 1 2 55 | } 56 | 57 | float* h_in = (float*) malloc(sizeof(float) * hidden_units * seqlen); 58 | float* d_in; 59 | cudaMalloc((void**)&d_in, sizeof(float) * seqlen * hidden_units); 60 | for(int i = 0; i < hidden_units * seqlen; i++) { 61 | h_in[i] = (float)(i % 3); 62 | } 63 | 64 | float* h_out = (float*) malloc(sizeof(float) * output_size); 65 | float* d_out; 66 | cudaMalloc((void**)&d_out, sizeof(float) * output_size); 67 | CHECK(cudaMemcpy(d_in, h_in, sizeof(float) * hidden_units * seqlen, cudaMemcpyHostToDevice)); 68 | CHECK(cudaMemcpy(d_w, h_w, sizeof(float) * hidden_units_2, cudaMemcpyHostToDevice)); 69 | DataType type = getTensorType(); 70 | WeightType wtype = getWeightType(); 71 | TensorWrapper* in = new TensorWrapper(Device::GPU, type, {seqlen, hidden_units}, d_in); 72 | BaseWeight weight; 73 | weight.shape = {hidden_units, hidden_units}; 74 | weight.data = d_w; 75 | weight.type = wtype; 76 | TensorWrapper* out; 77 | out = new TensorWrapper(Device::GPU, type, {seqlen, hidden_units}, d_out); 78 | 
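// note: a minimal sketch of how a row-major GEMM like this one can be mapped onto column-major cuBLAS.
// The real mapping lives inside launchLinearGemm (src/kernels/linear.h) and is not shown here, so treat
// the commented call below as an illustration under that assumption, not as the repo's exact implementation.
// Row-major C[M,N] = A[M,K] * B[K,N] is computed as C^T = B^T * A^T with both operands untransposed:
//   float alpha = 1.0f, beta = 0.0f;
//   cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
//               /*m=*/hidden_units, /*n=*/seqlen, /*k=*/hidden_units,
//               &alpha,
//               d_w,  hidden_units,   // B viewed as B^T (N x K)
//               d_in, hidden_units,   // A viewed as A^T (K x M)
//               &beta,
//               d_out, hidden_units); // C^T (N x M), i.e. row-major C[M,N]
// Here M = seqlen, K = N = hidden_units, matching the square test weight above.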
cublasHandle_t cublas_handle; 79 | cublasLtHandle_t cublaslt_handle; 80 | cublasCreate(&cublas_handle); 81 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 82 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 83 | cublas_wrapper->setFP32GemmConfig(); 84 | // debug info, better to retain: 85 | std::cout << "before launch kernel" << std::endl; 86 | launchLinearGemm(in, weight, out, cublas_wrapper); 87 | // debug info, better to retain: 88 | std::cout << "after launch kernel" << std::endl; 89 | // debug info, better to retain: 90 | std::cout << "cuda memcpy device to host" << std::endl; 91 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 92 | CHECK(cudaMemcpy(h_out, d_out, sizeof(float) * output_size, cudaMemcpyDeviceToHost)); 93 | float* CPUout = (float*) malloc(sizeof(float) * output_size); 94 | CPUlinear(h_in, h_w, CPUout, seqlen, hidden_units, hidden_units); 95 | 96 | bool is_right = CheckResult(CPUout, h_out, output_size); 97 | // debug info, better to retain: 98 | std::cout << "before free" << std::endl; 99 | std::cout << "linear passed" << std::endl; 100 | free(h_in); 101 | free(h_w); 102 | free(h_out); 103 | free(CPUout); 104 | cudaFree(d_in); 105 | cudaFree(d_w); 106 | cudaFree(d_out); 107 | } -------------------------------------------------------------------------------- /src/utils/macro.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | //(RussWong) note: some macro check to assert for helping us find errors, so that we can 9 | // find the bugs faster 10 | #define CHECK(call) \ 11 | do \ 12 | { \ 13 | const cudaError_t error_code = call; \ 14 | if (error_code != cudaSuccess) \ 15 | { \ 16 | printf("CUDA Error:\n"); \ 17 | printf(" File: %s\n", __FILE__); \ 18 | printf(" Line: %d\n", __LINE__); \ 19 | printf(" Error code: %d\n", error_code); \ 20 | printf(" Error text: %s\n", \ 21 | cudaGetErrorString(error_code)); \ 22 | exit(1); \ 23 | } \ 24 | } while (0) 25 | 26 | static const char* _cudaGetErrorEnum(cudaError_t error) 27 | { 28 | return cudaGetErrorString(error); 29 | } 30 | 31 | static const char* _cudaGetErrorEnum(cublasStatus_t error) 32 | { 33 | switch (error) { 34 | case CUBLAS_STATUS_SUCCESS: 35 | return "CUBLAS_STATUS_SUCCESS"; 36 | 37 | case CUBLAS_STATUS_NOT_INITIALIZED: 38 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 39 | 40 | case CUBLAS_STATUS_ALLOC_FAILED: 41 | return "CUBLAS_STATUS_ALLOC_FAILED"; 42 | 43 | case CUBLAS_STATUS_INVALID_VALUE: 44 | return "CUBLAS_STATUS_INVALID_VALUE"; 45 | 46 | case CUBLAS_STATUS_ARCH_MISMATCH: 47 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 48 | 49 | case CUBLAS_STATUS_MAPPING_ERROR: 50 | return "CUBLAS_STATUS_MAPPING_ERROR"; 51 | 52 | case CUBLAS_STATUS_EXECUTION_FAILED: 53 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 54 | 55 | case CUBLAS_STATUS_INTERNAL_ERROR: 56 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 57 | 58 | case CUBLAS_STATUS_NOT_SUPPORTED: 59 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 60 | 61 | case CUBLAS_STATUS_LICENSE_ERROR: 62 | return "CUBLAS_STATUS_LICENSE_ERROR"; 63 | } 64 | return ""; 65 | } 66 | 67 | template 68 | void check(T result, char const* const func, const char* const file, int const line) 69 | { 70 | if (result) { 71 | throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " 72 | + file + ":" + 
std::to_string(line) + " \n"); 73 | } 74 | } 75 | 76 | #define CHECK_CUBLAS(val) check((val), #val, __FILE__, __LINE__) 77 | 78 | inline void syncAndCheck(const char* const file, int const line) 79 | { 80 | cudaDeviceSynchronize(); 81 | cudaError_t result = cudaGetLastError(); 82 | if (result) { 83 | throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " 84 | + file + ":" + std::to_string(line) + " \n"); 85 | } 86 | } 87 | 88 | #define DeviceSyncAndCheckCudaError() syncAndCheck(__FILE__, __LINE__) 89 | 90 | [[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "") 91 | { 92 | throw std::runtime_error(std::string("[oneLLM][ERROR] ") + info + " Assertion fail: " + file + ":" 93 | + std::to_string(line) + " \n"); 94 | } 95 | 96 | inline void llmAssert(bool result, const char* const file, int const line, std::string const& info = "") 97 | { 98 | if (!result) { 99 | throwRuntimeError(file, line, info); 100 | } 101 | } 102 | 103 | #define LLM_CHECK(val) llmAssert(val, __FILE__, __LINE__) 104 | #define LLM_CHECK_WITH_INFO(val, info) \ 105 | do { \ 106 | bool is_valid_val = (val); \ 107 | if (!is_valid_val) { \ 108 | llmAssert(is_valid_val, __FILE__, __LINE__, (info)); \ 109 | } \ 110 | } while (0) 111 | -------------------------------------------------------------------------------- /examples/cpp/ffn/ffn_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "src/layers/ffn/ffn.h" 6 | 7 | int main(int argc, char** argv) 8 | { 9 | int head_num = 4; 10 | int head_size = 8; 11 | int inter_size = 12; 12 | int hidden_units = head_num * head_size; 13 | 14 | cublasHandle_t cublas_handle; 15 | cublasLtHandle_t cublaslt_handle; 16 | cudaStream_t stream; 17 | cublasCreate(&cublas_handle); 18 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 19 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 20 | BaseAllocator* allocator = new CudaAllocator; 21 | 22 | LLaMAAttentionDynParams attn_dyn_params; 23 | attn_dyn_params.num_tokens = 14; 24 | std::cout << "start malloc/cudamalloc buffer" << "\n"; 25 | float* h_ffn_input = (float*) malloc(sizeof(float) * hidden_units * attn_dyn_params.num_tokens); 26 | float* d_ffn_input; 27 | cudaMalloc((void**)&d_ffn_input, sizeof(float) * hidden_units * attn_dyn_params.num_tokens); 28 | for(int i = 0; i < hidden_units * attn_dyn_params.num_tokens; i++) { 29 | h_ffn_input[i] = (float)(i % 2 + 1); 30 | } 31 | float* h_gate_up = (float*) malloc(sizeof(float) * hidden_units * 2 * inter_size); 32 | float* d_gate_up; 33 | cudaMalloc((void**)&d_gate_up, sizeof(float) * hidden_units * 2 * inter_size); 34 | for(int i = 0; i < hidden_units * 2 * inter_size; i++) { 35 | h_gate_up[i] = (float)(i % 2 + 1); 36 | } 37 | // float* h_up = (float*) malloc(sizeof(float) * hidden_units * inter_size); 38 | // float* d_up; 39 | // cudaMalloc((void**)&d_up, sizeof(float) * hidden_units * inter_size); 40 | // for(int i = 0; i < hidden_units * inter_size; i++) { 41 | // h_up[i] = 1.0f; 42 | // } 43 | float* h_down = (float*) malloc(sizeof(float) * hidden_units * inter_size); 44 | float* d_down; 45 | cudaMalloc((void**)&d_down, sizeof(float) * hidden_units * inter_size); 46 | for(int i = 0; i < hidden_units * inter_size; i++) { 47 | h_down[i] = (float)(i % 2 + 1); 48 | } 49 | float* d_ffn_output; 50 | cudaMalloc((void**)&d_ffn_output, sizeof(float) * 
attn_dyn_params.num_tokens * hidden_units); 51 | std::cout << "end malloc/cudamalloc buffer and start memcpyh2d" << "\n"; 52 | CHECK(cudaMemcpy(d_ffn_input, h_ffn_input, sizeof(float) * hidden_units * attn_dyn_params.num_tokens, cudaMemcpyHostToDevice)); 53 | CHECK(cudaMemcpy(d_gate_up, h_gate_up, sizeof(float) * hidden_units * 2 * inter_size, cudaMemcpyHostToDevice)); 54 | // CHECK(cudaMemcpy(d_up, h_up, sizeof(float) * hidden_units * inter_size, cudaMemcpyHostToDevice)); 55 | CHECK(cudaMemcpy(d_down, h_down, sizeof(float) * hidden_units * inter_size, cudaMemcpyHostToDevice)); 56 | DataType type = getTensorType(); // note: the type should be as a class data member! 57 | LLaMAFFNWeights ffn_weights; 58 | ffn_weights.gateAndup.data = d_gate_up; 59 | ffn_weights.gateAndup.shape = {2 * inter_size, hidden_units}; 60 | // ffn_weights.up.data = d_up; 61 | // ffn_weights.up.shape = {hidden_units, inter_size}; 62 | ffn_weights.down.data = d_down; 63 | ffn_weights.down.shape = {hidden_units, inter_size}; 64 | TensorWrapper* ffn_input = new TensorWrapper(GPU, 65 | type, 66 | {attn_dyn_params.num_tokens, hidden_units}, 67 | d_ffn_input); 68 | TensorWrapper* ffn_output = new TensorWrapper(GPU, 69 | type, 70 | {attn_dyn_params.num_tokens, hidden_units}, 71 | d_ffn_output); 72 | TensorMap ffn_inputs{ 73 | {"ffn_input", ffn_input} 74 | }; 75 | TensorMap ffn_outputs{ 76 | {"ffn_output", ffn_output} 77 | }; 78 | std::cout << "initializing ffn layer" << "\n"; 79 | LLaMAFFNLayer* ffn_layer = new LLaMAFFNLayer(head_num, 80 | head_size, 81 | inter_size, 82 | stream, 83 | cublas_wrapper, 84 | allocator); 85 | std::cout << "start fwd" << "\n"; 86 | ffn_layer->forward(ffn_inputs, ffn_outputs, ffn_weights, attn_dyn_params); 87 | std::cout << "end fwd" << "\n"; 88 | free(h_ffn_input); 89 | free(h_gate_up); 90 | // free(h_up); 91 | free(h_down); 92 | cudaFree(d_ffn_input); 93 | cudaFree(d_gate_up); 94 | // cudaFree(d_up); 95 | cudaFree(d_down); 96 | cudaFree(d_ffn_output); 97 | } 98 | -------------------------------------------------------------------------------- /tests/unittests/test_casual_mask.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include "src/kernels/build_casual_mask.h" 9 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 
10 | // when you are implementing LLMs inference on CPU, you can reuse the CPU kernel 11 | // we compare the kernel correctnesss by eyes and result print infos 12 | void CPUbuildCasualMask(float* mask, 13 | const int* q_lens, //input lens, shape=[batch size] 14 | const int* k_lens, //context lens, shape=[batch size] 15 | int max_q_len, 16 | int max_k_len, 17 | int batch_size) { 18 | for(int b = 0; b < batch_size; b++){ 19 | int start = b * max_q_len * max_k_len; 20 | int q = q_lens[b]; 21 | int k = k_lens[b]; 22 | for(int i = 0; i < max_q_len; i++) { 23 | for(int j = 0; j < max_k_len; j++) { 24 | if(j <= i + (k - q) && i < q && j < k) { 25 | mask[start + i * max_k_len + j] = 1.0f; 26 | } else { 27 | mask[start + i * max_k_len + j] = 0.0f; 28 | } 29 | } 30 | } 31 | } 32 | } 33 | bool CheckResult(float* CPUres, float* GPUres, const int size) { 34 | for(int i = 0; i < size; i++) { 35 | if(fabs(CPUres[i] - GPUres[i]) > 1e-6){ 36 | printf("the %dth res is wrong, CPU mask = %f, GPU mask = %f\n", i, CPUres[i], GPUres[i]); 37 | return false; 38 | } 39 | } 40 | return true; 41 | } 42 | // (RussWong)note: 43 | // `./causalmask` to test fp32 GPU build causal mask kernel 44 | int main() { 45 | const int batch_size = 1; 46 | const int max_q_len = 5; 47 | const int max_k_len = 5; 48 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 49 | const int mask_size = batch_size * max_q_len * max_k_len; 50 | int* h_q_lens; 51 | int* d_q_lens; 52 | h_q_lens = (int*)malloc(sizeof(int) * batch_size); 53 | cudaMalloc((void**)&d_q_lens, sizeof(int) * batch_size); 54 | int* h_k_lens; 55 | int* d_k_lens; 56 | h_k_lens = (int*)malloc(sizeof(int) * batch_size); 57 | cudaMalloc((void**)&d_k_lens, sizeof(int) * batch_size); 58 | 59 | float* d_mask; 60 | float* h_mask = (float*)malloc(sizeof(float) * mask_size); 61 | cudaMalloc((void**)&d_mask, sizeof(float) * mask_size); 62 | 63 | for(int i = 0; i < batch_size; i++) { 64 | h_q_lens[i] = 3; 65 | } 66 | for(int i = 0; i < batch_size; i++) { 67 | h_k_lens[i] = 3; 68 | } 69 | CHECK(cudaMemcpy(d_q_lens, h_q_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice)); 70 | CHECK(cudaMemcpy(d_k_lens, h_k_lens, sizeof(int) * batch_size, cudaMemcpyHostToDevice)); 71 | DataType type_float = getTensorType(); 72 | DataType type_int = getTensorType(); 73 | TensorWrapper* mask = new TensorWrapper(Device::GPU, 74 | type_float, 75 | {batch_size, max_q_len, max_k_len}, 76 | d_mask); 77 | TensorWrapper* q_lens = new TensorWrapper(Device::GPU, 78 | type_int, 79 | {batch_size}, 80 | d_q_lens); 81 | TensorWrapper* k_lens = new TensorWrapper(Device::GPU, 82 | type_int, 83 | {batch_size}, 84 | d_k_lens); 85 | launchBuildCausalMasks(mask, q_lens, k_lens); 86 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 87 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 88 | CHECK(cudaMemcpy(h_mask, d_mask, sizeof(float) * mask_size, cudaMemcpyDeviceToHost)); 89 | float* CPUmask = (float*)malloc(sizeof(float) * mask_size); 90 | CPUbuildCasualMask(CPUmask, h_q_lens, h_k_lens, max_q_len, max_k_len, batch_size); 91 | if (CheckResult(CPUmask, h_mask, mask_size)) { 92 | printf("test passed!\n"); 93 | } 94 | 95 | // debug info, better to retain: std::cout << "before free" << std::endl; 96 | free(h_q_lens); 97 | free(h_k_lens); 98 | free(h_mask); 99 | free(CPUmask); 100 | cudaFree(d_q_lens); 101 | cudaFree(d_k_lens); 102 | 
cudaFree(d_mask); 103 | } 104 | -------------------------------------------------------------------------------- /src/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(embeddingFunctor STATIC input_embedding.cu) 2 | set_property(TARGET embeddingFunctor PROPERTY CUDA_SEPARABLE_COMPILATION ON) 3 | set_property(TARGET embeddingFunctor PROPERTY POSITION_INDEPENDENT_CODE ON) 4 | set_property(TARGET embeddingFunctor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 5 | 6 | add_library(rmsnorm STATIC rmsnorm_kernel.cu) 7 | set_property(TARGET rmsnorm PROPERTY CUDA_SEPARABLE_COMPILATION ON) 8 | set_property(TARGET rmsnorm PROPERTY POSITION_INDEPENDENT_CODE ON) 9 | set_property(TARGET rmsnorm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 10 | 11 | add_library(cal_paddingoffset STATIC cal_paddingoffset.cu) 12 | set_property(TARGET cal_paddingoffset PROPERTY CUDA_SEPARABLE_COMPILATION ON) 13 | set_property(TARGET cal_paddingoffset PROPERTY POSITION_INDEPENDENT_CODE ON) 14 | set_property(TARGET cal_paddingoffset PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 15 | 16 | add_library(build_casual_mask STATIC build_casual_mask.cu) 17 | set_property(TARGET build_casual_mask PROPERTY CUDA_SEPARABLE_COMPILATION ON) 18 | set_property(TARGET build_casual_mask PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET build_casual_mask PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | 21 | add_library(cublasWrapper STATIC cublas_utils.cc) 22 | set_property(TARGET cublasWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) 23 | set_property(TARGET cublasWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 24 | 25 | add_library(linear STATIC linear.cu) 26 | set_property(TARGET linear PROPERTY CUDA_SEPARABLE_COMPILATION ON) 27 | set_property(TARGET linear PROPERTY POSITION_INDEPENDENT_CODE ON) 28 | set_property(TARGET linear PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 29 | target_link_libraries(linear PUBLIC -lcudart -lcublas cublasWrapper) 30 | 31 | add_library(qkv_bias_and_rope STATIC qkv_bias_and_RoPE.cu) 32 | set_property(TARGET qkv_bias_and_rope PROPERTY CUDA_SEPARABLE_COMPILATION ON) 33 | set_property(TARGET qkv_bias_and_rope PROPERTY POSITION_INDEPENDENT_CODE ON) 34 | set_property(TARGET qkv_bias_and_rope PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 35 | 36 | add_library(concat_kv STATIC concat_past_kv.cu) 37 | set_property(TARGET concat_kv PROPERTY CUDA_SEPARABLE_COMPILATION ON) 38 | set_property(TARGET concat_kv PROPERTY POSITION_INDEPENDENT_CODE ON) 39 | set_property(TARGET concat_kv PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 40 | 41 | add_library(repeat_kv STATIC repeat_kv.cu) 42 | set_property(TARGET repeat_kv PROPERTY CUDA_SEPARABLE_COMPILATION ON) 43 | set_property(TARGET repeat_kv PROPERTY POSITION_INDEPENDENT_CODE ON) 44 | set_property(TARGET repeat_kv PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 45 | 46 | add_library(mask_softmax STATIC attn_softmax_kernel.cu) 47 | set_property(TARGET mask_softmax PROPERTY CUDA_SEPARABLE_COMPILATION ON) 48 | set_property(TARGET mask_softmax PROPERTY POSITION_INDEPENDENT_CODE ON) 49 | set_property(TARGET mask_softmax PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 50 | 51 | add_library(fused_transpose_and_remv_pad STATIC fused_transpose_and_remv_pad.cu) 52 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY CUDA_SEPARABLE_COMPILATION ON) 53 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY POSITION_INDEPENDENT_CODE ON) 54 | set_property(TARGET fused_transpose_and_remv_pad PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 55 | 56 | 
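# note: most kernel targets in this file repeat the same add_library/set_property block;
# if desired, a small helper could factor that out (illustrative sketch only, not used by this build):
# function(add_kernel_lib name src)
#   add_library(${name} STATIC ${src})
#   set_property(TARGET ${name} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
#   set_property(TARGET ${name} PROPERTY POSITION_INDEPENDENT_CODE ON)
#   set_property(TARGET ${name} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# endfunction()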
add_library(fused_addresidual_norm STATIC fused_addresidual_norm.cu) 57 | set_property(TARGET fused_addresidual_norm PROPERTY CUDA_SEPARABLE_COMPILATION ON) 58 | set_property(TARGET fused_addresidual_norm PROPERTY POSITION_INDEPENDENT_CODE ON) 59 | set_property(TARGET fused_addresidual_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 60 | 61 | add_library(act STATIC act_kernel.cu) 62 | set_property(TARGET act PROPERTY CUDA_SEPARABLE_COMPILATION ON) 63 | set_property(TARGET act PROPERTY POSITION_INDEPENDENT_CODE ON) 64 | set_property(TARGET act PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 65 | 66 | add_library(topk STATIC topK.cu) 67 | set_property(TARGET topk PROPERTY CUDA_SEPARABLE_COMPILATION ON) 68 | set_property(TARGET topk PROPERTY POSITION_INDEPENDENT_CODE ON) 69 | set_property(TARGET topk PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 70 | 71 | add_library(fused_decoder_self_attention STATIC fused_decoder_self_attention.cu) 72 | set_property(TARGET fused_decoder_self_attention PROPERTY CUDA_SEPARABLE_COMPILATION ON) 73 | set_property(TARGET fused_decoder_self_attention PROPERTY POSITION_INDEPENDENT_CODE ON) 74 | set_property(TARGET fused_decoder_self_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 75 | 76 | add_library(sampling STATIC sampling.cu) 77 | set_property(TARGET sampling PROPERTY CUDA_SEPARABLE_COMPILATION ON) 78 | set_property(TARGET sampling PROPERTY POSITION_INDEPENDENT_CODE ON) 79 | set_property(TARGET sampling PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 80 | 81 | add_library(add_residual STATIC add_residual.cu) 82 | set_property(TARGET add_residual PROPERTY CUDA_SEPARABLE_COMPILATION ON) 83 | set_property(TARGET add_residual PROPERTY POSITION_INDEPENDENT_CODE ON) 84 | set_property(TARGET add_residual PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -------------------------------------------------------------------------------- /src/weights/llama/llama_weights.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "src/weights/llama/llama_weights.h" 3 | template 4 | LlamaWeight::LlamaWeight( 5 | int head_num, 6 | int kv_head_num, 7 | int head_size, 8 | int inter_size, 9 | int vocab_size, 10 | int num_layer, 11 | bool attn_bias, 12 | WeightType weight_type 13 | ): 14 | hidden_units(head_num * head_size), 15 | inter_size(inter_size), 16 | vocab_size(vocab_size), 17 | vocab_size_padded(vocab_size), 18 | num_layer(num_layer), 19 | weight_type(weight_type) 20 | { 21 | llama_layer_weight.reserve(num_layer); 22 | for (int l = 0; l < num_layer; ++l) { 23 | llama_layer_weight.push_back(new LlamaLayerWeight(head_num, 24 | kv_head_num, 25 | head_size, 26 | inter_size, 27 | weight_type, 28 | //group_size, 29 | attn_bias)); 30 | } 31 | GPUMalloc(&out_rmsnorm_weight.gamma, hidden_units); 32 | GPUMalloc(&post_decoder_embedding_weight.data, vocab_size * hidden_units); 33 | GPUMalloc(&pre_decoder_embedding_weight.data, vocab_size * hidden_units); 34 | pre_decoder_embedding_weight.shape = {vocab_size, hidden_units}; 35 | post_decoder_embedding_weight.shape = {vocab_size, hidden_units}; 36 | pre_decoder_embedding_weight.type = weight_type; 37 | post_decoder_embedding_weight.type = weight_type; 38 | } 39 | // (RussWong)note: weight from HF is always half type, and if we want run fp32 inference, we should convert half weight to fp32 weight in tools/weights_convert.py 40 | // (RussWong)note: shape and data of embedding and LMHead weight downloaded form HF are transposed, so we should carefully declare shape here 41 | template 42 | void 
LlamaWeight::loadWeights(std::string weight_path) { 43 | loadWeightFromBin::internalFunc(out_rmsnorm_weight.gamma, {(size_t)hidden_units}, weight_path + "model.norm.weight.bin"); 44 | loadWeightFromBin::internalFunc(post_decoder_embedding_weight.data, {(size_t)vocab_size, (size_t)hidden_units}, weight_path + "lm_head.weight.bin"); 45 | loadWeightFromBin::internalFunc(pre_decoder_embedding_weight.data, {(size_t)vocab_size, (size_t)hidden_units}, weight_path + "model.embed_tokens.weight.bin"); 46 | for (int layer = 0; layer < num_layer; ++layer) { 47 | llama_layer_weight[layer]->loadWeights(weight_path + "model.layers." + std::to_string(layer), weight_type); 48 | } 49 | } 50 | 51 | template 52 | void LlamaWeight::loadWeightsFromDummy() { 53 | T* d_dummy_out_rmsnorm_weight_gamma; 54 | T* d_dummy_post_decoder_embedding_weight; 55 | T* d_dummy_pre_decoder_embedding_weight; 56 | GPUMalloc(&d_dummy_out_rmsnorm_weight_gamma, sizeof(T) * hidden_units); 57 | GPUMalloc(&d_dummy_post_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size); 58 | GPUMalloc(&d_dummy_pre_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size); 59 | T* h_dummy_out_rmsnorm_weight_gamma = (T*)malloc(sizeof(T) * hidden_units); 60 | T* h_dummy_post_decoder_embedding_weight = (T*)malloc(sizeof(T) * hidden_units * vocab_size); 61 | T* h_dummy_pre_decoder_embedding_weight = (T*)malloc(sizeof(T) * hidden_units * vocab_size); 62 | for (int i = 0; i < hidden_units; i++){ 63 | h_dummy_out_rmsnorm_weight_gamma[i] = (T)1.0f; 64 | } 65 | for (int i = 0; i < hidden_units * vocab_size; i++) { 66 | h_dummy_post_decoder_embedding_weight[i] = (T)1.0f; 67 | h_dummy_pre_decoder_embedding_weight[i] = (T)1.0f; 68 | } 69 | cudaMemcpy(d_dummy_out_rmsnorm_weight_gamma, h_dummy_out_rmsnorm_weight_gamma, sizeof(T) * hidden_units, cudaMemcpyHostToDevice); 70 | cudaMemcpy(d_dummy_post_decoder_embedding_weight, h_dummy_post_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size, cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_dummy_pre_decoder_embedding_weight, h_dummy_pre_decoder_embedding_weight, sizeof(T) * hidden_units * vocab_size, cudaMemcpyHostToDevice); 72 | 73 | out_rmsnorm_weight.gamma = d_dummy_out_rmsnorm_weight_gamma; 74 | post_decoder_embedding_weight.data = d_dummy_post_decoder_embedding_weight; 75 | pre_decoder_embedding_weight.data = d_dummy_pre_decoder_embedding_weight; 76 | for (int layer = 0; layer < num_layer; ++layer) { 77 | llama_layer_weight[layer]->loadWeights(); 78 | } 79 | } 80 | 81 | template 82 | LlamaWeight::~LlamaWeight() 83 | { 84 | cudaFree(pre_decoder_embedding_weight.data); 85 | cudaFree(out_rmsnorm_weight.gamma); 86 | cudaFree(post_decoder_embedding_weight.data); 87 | 88 | for (auto& p : llama_layer_weight) { 89 | delete p; 90 | } 91 | } 92 | // template instantial required in linking time 93 | template struct LlamaWeight; 94 | template struct LlamaWeight; 95 | -------------------------------------------------------------------------------- /tests/unittests/test_topk.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/topK.h" 10 | // (RussWong)note: 11 | // there is no top k cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_topk` to test fp32 GPU kernel 14 | int main() { 15 | const int batch_size = 
1; 16 | const int vocab_size = 30000; 17 | const int beamwidth = 2; 18 | const int K = 5; 19 | const int BlockPerBeam = 8; 20 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 21 | const int probs_size = batch_size * vocab_size * beamwidth; 22 | float* h_probs; 23 | float *d_probs; 24 | h_probs = (float*)malloc(sizeof(float) * probs_size); 25 | cudaMalloc((void**)&d_probs, sizeof(float) * probs_size); 26 | 27 | int topK_val_buf_size = batch_size * beamwidth * BlockPerBeam * K; 28 | int topK_ids_buf_size = batch_size * beamwidth * BlockPerBeam * K; 29 | int final_topK_val_buf_size = batch_size * beamwidth * K; // sampling topK buf size, beamsearch topK size = [batch_size * beam_width * beam_width] 30 | 31 | 32 | int *d_tmp_topk_ids; 33 | cudaMalloc((void**)&d_tmp_topk_ids, sizeof(int) * topK_ids_buf_size); 34 | 35 | float *d_tmp_topk_vals; 36 | cudaMalloc((void**)&d_tmp_topk_vals, sizeof(float) * topK_val_buf_size); 37 | 38 | int* h_final_topk_ids; 39 | int *d_final_topk_ids; 40 | h_final_topk_ids = (int*)malloc(sizeof(int) * final_topK_val_buf_size); 41 | cudaMalloc((void**)&d_final_topk_ids, sizeof(int) * final_topK_val_buf_size); 42 | 43 | float* h_final_topk_vals; 44 | float *d_final_topk_vals; 45 | h_final_topk_vals = (float*)malloc(sizeof(float) * final_topK_val_buf_size); 46 | cudaMalloc((void**)&d_final_topk_vals, sizeof(float) * final_topK_val_buf_size); 47 | 48 | for(int i = 0; i < probs_size; i++) { // 0-59999 49 | h_probs[i] = i; 50 | } 51 | cudaMemcpy(d_probs, h_probs, sizeof(float)*probs_size, cudaMemcpyHostToDevice); 52 | 53 | DataType type_float = getTensorType(); 54 | DataType type_int = getTensorType(); 55 | TensorWrapper* probs_tensor = new TensorWrapper(Device::GPU, 56 | type_float, 57 | {batch_size * beamwidth, vocab_size}, 58 | d_probs); 59 | TensorWrapper *tmp_topk_ids = new TensorWrapper(Device::GPU, 60 | type_int, 61 | {batch_size, beamwidth, BlockPerBeam, K}, 62 | d_tmp_topk_ids); 63 | TensorWrapper* tmp_topk_vals = new TensorWrapper(Device::GPU, 64 | type_float, 65 | {batch_size, beamwidth, BlockPerBeam, K}, 66 | d_tmp_topk_vals); 67 | TensorWrapper *final_topk_ids = new TensorWrapper(Device::GPU, 68 | type_int, 69 | {batch_size * beamwidth, K}, 70 | d_final_topk_ids); 71 | TensorWrapper *final_topk_vals = new TensorWrapper(Device::GPU, 72 | type_float, 73 | {batch_size * beamwidth, K}, 74 | d_final_topk_vals); 75 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 76 | launchTopKforBeamSearch(probs_tensor, tmp_topk_ids, tmp_topk_vals, final_topk_ids, final_topk_vals); 77 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 78 | cudaMemcpy(h_final_topk_ids, d_final_topk_ids, sizeof(int) * final_topK_val_buf_size, cudaMemcpyDeviceToHost); 79 | cudaMemcpy(h_final_topk_vals, d_final_topk_vals, sizeof(float) * final_topK_val_buf_size, cudaMemcpyDeviceToHost); 80 | for(int i = 0; i < final_topK_val_buf_size; i++) { 81 | int id = h_final_topk_ids[i]; 82 | printf("topK id = %d\n", id); 83 | float val = h_final_topk_vals[i]; 84 | printf("topK val =%f\n", val); 85 | } 86 | // debug info, better to retain: std::cout << "before free" << std::endl; 87 | free(h_probs); 88 | free(h_final_topk_ids); 89 | free(h_final_topk_vals); 90 | cudaFree(d_probs); 91 | cudaFree(d_final_topk_ids); 92 | cudaFree(d_final_topk_vals); 93 | cudaFree(d_tmp_topk_ids); 94 | cudaFree(d_tmp_topk_vals); 95 | } 96 | 
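// note: the check above is done by eye only. If a CPU reference for the final [bs * beamwidth, K]
// output is wanted, a minimal unoptimized helper could look like the sketch below (added here for
// illustration, not part of the original test); it assumes the row-major [bs * beamwidth, vocab_size]
// probs layout used above and needs <algorithm>, <utility> and <vector>.
static void CPUtopK(const float* probs, int rows, int vocab_size, int K,
                    std::vector<int>& ids, std::vector<float>& vals) {
    ids.assign(rows * K, 0);
    vals.assign(rows * K, 0.0f);
    for (int r = 0; r < rows; r++) {
        // gather (value, vocab id) pairs for this row, then keep the K largest values in descending order
        std::vector<std::pair<float, int>> row(vocab_size);
        for (int v = 0; v < vocab_size; v++) {
            row[v] = std::make_pair(probs[r * vocab_size + v], v);
        }
        std::partial_sort(row.begin(), row.begin() + K, row.end(),
                          [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                              return a.first > b.first;
                          });
        for (int k = 0; k < K; k++) {
            vals[r * K + k] = row[k].first;
            ids[r * K + k] = row[k].second; // vocab-local id; the GPU kernel also keeps ids local to each row
        }
    }
}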
-------------------------------------------------------------------------------- /tests/unittests/test_residual.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/add_residual.h" 10 | 11 | #include 12 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 13 | // But when you are implementing LLMs inference on CPU, I dont recommend to reuse the CPU kernel, because its performance is bad 14 | // `./test_residual` to test fp32 GPU kernel 15 | #define CHECK(call) \ 16 | do \ 17 | { \ 18 | const cudaError_t error_code = call; \ 19 | if (error_code != cudaSuccess) \ 20 | { \ 21 | printf("CUDA Error:\n"); \ 22 | printf(" File: %s\n", __FILE__); \ 23 | printf(" Line: %d\n", __LINE__); \ 24 | printf(" Error code: %d\n", error_code); \ 25 | printf(" Error text: %s\n", \ 26 | cudaGetErrorString(error_code)); \ 27 | exit(1); \ 28 | } \ 29 | } while (0) 30 | 31 | void CPUresidual(float* h_residual, float* h_decoder_out, int hidden_units, int num_tokens) { 32 | for(int b = 0; b < num_tokens; b++) { 33 | for (int i = 0; i < hidden_units; i++) { 34 | h_decoder_out[b * hidden_units + i] += h_residual[b * hidden_units + i]; 35 | } 36 | } 37 | } 38 | 39 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 40 | for(int i = 0; i < output_size; i++) { 41 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 42 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 43 | return false; 44 | } 45 | 46 | } 47 | return true; 48 | } 49 | 50 | int main() { 51 | const int num_tokens = 16; 52 | const int hidden_units = 4096; 53 | const int total_size = num_tokens * hidden_units; 54 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 55 | float* h_residual; 56 | float* d_residual; 57 | h_residual = (float*)malloc(sizeof(float) * total_size); 58 | cudaMalloc((void**)&d_residual, sizeof(float) * total_size); 59 | for(int i = 0; i < total_size; i++) { 60 | h_residual[i] = (float)(i % 2 + 1); 61 | } 62 | 63 | float* h_decoder_out = (float*) malloc(sizeof(float) * total_size); 64 | float* decoder_out = (float*) malloc(sizeof(float) * total_size); 65 | float* d_decoder_out; 66 | cudaMalloc((void**)&d_decoder_out, sizeof(float) * total_size); 67 | for(int i = 0; i < total_size; i++) { 68 | h_decoder_out[i] = (float)(i % 2 + 1); 69 | } 70 | 71 | CHECK(cudaMemcpy(d_residual, h_residual, sizeof(float) * total_size, cudaMemcpyHostToDevice)); 72 | CHECK(cudaMemcpy(d_decoder_out, h_decoder_out, sizeof(float) * total_size, cudaMemcpyHostToDevice)); 73 | DataType type_float = getTensorType(); 74 | TensorWrapper* decoder_out_tensor = new TensorWrapper(Device::GPU, 75 | type_float, 76 | {num_tokens, hidden_units}, 77 | d_decoder_out); 78 | TensorWrapper* residual_tensor = new TensorWrapper(Device::GPU, 79 | type_float, 80 | {num_tokens, hidden_units}, 81 | d_residual); 82 | // debug info, better to retain: 83 | std::cout << "before launch kernel" << std::endl; 84 | launchAddResidual(residual_tensor, decoder_out_tensor); 85 | // debug info, better to retain: 86 | std::cout << "after launch kernel" << std::endl; 87 | // debug info, better to retain: 88 | std::cout << "cuda memcpy device to host" << std::endl; 89 | // Note: remember to memcpy from device to host and 
define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 90 | CHECK(cudaMemcpy(decoder_out, d_decoder_out, sizeof(float) * total_size, cudaMemcpyDeviceToHost)); 91 | float* CPUout = (float*) malloc(sizeof(float) * total_size); 92 | for(int i = 0; i < total_size; i++){ 93 | CPUout[i] = (float)(i % 2 + 1); 94 | } 95 | CPUresidual(h_residual, CPUout, hidden_units, num_tokens); 96 | bool is_right = CheckResult(CPUout, decoder_out, total_size); 97 | // debug info, better to retain: 98 | std::cout << "before free" << std::endl; 99 | std::cout << "AddResidual kernel passed" << std::endl; 100 | free(h_residual); 101 | free(h_decoder_out); 102 | free(CPUout); 103 | free(decoder_out); 104 | cudaFree(d_residual); 105 | cudaFree(d_decoder_out); 106 | } 107 | -------------------------------------------------------------------------------- /src/kernels/cublas_utils.cc: -------------------------------------------------------------------------------- 1 | #include "cublas_utils.h" 2 | #include 3 | // (RussWong) notes:cublas gemm和stridedbatchgemm调库的写法,比较固定 4 | cublasWrapper::cublasWrapper(cublasHandle_t cublas_handle, 5 | cublasLtHandle_t cublaslt_handle): 6 | cublas_handle_(cublas_handle), 7 | cublaslt_handle_(cublaslt_handle) 8 | { 9 | } 10 | 11 | cublasWrapper::~cublasWrapper() 12 | { 13 | } 14 | // invoked in model example main function after initialize cublas wrapper 15 | void cublasWrapper::setFP32GemmConfig() 16 | { 17 | Atype_ = CUDA_R_32F; 18 | Btype_ = CUDA_R_32F; 19 | Ctype_ = CUDA_R_32F; 20 | computeType_ = CUDA_R_32F; 21 | } 22 | 23 | void cublasWrapper::setFP16GemmConfig() 24 | { 25 | Atype_ = CUDA_R_16F; 26 | Btype_ = CUDA_R_16F; 27 | Ctype_ = CUDA_R_16F; 28 | computeType_ = CUDA_R_32F; 29 | } 30 | 31 | //fp32 gemm and fp16 gemm 32 | void cublasWrapper::Gemm(cublasOperation_t transa, 33 | cublasOperation_t transb, 34 | const int m, 35 | const int n, 36 | const int k, 37 | const void* A, 38 | const int lda, 39 | const void* B, 40 | const int ldb, 41 | void* C, 42 | const int ldc, 43 | float f_alpha = 1.0f, 44 | float f_beta = 0.0f) 45 | { 46 | half h_alpha = (half)(f_alpha); 47 | half h_beta = (half)(f_beta); 48 | int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; //之前是CUDA_R_16F 49 | const void* alpha = is_fp16_computeType ? reinterpret_cast(&(h_alpha)) : reinterpret_cast(&f_alpha); 50 | const void* beta = is_fp16_computeType ? reinterpret_cast(&(h_beta)) : reinterpret_cast(&f_beta); 51 | CHECK_CUBLAS(cublasGemmEx(cublas_handle_, 52 | transa, 53 | transb, 54 | m, 55 | n, 56 | k, 57 | alpha, 58 | A, 59 | Atype_, 60 | lda, 61 | B, 62 | Btype_, 63 | ldb, 64 | beta, 65 | C, 66 | Ctype_, 67 | ldc, 68 | computeType_, 69 | CUBLAS_GEMM_DEFAULT)); 70 | } 71 | 72 | void cublasWrapper::stridedBatchedGemm(cublasOperation_t transa, 73 | cublasOperation_t transb, 74 | const int m, 75 | const int n, 76 | const int k, 77 | const void* A, 78 | const int lda, 79 | const int64_t strideA, 80 | const void* B, 81 | const int ldb, 82 | const int64_t strideB, 83 | void* C, 84 | const int ldc, 85 | const int64_t strideC, 86 | const int batchCount, 87 | float f_alpha = 1.0f, 88 | float f_beta = 0.0f) 89 | { 90 | int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; 91 | const void* alpha = 92 | is_fp16_computeType ? reinterpret_cast(&(f_alpha)) : reinterpret_cast(&f_alpha); 93 | const void* beta = is_fp16_computeType ? 
reinterpret_cast(&(f_beta)) : reinterpret_cast(&f_beta); 94 | CHECK_CUBLAS(cublasGemmStridedBatchedEx(cublas_handle_, 95 | transa, 96 | transb, 97 | m, 98 | n, 99 | k, 100 | alpha, 101 | A, 102 | Atype_, 103 | lda, 104 | strideA, 105 | B, 106 | Btype_, 107 | ldb, 108 | strideB, 109 | beta, 110 | C, 111 | Ctype_, 112 | ldc, 113 | strideC, 114 | batchCount, 115 | computeType_, 116 | CUBLAS_GEMM_DEFAULT)); 117 | } 118 | -------------------------------------------------------------------------------- /tests/unittests/test_concat_kv.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/concat_past_kv.h" 10 | // (RussWong)note: 11 | // there is no concat kv cpu kernel implementation now 12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_concat_kv` to test fp32 GPU kernel 14 | int main() 15 | { 16 | const int batch_size = 1; 17 | const int max_q_len = 16; 18 | const int max_seq_len = 32; 19 | const int head_size = 8; 20 | const int kv_head_num = 2; 21 | const int kv_size = 1 * batch_size * max_q_len * kv_head_num * head_size; 22 | const int layer_offset = 1 * batch_size * max_seq_len * kv_head_num * head_size; 23 | const int kvcache_size = layer_offset; 24 | // (RussWong)note: we plan to place layer id on CPU 25 | // const int layer_id = 0; 26 | 27 | float *h_k_src; 28 | float *d_k_src; 29 | h_k_src = (float *)malloc(sizeof(float) * kv_size); 30 | cudaMalloc((void **)&d_k_src, sizeof(float) * kv_size); 31 | 32 | float *h_v_src; 33 | float *d_v_src; 34 | h_v_src = (float *)malloc(sizeof(float) * kv_size); 35 | cudaMalloc((void **)&d_v_src, sizeof(float) * kv_size); 36 | 37 | int *cur_query_length = (int *)malloc(sizeof(int) * batch_size); 38 | int *history_length = (int *)malloc(sizeof(int) * batch_size); 39 | int *dcur_query_length; 40 | int *dhistory_length; 41 | cudaMalloc((void **)&dcur_query_length, sizeof(int) * batch_size); 42 | cudaMalloc((void **)&dhistory_length, sizeof(int) * batch_size); 43 | 44 | float *h_k_dst = (float *)malloc(sizeof(float) * kvcache_size); 45 | float *h_v_dst = (float *)malloc(sizeof(float) * kvcache_size); 46 | float *d_k_dst; 47 | float *d_v_dst; 48 | cudaMalloc((void **)&d_k_dst, sizeof(float) * kvcache_size); 49 | cudaMalloc((void **)&d_v_dst, sizeof(float) * kvcache_size); 50 | float *kv_scale; 51 | cudaMalloc((void **)&kv_scale, sizeof(float)); 52 | int *h_layer_id = (int *)malloc(sizeof(int) * batch_size); 53 | // (RussWong)note: we plan to place layer id on CPU 54 | // int *d_layer_id; 55 | // cudaMalloc((void **)&d_layer_id, sizeof(int) * batch_size); 56 | 57 | for (int i = 0; i < kv_size; i++) 58 | { 59 | h_k_src[i] = 1.0f; 60 | h_v_src[i] = 1.0f; 61 | } 62 | for (int i = 0; i < batch_size; i++) 63 | { 64 | cur_query_length[i] = 16; 65 | history_length[i] = 1; 66 | h_layer_id[i] = 0; 67 | } 68 | cudaMemcpy(d_v_src, h_v_src, sizeof(float) * kv_size, cudaMemcpyHostToDevice); 69 | cudaMemcpy(d_k_src, h_k_src, sizeof(float) * kv_size, cudaMemcpyHostToDevice); 70 | cudaMemcpy(dcur_query_length, cur_query_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 71 | cudaMemcpy(dhistory_length, history_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 72 | // cudaMemcpy(d_layer_id, h_layer_id, sizeof(int) * batch_size, cudaMemcpyHostToDevice); 73 | 74 | DataType type = getTensorType(); 
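// note (assumption, for checking the printed values by hand): with the
// [batch_size, kv_head_num, max_seq_len, head_size] layout declared for out_kdst/out_vdst below,
// the element written for batch b, kv head h, new token s and dim d is expected at offset
//   ((b * kv_head_num + h) * max_seq_len + history_length[b] + s) * head_size + d,
// which is what the offset2index/index2offset helper mentioned near the print loop would encode.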
75 | DataType type_int = getTensorType(); 76 | TensorWrapper *in_ksrc = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_q_len, head_size}, d_k_src); 77 | TensorWrapper *in_vsrc = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_q_len, head_size}, d_v_src); 78 | TensorWrapper *layer_id = new TensorWrapper(Device::CPU, type_int, {batch_size}, h_layer_id); 79 | TensorWrapper *cur_q_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, dcur_query_length); 80 | TensorWrapper *history_len = new TensorWrapper(Device::GPU, type_int, {batch_size}, dhistory_length); 81 | TensorWrapper *out_kdst = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_seq_len, head_size}, d_k_dst); 82 | TensorWrapper *out_vdst = new TensorWrapper(Device::GPU, type, {batch_size, kv_head_num, max_seq_len, head_size}, d_v_dst); 83 | // debug info, better to retain: std::cout << "before launch kernel" << std::endl; 84 | launchConcatKVCache(in_ksrc, in_vsrc, layer_id, cur_q_len, history_len, out_kdst, out_vdst); 85 | // debug info, better to retain: std::cout << "after launch kernel" << std::endl; 86 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 87 | cudaMemcpy(h_v_dst, d_v_dst, sizeof(float) * kvcache_size, cudaMemcpyDeviceToHost); 88 | cudaMemcpy(h_k_dst, d_k_dst, sizeof(float) * kvcache_size, cudaMemcpyDeviceToHost); 89 | // debug info, better to retain: std::cout << "cuda memcpy device to host" << std::endl; 90 | // note: need to add offset2index and index2offset API to help us program and check result 91 | for (int i = batch_size * (1) * kv_head_num * head_size; i < batch_size * max_seq_len * kv_head_num * head_size; i++) 92 | { 93 | printf("index = %d\n", i); 94 | printf("res k = %f\n", h_k_dst[i]); 95 | // debug info, better to retain: printf("topK id = %d\n", id); 96 | printf("res v = %f\n", h_v_dst[i]); 97 | printf("===============\n"); 98 | // debug info, better to retain: printf("topK val =%f\n", val); 99 | } 100 | // debug info, better to retain: std::cout << "before free" << std::endl; 101 | free(h_k_src); 102 | free(h_v_src); 103 | free(h_k_dst); 104 | free(h_v_dst); 105 | free(cur_query_length); 106 | free(history_length); 107 | free(h_layer_id); 108 | cudaFree(d_k_src); 109 | cudaFree(d_v_src); 110 | cudaFree(d_k_dst); 111 | cudaFree(d_v_dst); 112 | cudaFree(dcur_query_length); 113 | cudaFree(dhistory_length); 114 | cudaFree(kv_scale); 115 | } 116 | -------------------------------------------------------------------------------- /src/kernels/repeat_kv.cu: -------------------------------------------------------------------------------- 1 | #include "src/kernels/repeat_kv.h" 2 | #include "src/utils/cuda_debug_utils.cuh" 3 | #include 4 | // if MQA or GQA, we should use this transpose to broadcast kv head num to q head num 5 | //[num layers, bs, kv head num, max_seq_len, head size]=>[bs, q head num, max_k_len, head size] 6 | // context_length.shape=[bs] 7 | // bugs1: when k_dst.shape = [1,32,13,128],现在这个k_dst以13*128为单位循环第一个13*128的值 8 | // solu1: launcher函数里面获取kv cache的shape出错,需要仔细核对各个TensorWrapper的shape再通过正确索引获取 9 | template 10 | __global__ void repeat_value_cache(T *v_dst, 11 | const T *v_src, 12 | const size_t layer_offset, 13 | const int head_num, 14 | const int q_head_per_kv, 15 | const int head_size, 16 | const int *context_length, 17 | const int max_k_len, 18 | const int max_seq_len) 19 | { 20 | const int batch_id = blockIdx.y; 21 | const int head_id = 
blockIdx.z; 22 | 23 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 24 | 25 | const auto val_src = v_src + layer_offset; 26 | const auto val_dst = v_dst; 27 | 28 | const auto seq_len = context_length[batch_id]; 29 | 30 | const int v_head_size_id = idx % head_size; 31 | const int v_seq_len_id = idx / head_size; 32 | // only fetch context_length( 49 | void launchRepeatKVCache(TensorWrapper *k_cache_src, //{num_layers, batch_size, kv_head_num, max_seq_len, head_size} 50 | TensorWrapper *v_cache_src, //{num_layers, batch_size, kv_head_num, max_seq_len, head_size} 51 | TensorWrapper *context_length, 52 | TensorWrapper *layer_id, 53 | TensorWrapper *k_cache_dst, //{batch_size, head_num, max_k_len, head_size} 54 | TensorWrapper *v_cache_dst) 55 | { 56 | int batch_size = context_length->shape[0]; 57 | int kv_head_num = k_cache_src->shape[2]; // (RussWong)note: we should carefully access the shape value, corresponding to the place where tensorwapper is defined 58 | int max_seq_len = k_cache_src->shape[3]; 59 | int head_num = k_cache_dst->shape[1]; 60 | 61 | int max_k_len = k_cache_dst->shape[2]; 62 | int head_size = k_cache_dst->shape[3]; 63 | int layer = layer_id->getVal(); 64 | // (RussWong)note: if layer id is on GPU, here MUSTN'T use layer_id->getVal(), because we cant access GPU memory directly by [] if data is on GPU 65 | // (RussWong)note: so we can make layer data locate on CPU, so that we can access data by [] 66 | size_t layer_offset = layer * batch_size * kv_head_num * max_seq_len * head_size; 67 | int q_head_per_kv = head_num / kv_head_num; 68 | int blockSize = 128; 69 | dim3 block(blockSize); 70 | dim3 grid((max_k_len * head_size + blockSize - 1) / blockSize, batch_size, head_num); 71 | repeat_value_cache<<>>(v_cache_dst->data, 72 | v_cache_src->data, 73 | layer_offset, 74 | head_num, 75 | q_head_per_kv, 76 | head_size, 77 | context_length->data, 78 | max_k_len, 79 | max_seq_len); 80 | 81 | repeat_value_cache<<>>(k_cache_dst->data, 82 | k_cache_src->data, 83 | layer_offset, 84 | head_num, 85 | q_head_per_kv, 86 | head_size, 87 | context_length->data, 88 | max_k_len, 89 | max_seq_len); 90 | #ifdef PRINT_DATA 91 | printf("repeat kv kernel top2 result:\n"); 92 | print_data<<<1, 1>>>(k_cache_dst->data); 93 | #else 94 | #endif 95 | } 96 | 97 | template void launchRepeatKVCache(TensorWrapper *k_cache_src, 98 | TensorWrapper *v_cache_src, 99 | TensorWrapper *context_length, 100 | TensorWrapper *layer_id, 101 | TensorWrapper *k_cache_dst, 102 | TensorWrapper *v_cache_dst); 103 | template void launchRepeatKVCache(TensorWrapper *k_cache_src, 104 | TensorWrapper *v_cache_src, 105 | TensorWrapper *context_length, 106 | TensorWrapper *layer_id, 107 | TensorWrapper *k_cache_dst, 108 | TensorWrapper *v_cache_dst); 109 | -------------------------------------------------------------------------------- /tests/unittests/test_bmm.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | #include 8 | #include "src/utils/macro.h" 9 | #include "src/kernels/linear.h" 10 | #include "src/weights/base_weights.h" 11 | // (RussWong)note: this kernel's CPU implementation is absolutely right. 
12 | // But when you are implementing LLMs inference on CPU, I dont recommend to reuse the CPU kernel, because its performance is bad 13 | void CPUlinear(float* input, float* weight, float* output, 14 | int m, int k, int n, int batch) { 15 | for(int b = 0; b < batch; b++) { 16 | for(int i = 0; i < m; i++) { 17 | for(int j = 0; j < n; j++) { 18 | for(int l = 0; l < k; l++) { 19 | output[b * m * n + i * n + j] += input[b * m * k + i * k + l] * weight[b * k * n + l * n + j]; 20 | } 21 | } 22 | } 23 | } 24 | } 25 | 26 | bool CheckResult(float* CPUoutput, float* GPUoutput, int output_size) { 27 | for(int i = 0; i < output_size; i++) { 28 | if(fabs(CPUoutput[i] - GPUoutput[i]) > 1e-6){ 29 | printf("the %dth res is wrong, CPUoutput = %f, GPUoutput = %f\n", i, CPUoutput[i], GPUoutput[i]); 30 | return false; 31 | } 32 | } 33 | return true; 34 | } 35 | // (RussWong)note: 36 | // `./bmm 1` to test fp32 GPU batch matmul with trans_b = true 37 | // `./bmm` to test fp32 GPU batch matmul with trans_b = false 38 | int main(int argc, char *argv[]) { 39 | const int batch_size = 1; 40 | const int seqlen_in = 16; 41 | const int seqlen_w = 16; 42 | const int hidden_units = 4096; 43 | const int head_num = 32; 44 | const int head_size = 128; 45 | int in_size = 0; 46 | int w_size = 0; 47 | int output_size = 0; 48 | if (argv[1]) {// enable trans_b for test lmhead linear 49 | in_size = batch_size * head_num * seqlen_in * head_size; // q 50 | w_size = batch_size * head_num * seqlen_w * head_size; // k 51 | output_size = batch_size * head_num * seqlen_in * seqlen_w; //q k 52 | } else { 53 | in_size = batch_size * head_num * seqlen_in * seqlen_w; //qk 54 | w_size = batch_size * head_num * seqlen_w * head_size; // v 55 | output_size = batch_size * head_num * seqlen_in * head_size; 56 | } 57 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 58 | float* h_w; 59 | float* d_w; 60 | h_w = (float*)malloc(sizeof(float) * w_size); 61 | cudaMalloc((void**)&d_w, sizeof(float) * w_size); 62 | for(int i = 0; i < w_size; i++) { 63 | h_w[i] = (float)(i % 2 + 1); 64 | //h_w[i] = 1.0f; // simple data 65 | } 66 | 67 | float* h_in = (float*) malloc(sizeof(float) * in_size); 68 | float* d_in; 69 | cudaMalloc((void**)&d_in, sizeof(float) * in_size); 70 | for(int i = 0; i < in_size; i++) { 71 | h_in[i] = (float)(i % 2 + 1); 72 | //h_in[i] = 1.0f; // simple data 73 | } 74 | 75 | float* h_out = (float*) malloc(sizeof(float) * output_size); 76 | float* d_out; 77 | cudaMalloc((void**)&d_out, sizeof(float) * output_size); 78 | 79 | CHECK(cudaMemcpy(d_in, h_in, sizeof(float) * in_size, cudaMemcpyHostToDevice)); 80 | CHECK(cudaMemcpy(d_w, h_w, sizeof(float) * w_size, cudaMemcpyHostToDevice)); 81 | DataType type = getTensorType(); 82 | WeightType wtype = getWeightType(); 83 | TensorWrapper* in; 84 | if (argv[1]) {// enable trans_b for test qk*v 85 | in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, head_size}, d_in); 86 | } else {// disable trans_b for test q*k 87 | in = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, seqlen_w}, d_in); 88 | } 89 | TensorWrapper* weight = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_w, head_size}, d_w); 90 | TensorWrapper* out; 91 | if (argv[1]) {// enable trans_b for test qk*v 92 | out = new TensorWrapper(Device::GPU, type, {batch_size, head_num, seqlen_in, seqlen_w}, d_out); 93 | } else {// disable trans_b for test q*k 94 | out = new TensorWrapper(Device::GPU, type, 
{batch_size, head_num, seqlen_in, head_size}, d_out); 95 | } 96 | cublasHandle_t cublas_handle; 97 | cublasLtHandle_t cublaslt_handle; 98 | cublasCreate(&cublas_handle); 99 | cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); 100 | cublasWrapper* cublas_wrapper = new cublasWrapper(cublas_handle, cublaslt_handle); 101 | cublas_wrapper->setFP32GemmConfig(); 102 | // debug info, better to retain: 103 | std::cout << "before launch kernel" << std::endl; 104 | if (argv[1]) {// enable trans_b for test qk*v 105 | launchLinearStridedBatchGemm(in, weight, out, cublas_wrapper, false, true); 106 | } else {// disable trans_b for test q*k 107 | launchLinearStridedBatchGemm(in, weight, out, cublas_wrapper); 108 | } 109 | // debug info, better to retain: 110 | std::cout << "after launch kernel" << std::endl; 111 | // debug info, better to retain: 112 | std::cout << "cuda memcpy device to host" << std::endl; 113 | // Note: remember to memcpy from device to host and define the correct copy size(mul the sizeof(dtype)), or will cause segment fault 114 | CHECK(cudaMemcpy(h_out, d_out, sizeof(float) * output_size, cudaMemcpyDeviceToHost)); 115 | float* CPUout = (float*) malloc(sizeof(float) * output_size); 116 | if (argv[1]) {// enable trans_b for ttest qk*v 117 | CPUlinear(h_in, h_w, CPUout, seqlen_in, head_size, seqlen_w, batch_size * head_num); 118 | } else {// disable trans_b for test q*k 119 | CPUlinear(h_in, h_w, CPUout, seqlen_in, seqlen_w, head_size, batch_size * head_num); 120 | } 121 | 122 | bool is_right = CheckResult(CPUout, h_out, output_size); 123 | // debug info, better to retain: 124 | std::cout << "before free" << std::endl; 125 | std::cout << "linear passed" << std::endl; 126 | free(h_in); 127 | free(h_w); 128 | free(h_out); 129 | free(CPUout); 130 | cudaFree(d_in); 131 | cudaFree(d_w); 132 | cudaFree(d_out); 133 | } 134 | -------------------------------------------------------------------------------- /src/utils/weight_utils.cu: -------------------------------------------------------------------------------- 1 | #include "src/utils/weight_utils.h" 2 | 3 | template 4 | inline __device__ T_OUT type_cast(T_IN val) { 5 | return val; 6 | } 7 | template<> 8 | inline __device__ float type_cast(half val) { 9 | return __half2float(val); 10 | } 11 | 12 | template<> 13 | inline __device__ half type_cast(float val) { 14 | return __float2half(val); 15 | } 16 | 17 | template 18 | void GPUMalloc(T** ptr, size_t size) 19 | { 20 | LLM_CHECK_WITH_INFO(size >= ((size_t)0), "Ask cudaMalloc size " + std::to_string(size) + "< 0 is invalid."); 21 | CHECK(cudaMalloc((void**)(ptr), sizeof(T) * size)); 22 | } 23 | template void GPUMalloc(float** ptr, size_t size); 24 | template void GPUMalloc(half** ptr, size_t size); 25 | 26 | template 27 | void GPUFree(T* ptr) 28 | { 29 | if (ptr != NULL) { 30 | CHECK(cudaFree(ptr)); 31 | ptr = NULL; 32 | } 33 | } 34 | template void GPUFree(float* ptr); 35 | template void GPUFree(half* ptr); 36 | 37 | template 38 | void cudaH2Dcpy(T* tgt, const T* src, const size_t size) 39 | { 40 | CHECK(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); 41 | } 42 | 43 | template void cudaH2Dcpy(float* tgt, const float* src, const size_t size); 44 | template void cudaH2Dcpy(half* tgt, const half* src, const size_t size); 45 | 46 | template 47 | __global__ void type_conversion(T_OUT* dst, const T_IN* src, const int size) 48 | { 49 | int gtid = threadIdx.x + blockIdx.x * blockDim.x; 50 | int total_thread_nums = blockDim.x * gridDim.x; 51 | for (int index = gtid; index < size; index 
+= total_thread_nums) { 52 | dst[index] = type_cast(src[index]); 53 | } 54 | } 55 | 56 | template 57 | void cuda_type_conversion(T_OUT* dst, const T_IN* src, const int size) 58 | { 59 | dim3 grid(128); 60 | dim3 block(128); 61 | type_conversion<<>>(dst, src, size); 62 | } 63 | 64 | template void cuda_type_conversion(float* dst, const half* src, const int size); 65 | template void cuda_type_conversion(half* dst, const float* src, const int size); 66 | 67 | // from FT code 68 | // loads data from binary file. If it succeeds, returns a non-empty (shape size) vector. If loading fails or 69 | // the product of the elements in shape is 0, this function will return an empty vector. 70 | template 71 | std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) 72 | { 73 | if (shape.size() > 2) { 74 | printf("[ERROR] shape should have less than two dims \n"); 75 | return std::vector(); 76 | } 77 | size_t dim0 = shape[0], dim1 = 1; 78 | if (shape.size() == 2) { 79 | dim1 = shape[1]; 80 | } 81 | size_t size = dim0 * dim1; 82 | if (size == 0) { 83 | std::cout << "shape is zero, skip loading weight from file: " << filename << std::endl; 84 | return std::vector(); 85 | } 86 | 87 | std::vector host_array(size); 88 | std::ifstream in(filename, std::ios::in | std::ios::binary); 89 | if (!in.is_open()) { 90 | std::cout << "file" << filename << "cannot be opened, loading model fails!" << std::endl; 91 | return std::vector(); 92 | } 93 | 94 | size_t loaded_data_size = sizeof(T) * size; 95 | in.seekg(0, in.end); 96 | in.seekg(0, in.beg); 97 | 98 | std::cout << "Read " << std::to_string(loaded_data_size) << " bytes from " << filename << std::endl; 99 | in.read((char*)host_array.data(), loaded_data_size); 100 | 101 | size_t in_get_size = in.gcount(); 102 | if (in_get_size != loaded_data_size) { 103 | return std::vector(); 104 | } 105 | in.close(); 106 | // If we succeed, return an array with values. 
107 | return host_array; 108 | } 109 | 110 | template 111 | struct loadWeightFromBin 112 | { 113 | public: 114 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename) { 115 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 116 | if (host_array.empty()) { 117 | return; 118 | } 119 | 120 | cudaH2Dcpy(ptr, host_array.data(), host_array.size()); 121 | return; 122 | } 123 | }; 124 | 125 | template 126 | struct loadWeightFromBin 127 | { 128 | public: 129 | static void internalFunc(T_OUT* ptr, std::vector shape, std::string filename) { 130 | std::vector host_array = loadWeightFromBinHelper(shape, filename); 131 | if (host_array.empty()) { 132 | return; 133 | } 134 | 135 | T_FILE* ptr_tmp; 136 | GPUMalloc(&ptr_tmp, host_array.size()); 137 | cudaH2Dcpy(ptr_tmp, host_array.data(), host_array.size()); 138 | cuda_type_conversion(ptr, ptr_tmp, host_array.size()); 139 | GPUFree(ptr_tmp); 140 | return; 141 | } 142 | }; 143 | 144 | // !!(wrong case)C++委员会规定:函数模板不支持模板偏特化 145 | // template 146 | // typename std::enable_if::value, int>::type loadWeightFromBin(T_OUT* ptr, std::vector shape, std::string filename) 147 | // { 148 | // std::vector host_array = loadWeightFromBinHelper(shape, filename); 149 | 150 | // if (host_array.empty()) { 151 | // return 0; 152 | // } 153 | 154 | // cudaH2Dcpy(ptr, host_array.data(), host_array.size()); 155 | // return 0; 156 | // } 157 | 158 | // template 159 | // typename std::enable_if::value, int>::type loadWeightFromBin(T_OUT* ptr, std::vector shape, std::string filename) 160 | // { 161 | // std::vector host_array = loadWeightFromBinHelper(shape, filename); 162 | 163 | // if (host_array.empty()) { 164 | // return 0; 165 | // } 166 | 167 | 168 | // T_FILE* ptr_tmp; 169 | // GPUMalloc(&ptr_tmp, host_array.size()); 170 | // cudaH2Dcpy(ptr_tmp, host_array.data(), host_array.size()); 171 | // cuda_type_conversion(ptr, ptr_tmp, host_array.size()); 172 | // GPUFree(ptr_tmp); 173 | // return 0; 174 | // } 175 | 176 | template struct loadWeightFromBin; 177 | template struct loadWeightFromBin; 178 | template struct loadWeightFromBin; 179 | template struct loadWeightFromBin; 180 | -------------------------------------------------------------------------------- /tests/unittests/test_mask_softmax.cu: -------------------------------------------------------------------------------- 1 | #include // std::fill_n 2 | #include // snprintf 3 | #include // expf, log 4 | #include // rand 5 | #include // std::string 6 | #include // std::vector 7 | 8 | #include 9 | #include "src/kernels/attn_softmax_kernel.h" 10 | // (RussWong)note: 11 | // there is no cpu kernel implementation now, and if you bought my CUDA lesson, you can find CPU softmax kernel. 
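// (hedged reference, not taken from the GPU kernel) a CPU check, if added, would compute for each
// (batch b, head h, query row i) a numerically stable masked softmax over the k_length axis, e.g.
//   x_j = mask[b][i][j] > 0 ? scale * qk[b][h][i][j] : -large
//   score[b][h][i][j] = exp(x_j - max_j x_j) / sum_j exp(x_j - max_j x_j)
// the exact masking convention (multiplicative vs. additive -inf) should follow attn_softmax_kernel.cu.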
12 | // we compare the kernel correctnesss by eyes and result print infos 13 | // `./test_mask_softmax 1` to test half GPU kernel 14 | // `./test_mask_softmax` to test fp32 GPU kernel 15 | #define TEST_MASKED_SOFTMAX(dtype) \ 16 | dtype *h_qk; \ 17 | dtype *d_qk; \ 18 | h_qk = (dtype *)malloc(sizeof(dtype) * qk_size); \ 19 | cudaMalloc((void **)&d_qk, sizeof(dtype) * qk_size); \ 20 | dtype *h_score; \ 21 | dtype *d_score; \ 22 | h_score = (dtype *)malloc(sizeof(dtype) * qk_size); \ 23 | cudaMalloc((void **)&d_score, sizeof(dtype) * qk_size); \ 24 | dtype *h_mask; \ 25 | dtype *d_mask; \ 26 | h_mask = (dtype *)malloc(sizeof(dtype) * batch_size * q_length * k_length); \ 27 | cudaMalloc((void **)&d_mask, sizeof(dtype) * batch_size * q_length * k_length); \ 28 | for (int i = 0; i < qk_size; i++) \ 29 | { \ 30 | h_qk[i] = i % 8; \ 31 | } \ 32 | for (int i = 0; i < batch_size * q_length * k_length; i++) \ 33 | { \ 34 | h_mask[i] = (dtype)(1); \ 35 | } \ 36 | cudaMemcpy(d_qk, h_qk, sizeof(dtype) * qk_size, cudaMemcpyHostToDevice); \ 37 | cudaMemcpy(d_mask, h_mask, sizeof(dtype) * batch_size * q_length * k_length, cudaMemcpyHostToDevice); \ 38 | DataType type = getTensorType(); \ 39 | TensorWrapper *qk = new TensorWrapper(Device::GPU, type, {batch_size, head_num, q_length, k_length}, d_qk); \ 40 | TensorWrapper *mask = new TensorWrapper(Device::GPU, type, {batch_size, q_length, k_length}, d_mask); \ 41 | TensorWrapper *score = new TensorWrapper(Device::GPU, type, {batch_size, head_num, q_length, k_length}, d_score); \ 42 | std::cout << "before launch softmax kernel" << std::endl; \ 43 | launchScaleMaskAndSoftmax(qk, mask, score, scale); \ 44 | std::cout << "after launch softmax kernel" << std::endl; \ 45 | std::cout << "cuda memcpy device to host" << std::endl; \ 46 | cudaMemcpy(h_score, score->data, sizeof(dtype) * qk_size, cudaMemcpyDeviceToHost); \ 47 | for (int i = 0; i < qk_size; i++) \ 48 | { \ 49 | printf("attn score[%d] = %f\n", i, (float)h_score[i]); \ 50 | } \ 51 | free(h_qk); \ 52 | free(h_score); \ 53 | free(h_mask); \ 54 | cudaFree(d_qk); \ 55 | cudaFree(d_score); \ 56 | cudaFree(d_mask); 57 | 58 | int main(int argc, char *argv[]) 59 | { 60 | const int batch_size = 1; 61 | const int head_num = 2; 62 | const int q_length = 8; 63 | const int k_length = 8; 64 | const int head_size = 4; 65 | float scale = rsqrtf(float(head_size)); 66 | // debug info, better to retain: std::cout <<"batch_size=" << batch_size << " vocab_size=" << vocab_size << std::endl; 67 | const int qk_size = batch_size * head_num * q_length * k_length; 68 | if (argv[1]) 69 | { 70 | TEST_MASKED_SOFTMAX(half); 71 | } 72 | else 73 | { 74 | TEST_MASKED_SOFTMAX(float); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/kernels/topK.cu: -------------------------------------------------------------------------------- 1 | #include //FLT_MIN 2 | #include 3 | #include 4 | #include "src/kernels/topK.h" 5 | #include 6 | 7 | // Note: a b两个topK reduce输出一个topK 8 | template 9 | __device__ topK reduce_functor(const topK& a, const topK& b) { 10 | topK res = a; 11 | for(int i = 0; i < K; i++){ 12 | res.insertHeap(b.val[i], b.id[i]); 13 | } 14 | return res; 15 | } 16 | // gridsize:bs * beamwidth * BlockPerBeam 17 | // blocksize: 256 18 | // shape infer: [bs, beamwidth, vocab size] => [bs, beamwidth, BlockPerBeam, K] 19 | template 20 | __global__ void topK_kernel_round1(const T* probs, const int vocab_size, 21 | int* topK_ids, T* topK_vals) 22 | { 23 | typedef cub::BlockReduce, 
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage temp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int row_id = bid / BlockPerBeam;
    int block_lane = bid % BlockPerBeam;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for(int data_id = tid + block_lane * blockSize; data_id < vocab_size; data_id += BlockPerBeam * blockSize){
        int data_offset = data_id + row_id * vocab_size;
        T data = probs[data_offset];
        // thread_topK.insertHeap(data, data_offset); // bug: the id should be local within each bs*bw row;
        // with this line and bsxbm = 2, the 2nd row's topK ids would come out as global offsets such as
        // 59999, 59998, ..., whereas within that row they should be local ids such as 29999, 29998, ...
        thread_topK.insertHeap(data, data_id);
    }
    // block local reduce
    topK<T, K> block_topK = blockreduce(temp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if(tid == 0){
        for(int k_offset = 0; k_offset < K; k_offset++) {
            // topK_vals[row_id * vocab_size + block_lane * blockSize + k_offset] = block_topK.val[k_offset]; // bug
            topK_vals[row_id * BlockPerBeam * K + block_lane * K + k_offset] = block_topK.val[k_offset];
            topK_ids[row_id * BlockPerBeam * K + block_lane * K + k_offset] = block_topK.id[k_offset]; // the output offset must be computed from the output buffer's shape
        }
    }
}
// shape infer: [bs, beamwidth, BlockPerBeam, K] => [bs, beamwidth, K]
// the ids are global word ids within beam width * vocab size
// gridSize = bs
// blockSize = 256
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round2(const int* topK_ids, const T* topK_vals,
                                   int* final_topK_ids, T* final_topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage temp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int row_id = bid;
    topK<T, K> thread_topK;
    thread_topK.init(); // initialize before inserting, as in round 1
    // thread local reduce
    for(int i = tid; i < BlockPerBeam * K; i += blockDim.x) {
        int data_offset = bid * BlockPerBeam * K + i;
        thread_topK.insertHeap(topK_vals[data_offset], topK_ids[data_offset]); // read the id with the same row offset as the value
    }
    // block reduce
    topK<T, K> block_topK = blockreduce(temp_storage).Reduce(thread_topK, reduce_functor<T, K>);
    if(tid == 0){
        for(int k_offset = 0; k_offset < K; k_offset++) {
            // topK_vals[row_id * vocab_size + block_lane * blockSize + k_offset] = block_topK.val[k_offset]; // bug
            final_topK_vals[bid * K + k_offset] = block_topK.val[k_offset];
            final_topK_ids[bid * K + k_offset] = block_topK.id[k_offset];
        }
    }
}

template <typename T>
void launchTopKforBeamSearch(TensorWrapper<T> *probs,
                             TensorWrapper<int> *topk_ids,
                             TensorWrapper<T> *topk_vals,
                             TensorWrapper<int> *final_topk_ids,
                             TensorWrapper<T> *final_topk_vals)
{
    // support both beamsearch and sampling topK by folding beamwidth into batchsize; with bsxbm = bs * bw, the probs shape is [bs*bw, vocabsize]
    int bsxbm = probs->shape[0];
    int vocab_size = probs->shape[1];
    constexpr int BlockPerBeam = 8;
    constexpr int beamwidth = 1;
    constexpr int K = 5;
    // buffer size
    int topK_val_buf_size = bsxbm * BlockPerBeam * K;
    int topK_ids_buf_size = bsxbm * BlockPerBeam * K;
    int final_topK_val_buf_size = bsxbm * K;

    T* topK_vals = topk_vals->data;
    int* topK_ids = topk_ids->data;
    T* final_topK_vals = final_topk_vals->data;
    int* final_topK_ids = final_topk_ids->data;
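    // The intermediate buffers hold the [bsxbm, BlockPerBeam, K] candidates produced by round 1;
    // round 2 then reduces each row's BlockPerBeam * K candidates down to the final [bsxbm, K] ids/vals.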
    // prepare launch
    // TODO: add a GPUConfig API to easily query the GPU config, e.g. the max block num
    // GPUConfig config;
    // int maxBlockNums = config.getMaxBlockNums();
    // TODO: allocate block nums more flexibly according to the shape
    // constexpr int BlockPerBeam = 8;
    int maxBlockNums = 1024;
    int BlockNums1 = std::min(bsxbm * BlockPerBeam, maxBlockNums);
    int BlockNums2 = std::min(bsxbm, maxBlockNums);
    dim3 grid_round1(BlockNums1);
    dim3 block_round1(256);
    dim3 grid_round2(BlockNums2);
    dim3 block_round2(256);
    // debug info, better to retain: std::cout << "in cu file, before launch" << std::endl;
    topK_kernel_round1<T, K, 256, BlockPerBeam>
        <<<grid_round1, block_round1>>>(probs->data, vocab_size, topK_ids, topK_vals);
    topK_kernel_round2<T, K, 256, BlockPerBeam>
        <<<grid_round2, block_round2>>>(topK_ids, topK_vals, final_topK_ids, final_topK_vals);
    // debug info, better to retain: std::cout << "in cu file, after launch" << std::endl;
}

template void launchTopKforBeamSearch(TensorWrapper<float> *probs,
                                      TensorWrapper<int> *topk_ids,
                                      TensorWrapper<float> *topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<float> *final_topk_vals);

template void launchTopKforBeamSearch(TensorWrapper<half> *probs,
                                      TensorWrapper<int> *topk_ids,
                                      TensorWrapper<half> *topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<half> *final_topk_vals);
--------------------------------------------------------------------------------
/src/kernels/topK_bk.cu:
--------------------------------------------------------------------------------
#include <float.h> // FLT_MIN
#include <cuda.h>
#include "src/kernels/topK.h"
#include <cub/cub.cuh>

// Note: reduce two topK structs (a and b) into a single topK
template <typename T, int K>
__device__ topK<T, K> reduce_functor(const topK<T, K> &a, const topK<T, K> &b)
{
    topK<T, K> res = a;
    for (int i = 0; i < K; i++) {
        res.insertHeap(b.val[i], b.id[i]);
    }
    return res;
}
// gridsize: bs * beam_width * BlockPerBeam
// blocksize: 256
// shape infer: [bs, beam_width, vocab size] => [bs, beam_width, BlockPerBeam, K]; out of the vocab_size entries, each of the BlockPerBeam blocks selects its own top K
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round1(const T *probs, const int vocab_size,
                                   int *topK_ids, T *topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage tmp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int row_id = bid / BlockPerBeam;
    int block_lane = bid % BlockPerBeam;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for (int data_id = tid + block_lane * blockSize; data_id < vocab_size; data_id += BlockPerBeam * blockSize) {
        int data_offset = data_id + row_id * vocab_size;
        T data = probs[data_offset];
        thread_topK.insertHeap(data, data_offset);
        // if (bid == 1 && data_id < 10) {
        //     printf("ROUND1, 1st block, top1 vals = %f, top1 id = %d\n", data, data_offset);
        // }
    }
    // typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    // __shared__ typename blockreduce::TempStorage tmp_storage;
    topK<T, K> block_topk = blockreduce(tmp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if (tid == 0) {
        for (int k_offset = 0; k_offset < K; k_offset++) {
            int dst_offset = row_id * BlockPerBeam * K + block_lane * K + k_offset;
            topK_vals[dst_offset] = block_topk.val[k_offset];
            topK_ids[dst_offset] = block_topk.id[k_offset];
        }
    }

}
// shape infer: [bs, beam_width, BlockPerBeam, K] => [bs, beam_width, K]; this is the sampling topK (=> [bs, beam_width, K] would be the beamsearch topK); note: write a dedicated beamsearch topK later
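// e.g. with bs * beam_width = 2, BlockPerBeam = 8 and K = 5, round 1 leaves 8 * 5 = 40 candidate
// (val, id) pairs per row, and each round-2 block below reduces those 40 pairs to the final 5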
// gridSize = bs
// blockSize = 256
template <typename T, int K, int blockSize, int BlockPerBeam>
__global__ void topK_kernel_round2(const int *topK_ids, const T *topK_vals,
                                   int *final_topK_ids, T *final_topK_vals)
{
    typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    __shared__ typename blockreduce::TempStorage tmp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int row_id = bid;
    topK<T, K> thread_topK;
    thread_topK.init();
    // thread local reduce
    for (int data_id = tid; data_id < BlockPerBeam * K; data_id += blockSize) {
        int data_offset = data_id + bid * BlockPerBeam * K;

        thread_topK.insertHeap(topK_vals[data_offset], topK_ids[data_offset]);
        // if (bid == 0 && data_id == 0) {
        //     printf("ROUND2, 1st block, top1 vals = %f, top1 id = %d\n", topK_vals[data_offset], topK_ids[data_offset]);
        // }
    }

    // typedef cub::BlockReduce<topK<T, K>, blockSize> blockreduce;
    // __shared__ typename blockreduce::TempStorage tmp_storage;
    topK<T, K> block_topk = blockreduce(tmp_storage).Reduce(thread_topK, reduce_functor<T, K>);

    if (tid == 0) {
        // int beam_id = (blockDim.x * blockIdx.x + tid) / BlockPerBeam / K;
        for (int k_offset = 0; k_offset < K; k_offset++) {
            int dst_offset = bid * K + k_offset;
            final_topK_vals[dst_offset] = block_topk.val[k_offset];
            final_topK_ids[dst_offset] = block_topk.id[k_offset];
        }
    }
}

template <typename T>
void launchTopKforBeamSearch(TensorWrapper<T> *probs,
                             // TensorWrapper<T>* topk_workspace
                             TensorWrapper<int> *tmp_topk_ids,
                             TensorWrapper<T> *tmp_topk_vals,
                             TensorWrapper<int> *final_topk_ids,
                             TensorWrapper<T> *final_topk_vals)
{
    int batch_size = probs->shape[0];
    int vocab_size = probs->shape[1];
    constexpr int BlockPerBeam = 8;
    constexpr int beam_width = 1;
    constexpr int K = 5;
    // buffer size
    // int topK_val_buf_size = batch_size * beam_width * BlockPerBeam * beam_width;
    // int topK_ids_buf_size = batch_size * beam_width * BlockPerBeam * beam_width;
    // int final_topK_val_buf_size = batch_size * beam_width; // sampling topK buf size; beamsearch topK size = [batch_size * beam_width * beam_width]
    // memory plan
    T *topK_vals = tmp_topk_vals->data;         // topK_val_buf_size
    int *topK_ids = tmp_topk_ids->data;         // topK_ids_buf_size
    T *final_topK_vals = final_topk_vals->data; // final_topK_val_buf_size
    int *final_topK_ids = final_topk_ids->data; // final_topK_val_buf_size
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int maxBlockNums = deviceProp.maxGridSize[0];
    int BlockNums1 = std::min(batch_size * beam_width * BlockPerBeam, maxBlockNums);
    int BlockNums2 = std::min(batch_size * beam_width, maxBlockNums);
    dim3 grid_round1(BlockNums1);
    dim3 block_round1(256);
    dim3 grid_round2(BlockNums2);
    dim3 block_round2(256);
    // debug info, better to retain: std::cout << "in cu file, before launch" << std::endl;
    topK_kernel_round1<T, K, 256, BlockPerBeam>
        <<<grid_round1, block_round1>>>(probs->data, vocab_size, topK_ids, topK_vals);
    topK_kernel_round2<T, K, 256, BlockPerBeam>
        <<<grid_round2, block_round2>>>(topK_ids, topK_vals, final_topK_ids, final_topK_vals);
    // debug info, better to retain: std::cout << "in cu file, after launch" << std::endl;
}
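// Hypothetical usage sketch (not code from this file): assuming probs is a TensorWrapper<float> of
// shape [bs * beam_width, vocab_size] and the four output wrappers were allocated with the buffer
// sizes noted above, a caller would simply do
//     launchTopKforBeamSearch(probs, tmp_topk_ids, tmp_topk_vals, final_topk_ids, final_topk_vals);
// The explicit instantiations below are what make that call link for float and half.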
template void launchTopKforBeamSearch(TensorWrapper<float> *probs,
                                      TensorWrapper<int> *tmp_topk_ids,
                                      TensorWrapper<float> *tmp_topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<float> *final_topk_vals);
template void launchTopKforBeamSearch(TensorWrapper<half> *probs,
                                      TensorWrapper<int> *tmp_topk_ids,
                                      TensorWrapper<half> *tmp_topk_vals,
                                      TensorWrapper<int> *final_topk_ids,
                                      TensorWrapper<half> *final_topk_vals);
--------------------------------------------------------------------------------
/src/kernels/rmsnorm_kernel.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "src/utils/cuda_debug_utils.cuh"
#include "src/kernels/rmsnorm_kernel.h"
// bugs1: the 2nd warpReduceSum returned 0, because with blockDim.x < 32, blockDim.x / 32 = 0
// bugs2: the output buffer values were the same as before the call, because we never actually wrote to the output address
// bugs3: only the output buffer's first 32 values were right; with vectorized access a row holds hidden_units / vec_size elements, so the row stride used to move the pointer must be chosen carefully
// bugs4: remember to add __syncthreads() in the fp32/fp16 kernels, otherwise the result is wrong, e.g. some outputs come back as 0
template <typename T>
__device__ T warpReduceSum(T val){
    for(int i = 32 / 2; i > 0; i >>= 1){
        val += __shfl_xor_sync(0xffffffff, val, i);
    }
    return val; // with the xor butterfly pattern every lane of the warp ends up holding the full sum
}
// note!!! when blocksize < 32, using blockDim.x / 32 for the warp count is wrong; we must round it up instead
template <typename T>
__device__ T blockReduceSum(T val){
    int tid = threadIdx.x;
    int wid = tid / 32;
    int laneid = tid % 32;
    int warpnum = (blockDim.x + 31) / 32;
    static __shared__ T warpsum[64];
    val = warpReduceSum<T>(val);
    if(laneid == 0){
        warpsum[wid] = val;
    }
    __syncthreads();
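    // only the first warpnum slots of warpsum are valid; threads past that contribute 0, so one more
    // warp-level reduction over these partial sums yields the whole block's total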
    T sum = tid < warpnum ? warpsum[tid] : (T)0;
    sum = warpReduceSum<T>(sum); // the 0th thread ends up owning the block sum; no extra shuffle/broadcast is needed here
    return sum;
}
// 1. this kernel is used at the beginning of every decoder layer and after the last of the 32 decoder layers
// 2. the thread count is chosen assuming the hidden size is divisible by 4 (fp32) and 2 (fp16)
template <typename T>
__global__ void RMSNorm(T* decoder_out, // [num tokens, q_hidden_units]
                        T* decoder_residual,
                        T* scale, // [q_hidden_units], RMSNorm weights
                        float eps, // RMSNorm eps
                        int num_tokens,
                        int hidden_units){
    int vec_size = Vec<T>::size;
    using Vec_t = typename Vec<T>::Type;
    float thread_sum = 0.0f;
    Vec_t* dout = reinterpret_cast<Vec_t*>(decoder_out + blockIdx.x * hidden_units);
    Vec_t* rsd;
    rsd = reinterpret_cast<Vec_t*>(decoder_residual + blockIdx.x * hidden_units);
    for (int idx = threadIdx.x; idx < hidden_units / vec_size; idx += blockDim.x) {
        Vec_t vec = dout[idx];
        rsd[idx] = vec;
        thread_sum += vec.x * vec.x;
        thread_sum += vec.y * vec.y;
        thread_sum += vec.z * vec.z;
        thread_sum += vec.w * vec.w;
    }
    thread_sum = blockReduceSum<float>(thread_sum);
    __shared__ float inv_mean;
    if (threadIdx.x == 0) {
        inv_mean = rsqrtf((float)thread_sum / hidden_units + eps);
    }
    __syncthreads();
    Vec_t* s = reinterpret_cast<Vec_t*>(scale);
    for (int idx = threadIdx.x; idx < hidden_units / vec_size; idx += blockDim.x) {
        Vec_t out = dout[idx]; // note: the pointer offset is already in units of vec_size elements
        dout[idx].x = out.x * inv_mean * s[idx].x;
        dout[idx].y = out.y * inv_mean * s[idx].y;
        dout[idx].z = out.z * inv_mean * s[idx].z;
        dout[idx].w = out.w * inv_mean * s[idx].w;
    }
}

template <>
__global__ void RMSNorm(half* decoder_out, // [num tokens, q_hidden_units]
                        half* decoder_residual,
                        half* scale, // [q_hidden_units], RMSNorm weights
                        float eps, // RMSNorm eps
                        int num_tokens,
                        int hidden_units){
    int vec_size = Vec<half>::size;
    using Vec_t = typename Vec<half>::Type;
    int batch_id = blockIdx.x;
    int tid = threadIdx.x;
    Vec_t* s;
    Vec_t* dout = reinterpret_cast<Vec_t*>(decoder_out + batch_id * hidden_units);
    Vec_t* rsd;
    if (decoder_residual != nullptr) {
        rsd = reinterpret_cast<Vec_t*>(decoder_residual + batch_id * hidden_units);
    }
    float thread_accm = 0.0f;
    for(int i = tid; i < hidden_units / vec_size; i += blockDim.x) {
        Vec_t out = dout[i]; // note: the pointer offset is already in units of vec_size elements
        if (decoder_residual != nullptr) {
            rsd[i] = out;
        }
        thread_accm += __half2float(out.x) * __half2float(out.x);
        thread_accm += __half2float(out.y) * __half2float(out.y);
    } // x^2

    // mean(x^2)
    float blocksum = blockReduceSum<float>(thread_accm);
    __shared__ float inv_fenmu;
    if(tid == 0){
        inv_fenmu = rsqrtf(float(blocksum / hidden_units) + eps);
    }
    __syncthreads();
    // rmsnorm
    s = reinterpret_cast<Vec_t*>(scale);
    for(int i = tid; i < hidden_units / vec_size; i += blockDim.x) {
        Vec_t dout_h2 = dout[i];
        dout[i].x = s[i].x * __float2half(__half2float(dout_h2.x) * inv_fenmu);
        dout[i].y = s[i].y * __float2half(__half2float(dout_h2.y) * inv_fenmu);
    }
}

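// The generic kernel above assumes Vec<T>::Type is a 4-wide vector with x/y/z/w members (float4 for
// fp32), while the half specialization assumes a 2-wide Vec<half>::Type (half2), which is why it only
// accumulates and writes the x and y components.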
template <typename T>
void launchRMSNorm(TensorWrapper<T>* decoder_out, // [num tokens, hidden_units]
                   TensorWrapper<T>* decoder_residual,
                   LayerNormWeight<T>& attn_norm_weight, // RMSNorm weights
                   float eps, // RMSNorm eps
                   bool is_last // for printing the last rmsnorm output when debugging
                   )
{
    int num_tokens = decoder_out->shape[0];
    int hidden_units = decoder_out->shape[1];
    int vec_size = Vec<T>::size;
    int num_threads = hidden_units / 4; // vec size; assumes the hidden size is divisible by 4 and 2
    T* rsd = decoder_residual->data;
    dim3 grid(num_tokens);
    dim3 block(num_threads);
    RMSNorm<T><<<grid, block>>>(decoder_out->data,
                                rsd,
                                attn_norm_weight.gamma,
                                eps,
                                num_tokens,
                                hidden_units);
#ifdef PRINT_DATA
    printf("rmsnorm kernel top2 result:\n");
    print_data<<<1, 1>>>(decoder_out->data);
#else
#endif
}

template void launchRMSNorm(TensorWrapper<float>* decoder_out, // [num tokens, hidden_units]
                            TensorWrapper<float>* decoder_residual,
                            LayerNormWeight<float>& attn_norm_weight, // RMSNorm weights
                            float eps, // RMSNorm eps
                            bool is_last);
template void launchRMSNorm(TensorWrapper<half>* decoder_out, // [num tokens, hidden_units]
                            TensorWrapper<half>* decoder_residual,
                            LayerNormWeight<half>& attn_norm_weight, // RMSNorm weights
                            float eps, // RMSNorm eps
                            bool is_last);
--------------------------------------------------------------------------------
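// Illustrative usage sketch (not a file from this repository): one way to drive launchRMSNorm from
// host code, mirroring how the unit tests above construct TensorWrapper objects. The function name
// rmsnorm_usage_sketch, the buffer names and the eps value 1e-6f are hypothetical, and it is assumed
// (as attn_norm_weight.gamma in the launcher suggests) that LayerNormWeight exposes a public gamma
// pointer. A real caller must fill the device buffers with actual activations and weights first.
#include <cuda_runtime.h>
#include "src/kernels/rmsnorm_kernel.h"

int rmsnorm_usage_sketch()
{
    const int num_tokens = 4;
    const int hidden_units = 4096;
    float *d_out, *d_rsd, *d_gamma;
    cudaMalloc((void**)&d_out, sizeof(float) * num_tokens * hidden_units);
    cudaMalloc((void**)&d_rsd, sizeof(float) * num_tokens * hidden_units);
    cudaMalloc((void**)&d_gamma, sizeof(float) * hidden_units);
    // ... copy real decoder activations into d_out and RMSNorm weights into d_gamma here ...
    DataType type = getTensorType<float>();
    TensorWrapper<float>* decoder_out = new TensorWrapper<float>(Device::GPU, type, {num_tokens, hidden_units}, d_out);
    TensorWrapper<float>* decoder_residual = new TensorWrapper<float>(Device::GPU, type, {num_tokens, hidden_units}, d_rsd);
    LayerNormWeight<float> norm_weight;
    norm_weight.gamma = d_gamma;
    // normalize in place: decoder_out is overwritten, the pre-norm input is kept in decoder_residual
    launchRMSNorm(decoder_out, decoder_residual, norm_weight, 1e-6f, /*is_last*/ false);
    cudaDeviceSynchronize();
    delete decoder_out;
    delete decoder_residual;
    cudaFree(d_out);
    cudaFree(d_rsd);
    cudaFree(d_gamma);
    return 0;
}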