├── .gitignore
├── .vscode
    ├── c_cpp_properties.json
    └── settings.json
├── CMakeLists.txt
├── README.md
├── docs
    └── 【CUDA编程】手撸大模型推理框架FasterLlama.md
├── fasterLlama
    ├── CMakeLists.txt
    ├── cuda
    │   ├── CMakeLists.txt
    │   ├── allocator.h
    │   ├── common.h
    │   ├── cuda_kernels.cuh
    │   ├── decoder_kernels.cu
    │   ├── decoder_kernels.cuh
    │   ├── decoding_kernels.cu
    │   ├── decoding_kernels.cuh
    │   ├── decoding_sampling.cu
    │   ├── open_decoder.cu
    │   └── utils.h
    ├── decoding_sampling.h
    ├── lib
    │   ├── libfldecoderkernel.so
    │   ├── libfldecodersampling.so
    │   ├── libfldecodingkernel.so
    │   └── libflopendecoder.so
    └── open_decoder.h
└── samples
    ├── CMakeLists.txt
    ├── llama_fp16.cu
    ├── llama_fp32.cu
    └── test.cu


/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore all files in the build/ directory
2 | build/
3 | 


--------------------------------------------------------------------------------
/.vscode/c_cpp_properties.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "configurations": [
 3 |         {
 4 |             "name": "Linux",
 5 |             "includePath": [
 6 |                 "${workspaceFolder}/**",
 7 |                 "/usr/local/cuda-11.7/include/**"
 8 |             ],
 9 |             "defines": [],
10 |             "cStandard": "c11",
11 |             "cppStandard": "c++11",
12 |             "intelliSenseMode": "linux-gcc-x64"
13 |         }
14 |     ],
15 |     "version": 4
16 | }


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.associations": {
 3 |         "array": "cpp",
 4 |         "chrono": "cpp",
 5 |         "functional": "cpp",
 6 |         "istream": "cpp",
 7 |         "ostream": "cpp",
 8 |         "ratio": "cpp",
 9 |         "tuple": "cpp",
10 |         "type_traits": "cpp",
11 |         "utility": "cpp",
12 |         "__functional_base": "cpp",
13 |         "__functional_base_03": "cpp",
14 |         "__hash_table": "cpp",
15 |         "__tree": "cpp",
16 |         "__tuple": "cpp",
17 |         "algorithm": "cpp",
18 |         "filesystem": "cpp",
19 |         "limits": "cpp",
20 |         "memory": "cpp",
21 |         "random": "cpp",
22 |         "string_view": "cpp",
23 |         "__locale": "cpp",
24 |         "__string": "cpp",
25 |         "string": "cpp",
26 |         "*.tcc": "cpp",
27 |         "iosfwd": "cpp",
28 |         "cstdint": "cpp",
29 |         "cmath": "cpp",
30 |         "sstream": "cpp",
31 |         "stdexcept": "cpp"
32 |     }
33 | }


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Top-level build script for FasterLlama: configures the CUDA toolchain and
 2 | # project-wide include paths / definitions, then adds the library and samples.
 3 | cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
 4 | 
 5 | # GPU architecture to compile for. 75 (compute capability 7.5) stays the
 6 | # default; pass -DCMAKE_CUDA_ARCHITECTURES=<arch> to build for another device.
 7 | # Set before project() so CUDA language initialisation sees the value.
 8 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 9 |     set(CMAKE_CUDA_ARCHITECTURES 75)
10 | endif()
11 | 
12 | project(FasterLlama LANGUAGES CXX CUDA)
13 | 
14 | set(COMMON_HEADER_DIRS
15 |     ${PROJECT_SOURCE_DIR}
16 |     ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
17 | )
18 | 
19 | # Location of the CUDA runtime libraries. The historical default is kept, but
20 | # it is now a cache entry so other toolkit versions / install prefixes can be
21 | # selected with -DFASTERLLAMA_CUDA_LIB_DIR=<path> instead of editing this file.
22 | set(FASTERLLAMA_CUDA_LIB_DIR "/usr/local/cuda-11.7/lib64"
23 |     CACHE PATH "Directory containing the CUDA runtime libraries")
24 | set(COMMON_LIB_DIRS
25 |     ${FASTERLLAMA_CUDA_LIB_DIR}
26 | )
27 | 
28 | include_directories(
29 |     ${COMMON_HEADER_DIRS}
30 | )
31 | 
32 | message(STATUS "Assign include directories (include_directories=${COMMON_HEADER_DIRS})")
33 | 
34 | # Disable assert() everywhere except in Debug configurations.
35 | add_compile_definitions($<$<NOT:$<CONFIG:Debug>>:NDEBUG>)
36 | 
37 | add_subdirectory(fasterLlama)
38 | add_subdirectory(samples)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 使用 CUDA C++ 实现的大模型推理框架 FasterLLaMA
 2 | 
 3 | ## 1 版本发布背景
 4 | 在 FasterLLaMA v1.0 中，笔者提供了一个 Decoder 模块和一套推理方案 Decoding 模型。目前 FasterLLaMA v1.0 仅适配 LLaMA2，至于 LLaMA3 及其他开源大模型的适配工作，将在后续版本逐步加入。其中，Decoder 相当于我们常说的 decoder layer；而 Decoding 则包含了整个解码的流程，包括词嵌入、解码层和采样解码等过程。
 5 | 
 6 | 针对 Decoder 模块的 GEMM 场景，笔者提供了基于 cuBLAS 的 INT8 量化实现，对模型权重和激活值进行 INT8 量化，量化粒度均为 per-channel，通过 INT8 量化的矩阵运算可以高效地利用 GPU 中的 INT8 Tensor Core，在保证低精度损失的前提下，取得较好的加速比（对比 FP16 运算精度而言）。需要注意的是，FasterLLaMA v1.0 仅支持在计算能力不低于 7.5 的设备上运行。另外，`Q*K` 乘法和 `QK*V` 乘法部分在 v1.0 版本仍然使用 FP32 类型，没有实现低精度量化。
 7 | 
 8 | 针对 Decoding 模型的解码场景，笔者参考了 Faster Transformer，提供了两种基于采样解码的实现：top-k 解码和 top-p 解码。
 9 | 
10 | 数据类型方面，目前 FasterLLaMA v1.0 支持 FP32 和 FP16 两种类型，笔者针对 FP16 类型对相关 Kernel 函数模板进行了特化。
11 | 
12 | 注意力机制方面，目前 FasterLLaMA v1.0 仅支持 MHA，计划在后续版本加入对 MQA 和 GQA 的支持。
13 | 
14 | ## 2 整体架构
15 | FasterLLaMA v1.0 基于 CUDA、cuBLAS、CUB 等 Nvidia 官方库实现，目前仅提供 C++ API，用户可以将它集成到本机 C++ 中构建的推理服务代码中。此外笔者还提供了一些简单的示例代码来演示如何在 C++ 中执行 Decoding 过程。
16 | 
17 | 下面是 Decoder 模块的整体架构图：
18 | ![](https://mmbiz.qpic.cn/sz_mmbiz_png/GJUG0H1sS5qX4u3gKYjsOZ7r3ib6Jk02RkszQibYbxMpzTOPryIsOxonbFgQicponrNVqWCrIvZiasb0heJcevSic3g/640?wx_fmt=png&amp;from=appmsg)
19 | 
20 | 下面是 Decoding 模型的整体架构图：
21 | ![](https://mmbiz.qpic.cn/sz_mmbiz_png/GJUG0H1sS5okzmlo35c3o3ibDdV7jLkLp6WL1ibGpZemlnWpgZaXxJjeTicicbzK2bQu5gqfq6SUTRbYXx7ibKAtYwg/640?wx_fmt=png&amp;from=appmsg)


--------------------------------------------------------------------------------
/fasterLlama/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Header search paths shared by every target declared beneath fasterLlama/.
 2 | set(FASTER_LLAMA_HEADER_DIR
 3 |     "${PROJECT_SOURCE_DIR}/fasterLlama"
 4 |     "${PROJECT_SOURCE_DIR}/fasterLlama/cuda"
 5 | )
 6 | include_directories(${FASTER_LLAMA_HEADER_DIR})
 7 | 
 8 | # The CUDA sources and their library targets are declared in cuda/.
 9 | add_subdirectory(cuda)
10 | 


--------------------------------------------------------------------------------
/fasterLlama/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Builds the four FasterLlama CUDA shared libraries.
 2 | 
 3 | set(FASTER_LLAMA_CUDA_HEADER_DIR
 4 |     ${PROJECT_SOURCE_DIR}/fasterLlama
 5 |     ${PROJECT_SOURCE_DIR}/fasterLlama/cuda
 6 | )
 7 | 
 8 | include_directories(
 9 |     ${FASTER_LLAMA_CUDA_HEADER_DIR}
10 | )
11 | 
12 | # NOTE(review): the shared libraries are written into the source tree
13 | # (fasterLlama/lib). Kept as-is because consumers may look for them there, but
14 | # build products normally belong under the binary directory.
15 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/fasterLlama/lib)
16 | 
17 | message(STATUS "Assign fasterLlama include directories (include_directories=${FASTER_LLAMA_CUDA_HEADER_DIR})")
18 | message(STATUS "in fasterLlama cuda Assign arch (arch=${CMAKE_CUDA_ARCHITECTURES})")
19 | 
20 | # fasterllama_add_cuda_library(<name> SOURCES <file>... [LINKS <target>...])
21 | #
22 | # Declares the shared library <name> from SOURCES with the settings common to
23 | # every FasterLlama CUDA library:
24 | #   * C++14 is required for the target and (PUBLIC) for whatever links to it;
25 | #   * one -gencode flag per entry of CMAKE_CUDA_ARCHITECTURES, applied to CUDA
26 | #     sources only so the nvcc-specific flag never reaches a C++ compile line;
27 | #   * CUDA separable compilation is enabled;
28 | #   * cuBLAS, the CUDA runtime and cuRAND are linked, followed by LINKS.
29 | #
30 | # Example:
31 | #   fasterllama_add_cuda_library(flopendecoder
32 | #       SOURCES open_decoder.cu
33 | #       LINKS fldecoderkernel)
34 | function(fasterllama_add_cuda_library name)
35 |     cmake_parse_arguments(PARSE_ARGV 1 arg "" "" "SOURCES;LINKS")
36 |     if(arg_UNPARSED_ARGUMENTS)
37 |         message(FATAL_ERROR
38 |             "fasterllama_add_cuda_library: unknown arguments: ${arg_UNPARSED_ARGUMENTS}")
39 |     endif()
40 |     if(NOT arg_SOURCES)
41 |         message(FATAL_ERROR "fasterllama_add_cuda_library: ${name} has no SOURCES")
42 |     endif()
43 | 
44 |     add_library(${name} SHARED ${arg_SOURCES})
45 |     target_compile_features(${name} PUBLIC cxx_std_14)
46 |     foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
47 |         target_compile_options(${name} PUBLIC
48 |             "$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_${arch},code=sm_${arch}>")
49 |     endforeach()
50 |     set_target_properties(${name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
51 |     target_link_libraries(${name} PUBLIC -lcublas -lcudart -lcurand ${arg_LINKS})
52 | endfunction()
53 | 
54 | fasterllama_add_cuda_library(fldecoderkernel
55 |     SOURCES decoder_kernels.cu)
56 | 
57 | fasterllama_add_cuda_library(fldecodingkernel
58 |     SOURCES decoding_kernels.cu)
59 | 
60 | fasterllama_add_cuda_library(flopendecoder
61 |     SOURCES open_decoder.cu
62 |     LINKS fldecoderkernel)
63 | 
64 | fasterllama_add_cuda_library(fldecodersampling
65 |     SOURCES decoding_sampling.cu
66 |     LINKS flopendecoder fldecodingkernel fldecoderkernel)


--------------------------------------------------------------------------------
/fasterLlama/cuda/allocator.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "utils.h"
 4 | #include <vector>
 5 | #include <cuda_runtime.h>
 6 | 
 7 | namespace FasterLLaMA
 8 | {
 9 | 
10 |   /**
11 |    * Make `i_device` the current CUDA device.
12 |    * i_device - device ID to set
13 |    * o_device - optional; when non-NULL it receives the ID of the device that
14 |    *            was current before the call, so the caller can switch back
15 |    * ret  - return code (the same as cudaError_t)
16 |    * With o_device == NULL cudaSetDevice is always called; otherwise the call
17 |    * is skipped when `i_device` is already the current device.
18 |    */
19 |   inline cudaError_t get_set_device(int i_device, int *o_device = NULL)
20 |   {
21 |     if (o_device == NULL)
22 |       return cudaSetDevice(i_device);
23 | 
24 |     int previous_device = 0;
25 |     cudaError_t err = cudaGetDevice(&previous_device);
26 |     if (err != cudaSuccess)
27 |       return err;
28 | 
29 |     if (previous_device != i_device)
30 |     {
31 |       err = cudaSetDevice(i_device);
32 |       if (err != cudaSuccess)
33 |         return err;
34 |     }
35 | 
36 |     *o_device = previous_device;
37 |     return cudaSuccess;
38 |   }
39 | 
40 |   /** Allocator back-ends; only CUDA has a specialisation in this header. */
41 |   enum class AllocatorType
42 |   {
43 |     CUDA,
44 |     TF,
45 |     TH
46 |   };
47 | 
48 |   /** Abstract interface for obtaining and releasing buffers. */
49 |   class IAllocator
50 |   {
51 |   public:
52 |     // Virtual so that destroying through an IAllocator pointer is well defined.
53 |     virtual ~IAllocator() {}
54 |     // Returns `size` bytes; `is_set_zero` requests zero-filled memory.
55 |     virtual void *malloc(size_t size, const bool is_set_zero = true) const = 0;
56 |     // Releases a pointer previously obtained from malloc().
57 |     virtual void free(void *ptr) const = 0;
58 |   };
59 | 
60 |   template <AllocatorType AllocType_>
61 |   class Allocator;
62 | 
63 |   /** Allocator backed by cudaMalloc / cudaFree on one fixed device. */
64 |   template <>
65 |   class Allocator<AllocatorType::CUDA> : public IAllocator
66 |   {
67 |     const int device_id_;
68 | 
69 |   public:
70 |     Allocator(int device_id) : device_id_(device_id) {}
71 | 
72 |     // Allocates on device_id_, zero-fills the buffer when is_set_zero is true
73 |     // (skipped for size 0), then restores the previously current device.
74 |     void *malloc(size_t size, const bool is_set_zero = true) const override
75 |     {
76 |       void *ptr = nullptr;
77 |       int o_device = 0;
78 |       CHECK_CUDA_ERROR(get_set_device(device_id_, &o_device));
79 |       CHECK_CUDA_ERROR(cudaMalloc(&ptr, size));
80 |       if (is_set_zero && size > 0)
81 |       {
82 |         CHECK_CUDA_ERROR(cudaMemset(ptr, 0, size));
83 |       }
84 |       CHECK_CUDA_ERROR(get_set_device(o_device));
85 |       return ptr;
86 |     }
87 | 
88 |     // Frees on device_id_, then restores the previously current device.
89 |     void free(void *ptr) const override
90 |     {
91 |       int o_device = 0;
92 |       CHECK_CUDA_ERROR(get_set_device(device_id_, &o_device));
93 |       CHECK_CUDA_ERROR(cudaFree(ptr));
94 |       CHECK_CUDA_ERROR(get_set_device(o_device));
95 |     }
96 |   };
97 | 
98 | } // namespace FasterLLaMA
99 | 


--------------------------------------------------------------------------------
/fasterLlama/cuda/common.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <cublas_v2.h>
  5 | 
  6 | namespace FasterLLaMA
  7 | {
  8 | 
  9 |     /** Numeric precisions the traits classes below are specialised for. */
 10 |     enum class OperationType
 11 |     {
 12 |         FP32,
 13 |         FP16,
 14 |         INT8
 15 |     };
 16 | 
 17 |     /** Weights of a "ResNorm" layer: a gamma pointer and an epsilon (1e-5 by default). */
 18 |     template <typename T>
 19 |     struct ResNormWeight
 20 |     {
 21 |         T *gamma{nullptr};
 22 |         float eps{1e-5f};
 23 |     };
 24 | 
 25 |     /**
 26 |      * Weights of one dense layer. All pointers default to null.
 27 |      * NOTE(review): weight_scale presumably holds the quantization scales of
 28 |      * `kernel` - confirm against the code that fills it.
 29 |      */
 30 |     template <typename T, typename WeightType>
 31 |     struct DenseWeight
 32 |     {
 33 |         WeightType *kernel{nullptr};
 34 |         T *bias{nullptr};
 35 |         float *weight_scale{nullptr};
 36 |     };
 37 | 
 38 |     /** The four dense layers of an attention block: Q, K, V and output. */
 39 |     template <typename T, typename WeightType>
 40 |     struct AttentionWeight
 41 |     {
 42 |         DenseWeight<T, WeightType> query_weight;
 43 |         DenseWeight<T, WeightType> key_weight;
 44 |         DenseWeight<T, WeightType> value_weight;
 45 |         DenseWeight<T, WeightType> attention_output_weight;
 46 |     };
 47 | 
 48 |     /** The three dense layers (w1, w2, w3) of a feed-forward block. */
 49 |     template <typename T, typename WeightType>
 50 |     struct FFNWeight
 51 |     {
 52 |         DenseWeight<T, WeightType> w1_weight;
 53 |         DenseWeight<T, WeightType> w2_weight;
 54 |         DenseWeight<T, WeightType> w3_weight;
 55 |     };
 56 | 
 57 |     /**
 58 |      * Compile-time description of a precision: the element and alpha types
 59 |      * plus the cuBLAS compute type and the A/B/C matrix data types.
 60 |      */
 61 |     template <OperationType OpType_>
 62 |     class TransformerTraits;
 63 | 
 64 |     /** INT8: 8-bit integer A/B operands, 32-bit integer C and compute type. */
 65 |     template <>
 66 |     class TransformerTraits<OperationType::INT8>
 67 |     {
 68 |     public:
 69 |         using DataType = int8_t;
 70 |         using AlphaType = int32_t;
 71 |         static const OperationType OpType = OperationType::INT8;
 72 |         static const cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
 73 |         static const cudaDataType_t AType = CUDA_R_8I;
 74 |         static const cudaDataType_t BType = CUDA_R_8I;
 75 |         static const cudaDataType_t CType = CUDA_R_32I;
 76 |     };
 77 | 
 78 |     /** FP32: float operands; compute type is CUBLAS_COMPUTE_32F_FAST_16F. */
 79 |     template <>
 80 |     class TransformerTraits<OperationType::FP32>
 81 |     {
 82 |     public:
 83 |         using DataType = float;
 84 |         using AlphaType = float;
 85 |         static const OperationType OpType = OperationType::FP32;
 86 |         static const cublasComputeType_t computeType = CUBLAS_COMPUTE_32F_FAST_16F;
 87 |         static const cudaDataType_t AType = CUDA_R_32F;
 88 |         static const cudaDataType_t BType = CUDA_R_32F;
 89 |         static const cudaDataType_t CType = CUDA_R_32F;
 90 |     };
 91 | 
 92 |     /** FP16: half operands, half alpha and CUBLAS_COMPUTE_16F. */
 93 |     template <>
 94 |     class TransformerTraits<OperationType::FP16>
 95 |     {
 96 |     public:
 97 |         using DataType = half;
 98 |         using AlphaType = half;
 99 |         static const OperationType OpType = OperationType::FP16;
100 |         static const cublasComputeType_t computeType = CUBLAS_COMPUTE_16F;
101 |         static const cudaDataType_t AType = CUDA_R_16F;
102 |         static const cudaDataType_t BType = CUDA_R_16F;
103 |         static const cudaDataType_t CType = CUDA_R_16F;
104 |     };
105 | 
106 |     /** Decoder-side traits; each specialisation inherits TransformerTraits unchanged. */
107 |     template <OperationType OpType_>
108 |     class DecoderTransformerTraits;
109 | 
110 |     template <>
111 |     class DecoderTransformerTraits<OperationType::FP32> : public TransformerTraits<OperationType::FP32>
112 |     {
113 |     };
114 | 
115 |     template <>
116 |     class DecoderTransformerTraits<OperationType::FP16> : public TransformerTraits<OperationType::FP16>
117 |     {
118 |     };
119 | 
120 |     template <>
121 |     class DecoderTransformerTraits<OperationType::INT8> : public TransformerTraits<OperationType::INT8>
122 |     {
123 |     };
124 | 
125 | }
126 | 


--------------------------------------------------------------------------------
/fasterLlama/cuda/cuda_kernels.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "common.h"
 3 | #include <cub/cub.cuh>
 4 | #include <assert.h>
 5 | #include <cuda_fp16.h>
 6 | #include <curand_kernel.h>
 7 | #include <cfloat>
 8 | 
 9 | namespace FasterLLaMA
10 | {
11 | 
12 |     // Converts a float to int8 with the PTX instruction cvt.rni.sat.s8.f32
13 |     // (round to nearest integer, saturating to the int8 range).
14 |     static inline __device__ int8_t float_to_int8_rn(float x)
15 |     {
16 |         uint32_t converted;
17 |         asm volatile("cvt.rni.sat.s8.f32 %0, %1;"
18 |                      : "=r"(converted)
19 |                      : "f"(x));
20 |         return reinterpret_cast<const int8_t &>(converted);
21 |     }
22 | 
23 |     // Binary functor returning a + b.
24 |     template <typename T>
25 |     struct SumOp
26 |     {
27 |         __device__ __forceinline__ T operator()(const T &a, const T &b) const { return a + b; }
28 |     };
29 | 
30 |     // Binary functor returning max(a, b).
31 |     template <typename T>
32 |     struct MaxOp
33 |     {
34 |         __device__ __forceinline__ T operator()(const T &a, const T &b) const { return max(a, b); }
35 |     };
36 | 
37 |     // Butterfly (xor-shuffle) reduction over a 32-lane warp using the full
38 |     // mask 0xffffffff; every lane ends up holding the combined value.
39 |     template <template <typename> class ReductionOp, typename T>
40 |     __inline__ __device__ T warpAllReduce(T val)
41 |     {
42 |         ReductionOp<T> combine{};
43 | #pragma unroll
44 |         for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2)
45 |         {
46 |             val = combine(val, __shfl_xor_sync(0xffffffff, val, lane_mask, 32));
47 |         }
48 |         return val;
49 |     }
50 | 
51 |     // Sums `val` over the block and returns the total to every thread.
52 |     // Indexing uses threadIdx.x / blockDim.x only, and only blockDim.x / 32
53 |     // warp partials are read back, so blockDim.x is assumed a multiple of 32.
54 |     template <typename T>
55 |     __inline__ __device__ T blockAllReduceSum(T val)
56 |     {
57 |         static __shared__ T shared[32];
58 |         __shared__ T result;
59 |         int lane_id = threadIdx.x % 32;
60 |         int warp_id = threadIdx.x / 32;
61 | 
62 |         val = warpAllReduce<SumOp, T>(val);
63 | 
64 |         if (lane_id == 0)
65 |             shared[warp_id] = val;
66 |         __syncthreads();
67 | 
68 |         val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane_id] : (T)0.0f;
69 |         val = warpAllReduce<SumOp, T>(val);
70 |         if (threadIdx.x == 0)
71 |             result = val;
72 |         __syncthreads();
73 |         return result;
74 |     }
75 | 
76 |     // Same scheme as blockAllReduceSum, reducing with max (identity -FLT_MAX).
77 |     template <typename T>
78 |     __inline__ __device__ T blockAllReduceMax(T val)
79 |     {
80 |         static __shared__ T shared[32];
81 |         __shared__ T result;
82 |         int lane_id = threadIdx.x % 32;
83 |         int warp_id = threadIdx.x / 32;
84 | 
85 |         val = warpAllReduce<MaxOp, T>(val);
86 | 
87 |         if (lane_id == 0)
88 |             shared[warp_id] = val;
89 |         __syncthreads();
90 | 
91 |         val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane_id] : (T)(-FLT_MAX);
92 |         val = warpAllReduce<MaxOp, T>(val);
93 |         if (threadIdx.x == 0)
94 |             result = val;
95 |         __syncthreads();
96 |         return result;
97 |     }
98 | 
99 | } // FasterLLaMA


--------------------------------------------------------------------------------
/fasterLlama/cuda/decoder_kernels.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_fp16.h>
  4 | 
  5 | namespace FasterLLaMA
  6 | {
  7 | 
  8 |      /** resNorm followed by quantization
  9 |       * grid(batch_size * seq_len)  block(128)
 10 |       * output: [batch_size, seq_len, hidden_units]
 11 |       * input: [batch_size, seq_len, hidden_units]
 12 |       * gamma: [hidden_units, ]
 13 |       */
 14 |      template <typename DataType>
 15 |      __global__ void resNormQuantizedKernel(int8_t *__restrict__ output, const DataType *__restrict__ input, const DataType *__restrict__ gamma,
 16 |                                             float *__restrict__ norm_scale, const float eps, const int hidden_units);
 17 | 
 18 |      template <>
 19 |      __global__ void resNormQuantizedKernel(int8_t *__restrict__ output, const half *__restrict__ input, const half *__restrict__ gamma,
 20 |                                             float *__restrict__ norm_scale, const float eps, const int hidden_units);
 21 | 
 22 |      template <typename DataType>
 23 |      void launchResNormQuantizedKernel(int8_t *output, const DataType *input, const DataType *gamma,
 24 |                                        float *norm_scale, const float eps, const int nrows, const int hidden_units, cudaStream_t stream = 0);
 25 | 
 26 |      /** embeddingLookingUp
 27 |       * grid(batch_size, seq_len) block(128)
 28 |       * from_tensor: [batch_size, seq_len, hidden_units]
 29 |       * word_ids:    [batch_size, seq_len]
 30 |       */
 31 |      template <typename DataType>
 32 |      __global__ void embeddingLookingUpKernel(DataType *__restrict__ from_tensor, const DataType *__restrict__ embedding_table,
 33 |                                               const int *__restrict__ word_ids, const int hidden_units, const int seq_len);
 34 | 
 35 |      template <typename DataType>
 36 |      void launchEmbeddingLookingUpKernel(DataType *from_tensor, const DataType *embedding_table,
 37 |                                          const int *word_ids, const int hidden_units, const int batch_size, const int seq_len,
 38 |                                          cudaStream_t stream = 0);
 39 |      /** Per-channel quantization
 40 |       * src: [rows, cols]
 41 |       * dst: [rows, cols]
 42 |       * scale_ptr: [rows, ]
 43 |       */
 44 |      template <typename DataType>
 45 |      __global__ void perChannelQuantizedKernel(int8_t *__restrict__ dst, const DataType *__restrict__ src, float *__restrict__ scale_ptr,
 46 |                                                const int hidden_size);
 47 | 
 48 |      template <typename DataType>
 49 |      void perChannelQuantizedKernelLauncher(int8_t *dst, const DataType *src, float *scale_ptr, const int hidden_size,
 50 |                                             const int nrows, cudaStream_t stream = 0);
 51 | 
 52 |      /**
 53 |       * Dequantize, apply RoPE (rotary position embedding), quantize, transpose
 54 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
 55 |       * grid(head_num / warp_num, seq_len, batch_size * 2) block(32, warp_num), each warp process size_per_head elements
 56 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
 57 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
 58 |       * freq_cis: [max_seq_len, size_per_head]
 59 |       * q_out_scale k_out_scale: [batch_size, seq_len, head_num], absmax / 127.0f
 60 |       */
 61 |      __global__ void warpQKRoteEmbeddingQuantizedTransposeKernel(int8_t *q_buf, int8_t *k_buf, const int32_t *Q,
 62 |                                                                  const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
 63 |                                                                  const float *q_weight_scale, const float *k_weight_scale, float *q_out_scale,
 64 |                                                                  float *k_out_scale, float *freq_cis, const int batch_size, const int seq_len,
 65 |                                                                  const int start_pos, const int total_len, const int head_num,
 66 |                                                                  const int size_per_head);
 67 | 
 68 |      /**
 69 |       * Dequantize, apply RoPE (rotary position embedding), quantize, transpose
 70 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
 71 |       * grid(head_num, seq_len, batch_size * 2) block(128), each block process size_per_head(256) elements
 72 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
 73 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
 74 |       * freq_cis: [max_seq_len, size_per_head]
 75 |       * q_out_scale k_out_scale: [batch_size, seq_len, head_num], absmax / 127.0f
 76 |       */
 77 |      __global__ void blockQKRoteEmbeddingQuantizedTransposeForDim256Kernel(int8_t *q_buf, int8_t *k_buf, const int32_t *Q,
 78 |                                                                            const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
 79 |                                                                            const float *q_weight_scale, const float *k_weight_scale, float *q_out_scale,
 80 |                                                                            float *k_out_scale, float *freq_cis, const int batch_size, const int seq_len,
 81 |                                                                            const int start_pos, const int total_len, const int head_num,
 82 |                                                                            const int size_per_head);
 83 | 
 84 |      /**
 85 |       * Dequantize, apply RoPE (rotary position embedding), quantize, transpose
 86 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
 87 |       * grid(head_num, seq_len, batch_size * 2) block(size_per_head / 4), each block process size_per_head elements
 88 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
 89 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
 90 |       * freq_cis: [max_seq_len, size_per_head]
 91 |       * q_out_scale k_out_scale: [batch_size, head_num, seq_len], absmax / 127.0f
 92 |       */
 93 |      __global__ void blockQKRoteEmbeddingQuantizedTransposeKernel(int8_t *q_buf, int8_t *k_buf, const int32_t *Q,
 94 |                                                                   const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
 95 |                                                                   const float *q_weight_scale, const float *k_weight_scale, float *q_out_scale,
 96 |                                                                   float *k_out_scale, float *freq_cis, const int batch_size, const int seq_len,
 97 |                                                                   const int start_pos, const int total_len, const int head_num,
 98 |                                                                   const int size_per_head);
 99 | 
100 |      void launchQKRoteEmbeddingQuantizedTranspose(int8_t *q_buf, int8_t *k_buf, const int32_t *Q,
101 |                                                   const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
102 |                                                   const float *q_weight_scale, const float *k_weight_scale, float *q_out_scale,
103 |                                                   float *k_out_scale, float *freq_cis, const int batch_size, const int seq_len,
104 |                                                   const int start_pos, const int total_len, const int head_num,
105 |                                                   const int size_per_head, cudaStream_t stream = 0);
106 | 
107 |      /**
108 |       * Dequantize, apply RoPE (rotary position embedding), transpose
109 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
110 |       * grid(head_num / warp_num, seq_len, batch_size * 2) block(32, warp_num), each warp process size_per_head elements
111 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
112 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
113 |       * freq_cis: [max_seq_len, size_per_head]
114 |       */
115 |      __global__ void warpQKRoteEmbeddingTransposeKernel(float *q_buf, float *k_buf, const int32_t *Q,
116 |                                                         const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
117 |                                                         const float *q_weight_scale, const float *k_weight_scale, float *freq_cis,
118 |                                                         const int batch_size, const int seq_len,
119 |                                                         const int start_pos, const int total_len, const int head_num,
120 |                                                         const int size_per_head);
121 | 
122 |      /**
123 |       * Dequantize, apply RoPE (rotary position embedding), transpose
124 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
125 |       * grid(head_num, seq_len, batch_size * 2) block(128), each block process size_per_head(256) elements
126 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
127 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
128 |       * freq_cis: [max_seq_len, size_per_head]
129 |       */
130 |      __global__ void blockQKRoteEmbeddingTransposeForDim256Kernel(float *q_buf, float *k_buf, const int32_t *Q,
131 |                                                                   const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
132 |                                                                   const float *q_weight_scale, const float *k_weight_scale,
133 |                                                                   float *freq_cis, const int batch_size, const int seq_len,
134 |                                                                   const int start_pos, const int total_len, const int head_num,
135 |                                                                   const int size_per_head);
136 | 
137 |      /**
138 |       * Dequantize, apply RoPE (rotary position embedding), transpose
139 |       * Q K: [batch_size, seq_len, head_num, size_per_head]
140 |       * grid(head_num, seq_len, batch_size * 2) block(size_per_head / 4), each block process size_per_head elements
141 |       * q_inp_scale k_inp_scale: [batch_size, seq_len], absmax / 127.0f
142 |       * q_weight_scale k_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
143 |       * freq_cis: [max_seq_len, size_per_head]
144 |       */
145 |      __global__ void blockQKRoteEmbeddingTransposeKernel(float *q_buf, float *k_buf, const int32_t *Q,
146 |                                                          const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
147 |                                                          const float *q_weight_scale, const float *k_weight_scale,
148 |                                                          float *freq_cis, const int batch_size, const int seq_len,
149 |                                                          const int start_pos, const int total_len, const int head_num,
150 |                                                          const int size_per_head);
151 | 
152 |      void launchQKRoteEmbeddingTranspose(float *q_buf, float *k_buf, const int32_t *Q,
153 |                                          const int32_t *K, const float *q_inp_scale, const float *k_inp_scale,
154 |                                          const float *q_weight_scale, const float *k_weight_scale,
155 |                                          const float *freq_cis, const int batch_size, const int seq_len,
156 |                                          const int start_pos, const int total_len, const int head_num,
157 |                                          const int size_per_head, cudaStream_t stream = 0);
158 | 
159 |      /** K/V cache store, fp32 K and V. NOTE(review): presumably writes the new K/V rows into the caches from start_pos on - confirm in the .cu
160 |       * grid: [seq_len, head_num / blockDim.y, batch_size * 2]  block(size_per_head / 4, 256 / (size_per_head / 4))
161 |       * k_cache v_cache: [batch_size, head_num, max_seq_len, size_per_head]
162 |       * K V : [batch_size, head_num, seq_len, size_per_head]
163 |       */
164 |      __global__ void storeKVcacheKernel(float *__restrict__ k_cache, float *__restrict__ v_cache, const float *__restrict__ K,
165 |                                         const float *__restrict__ V, const int start_pos, const int seq_len, const int batch_size, const int head_num,
166 |                                         const int max_seq_len, const int size_per_head);
167 |      // NOTE(review): undocumented variant that takes the same parameter list as storeKVcacheKernel.
168 |      __global__ void storeKVcacheBlockKernel(float *__restrict__ k_cache, float *__restrict__ v_cache, const float *__restrict__ K,
169 |                                              const float *__restrict__ V, const int start_pos, const int seq_len, const int batch_size, const int head_num,
170 |                                              const int max_seq_len, const int size_per_head);
171 |      // Host-side launcher declaration for the fp32 cache store; stream defaults to 0.
172 |      void launchStoreKVcacheKernel(float *k_cache, float *v_cache, const float *K, const float *V, const int start_pos, const int seq_len,
173 |                                    const int batch_size, const int head_num, const int max_seq_len, const int size_per_head,
174 |                                    cudaStream_t stream = 0);
175 | 
176 |      /** Overload of the K/V cache store taking an int8 K (with float K scales) and an fp32 V.
177 |       * grid: [seq_len, head_num / blockDim.y, batch_size * 2]  block(size_per_head / 4, 256 / (size_per_head / 4))
178 |       * k_cache v_cache: [batch_size, head_num, max_seq_len, size_per_head]
179 |       * K V : [batch_size, head_num, seq_len, size_per_head]
180 |       * k_scale: [batch_size, head_num, seq_len]
181 |       * k_scale_cache: [batch_size, head_num, max_seq_len]
182 |       */
183 |      __global__ void storeKVcacheKernel(int8_t *__restrict__ k_cache, float *__restrict__ v_cache, float *__restrict__ k_scale_cache,
184 |                                         const int8_t *__restrict__ K, const float *__restrict__ V, const float *__restrict__ k_scale,
185 |                                         const int start_pos, const int seq_len, const int batch_size, const int head_num,
186 |                                         const int max_seq_len, const int size_per_head);
187 |      // Host-side launcher declaration for the int8-K overload; stream defaults to 0.
188 |      void launchStoreKVcacheKernel(int8_t *k_cache, float *v_cache, float *k_scale_cache, const int8_t *K, const float *V,
189 |                                    const float *k_scale, const int start_pos, const int seq_len, const int batch_size,
190 |                                    const int head_num, const int max_seq_len, const int size_per_head, cudaStream_t stream = 0);
191 | 
192 |      /** K/V cache store where both K and V (and both caches) are int8; no scale arguments are taken.
193 |       * grid: [seq_len, head_num / blockDim.y, batch_size * 2]  block(size_per_head / 4, 256 / (size_per_head / 4))
194 |       * k_cache v_cache: [batch_size, head_num, max_seq_len, size_per_head]
195 |       * K V : [batch_size, head_num, seq_len, size_per_head]
196 |       */
197 |      __global__ void storeINT8KVcacheKernel(int8_t *__restrict__ k_cache, int8_t *__restrict__ v_cache, const int8_t *__restrict__ K,
198 |                                             const int8_t *__restrict__ V, const int start_pos, const int seq_len, const int batch_size,
199 |                                             const int head_num, const int max_seq_len, const int size_per_head);
200 |      // Host-side launcher declaration for the int8 K/V store; stream defaults to 0.
201 |      void launchINT8StoreKVcacheKernel(int8_t *k_cache, int8_t *v_cache, const int8_t *K, const int8_t *V, const int start_pos,
202 |                                        const int seq_len, const int batch_size, const int head_num, const int max_seq_len,
203 |                                        const int size_per_head, cudaStream_t stream = 0);
204 | 
205 |      /** Copy data out of the K cache for use by the subsequent GEMM
206 |       * grid(end_seq_id, head_num * batch_size) block(128)
207 |       * from [batch_size, head_num, total_len, size_per_head] to [batch_size, head_num, end_seq_id, size_per_head]
208 |       */
209 |      __global__ void copyKFromCacheKernel(int8_t *__restrict__ k_buf, const int8_t *__restrict__ k_cache,
210 |                                           const int nrows, const int total_len, const int end_seq_id, const int size_per_head);
211 |      // Host-side launcher declaration; stream defaults to 0.
212 |      void launchCopyKFromCacheKernel(int8_t *k_buf, const int8_t *k_cache, const int nrows, const int total_len,
213 |                                      const int end_seq_id, const int size_per_head, cudaStream_t stream = 0);
214 | 
215 |      /**
216 |       * Dequantize, softmax, quantize
217 |       * grid(seq_len_q, head_num, batch_size), block(128), each block processes seq_len_k elements
218 |       * qk score: [batch_size, head_num, seq_len_q, seq_len_k]
219 |       * attn_mask: [max_seq_len, max_seq_len]
220 |       * q_inp_scale: [batch_size, head_num, seq_len_q]
221 |       * k_inp_scale: [batch_size, head_num, seq_len_k]
222 |       * score_scale: [batch_size, head_num, seq_len_q]
223 |       * (qk is the read-only int32 input; score and score_scale are the writable outputs)
224 |       */
225 |      __global__ void blockDeQuantizedSoftmaxQuantizedKernel(int8_t *__restrict__ score, const int32_t *__restrict__ qk,
226 |                                                             const float *__restrict__ attn_mask, const float *__restrict__ q_inp_scale,
227 |                                                             const float *__restrict__ k_inp_scale, float *__restrict__ score_scale,
228 |                                                             const float attn_scale, const int batch_size, const int head_num,
229 |                                                             const int seq_len_q, const int seq_len_k, const int max_seq_len);
230 |      // Host-side launcher declaration; stream defaults to 0.
231 |      void launchBlockDeQuantizedSoftmaxQuantizedKernel(int8_t *score, const int32_t *qk, const float *attn_mask, const float *q_inp_scale,
232 |                                                        const float *k_inp_scale, float *score_scale, const float attn_scale,
233 |                                                        const int batch_size, const int head_num, const int seq_len_q, const int seq_len_k,
234 |                                                        const int max_seq_len, cudaStream_t stream = 0);
235 | 
236 |      /**
237 |       * softmax over fp32 scores; qk is the only writable buffer, so the result is presumably written in place - confirm in the .cu
238 |       * grid(seq_len_q, head_num, batch_size), block(128), each block processes seq_len_k elements
239 |       * qk score: [batch_size, head_num, seq_len_q, seq_len_k]
240 |       * attn_mask: [max_seq_len, max_seq_len]
241 |       * scaler: float factor; NOTE(review): how it is applied to the scores is not visible from this declaration
242 |       */
243 |      __global__ void blockSoftmaxKernel(float *__restrict__ qk, const float *__restrict__ attn_mask, const int batch_size,
244 |                                         const int head_num, const int seq_len_q, const int seq_len_k, const int max_seq_len,
245 |                                         const float scaler);
246 |      // Host-side launcher declaration; stream defaults to 0.
247 |      void launchBlockSoftmaxKernel(float *qk, const float *attn_mask, const int batch_size, const int head_num, const int seq_len_q,
248 |                                    const int seq_len_k, const int max_seq_len, const float scaler, cudaStream_t stream = 0);
249 | 
250 |      /**
251 |       * Dequantize and transpose (warp-per-head variant)
252 |       * grid(head_num / warp_num, seq_len, batch_size) block(32, warp_num), each warp processes size_per_head elements
253 |       * V: [batch_size, seq_len, head_num, size_per_head]
254 |       * v_buf: [batch_size, head_num, seq_len, size_per_head]
255 |       * v_inp_scale: [batch_size, seq_len], absmax / 127.0f
256 |       * v_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
257 |       */
258 |      __global__ void warpDequantizedVTransposeKernel(float *__restrict__ v_buf, const int32_t *__restrict__ V,
259 |                                                      const float *__restrict__ v_inp_scale, const float *__restrict__ v_weight_scale,
260 |                                                      const int batch_size, const int seq_len, const int head_num, const int size_per_head);
261 | 
262 |      /**
263 |       * Dequantize and transpose (128-thread block per head; NOTE(review): "For256" presumably refers to size_per_head == 256 - confirm)
264 |       * grid(head_num, seq_len, batch_size) block(128), each block processes size_per_head elements
265 |       * V: [batch_size, seq_len, head_num, size_per_head]
266 |       * v_buf: [batch_size, head_num, seq_len, size_per_head]
267 |       * v_inp_scale: [batch_size, seq_len], absmax / 127.0f
268 |       * v_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
269 |       */
270 |      __global__ void blockDequantizedVTransposeFor256Kernel(float *__restrict__ v_buf, const int32_t *__restrict__ V,
271 |                                                             const float *__restrict__ v_inp_scale, const float *__restrict__ v_weight_scale,
272 |                                                             const int batch_size, const int seq_len, const int head_num, const int size_per_head);
273 | 
274 |      /**
275 |       * Dequantize and transpose (block-per-head variant, one thread per 4 elements)
276 |       * grid(head_num, seq_len, batch_size) block(size_per_head / 4), each block processes size_per_head elements
277 |       * V: [batch_size, seq_len, head_num, size_per_head]
278 |       * v_buf: [batch_size, head_num, seq_len, size_per_head]
279 |       * v_inp_scale: [batch_size, seq_len], absmax / 127.0f
280 |       * v_weight_scale: [head_num * size_per_head, ], absmax / 127.0f
281 |       */
282 |      __global__ void blockDequantizedVTransposeKernel(float *__restrict__ v_buf, const int32_t *__restrict__ V,
283 |                                                       const float *__restrict__ v_inp_scale, const float *__restrict__ v_weight_scale,
284 |                                                       const int batch_size, const int seq_len, const int head_num, const int size_per_head);
285 |      // Host-side launcher declaration; NOTE(review): presumably selects one of the V-transpose kernels - confirm in the .cu. stream defaults to 0.
286 |      void launchDequantizedVTransposeKernel(float *v_buf, const int32_t *V, const float *v_inp_scale, const float *v_weight_scale,
287 |                                             const int batch_size, const int seq_len, const int head_num, const int size_per_head,
288 |                                             cudaStream_t stream = 0);
289 | 
290 |      /**
291 |       * Quantize V from fp32 (V) to int8 (v_buf), producing v_out_scale
292 |       * grid(head_num, batch_size) block(size_per_head), each warp processes seq_len elements
293 |       * V: [batch_size, head_num, seq_len, size_per_head]
294 |       * v_buf: [batch_size, head_num, seq_len, size_per_head]
295 |       * v_out_scale: [batch_size, head_num, 1, size_per_head], absmax / 127.0f
296 |       */
297 |      __global__ void blockVQuantizedKernel(int8_t *__restrict__ v_buf, const float *__restrict__ V, float *__restrict__ v_out_scale,
298 |                                            const int batch_size, const int seq_len, const int head_num, const int size_per_head);
299 |      // Host-side launcher declaration; stream defaults to 0.
300 |      void launchBlockVQuantizedKernel(int8_t *v_buf, const float *V, float *v_out_scale, const int batch_size, const int seq_len,
301 |                                       const int head_num, const int size_per_head, cudaStream_t stream = 0);
302 | 
303 |      /** Dequantize, quantize, transpose (warp-per-head variant)
304 |       * grid(seq_len, batch_size) block(32 * head_num)
305 |       * attn_buf:[batch_size, seq_len, head_num, size_per_head]
306 |       * attn:[batch_size, head_num, seq_len, size_per_head]
307 |       * score_scale:[batch_size, head_num, seq_len]
308 |       * v_scale:[batch_size, head_num, size_per_head]
309 |       * attn_out_scale:[batch_size, seq_len]
310 |       */
311 |      __global__ void warpDequantizedAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const int32_t *__restrict__ attn,
312 |                                                                  const float *__restrict__ score_scale, const float *__restrict__ v_scale,
313 |                                                                  float *__restrict__ attn_out_scale, const int batch_size,
314 |                                                                  const int head_num, const int seq_len, const int size_per_head);
315 | 
316 |      /** Dequantize, quantize, transpose (block variant with size_per_head threads)
317 |       * grid(seq_len, batch_size) block(size_per_head)
318 |       * attn_buf:[batch_size, seq_len, head_num, size_per_head]
319 |       * attn:[batch_size, head_num, seq_len, size_per_head]
320 |       * score_scale:[batch_size, head_num, seq_len]
321 |       * v_scale:[batch_size, head_num, size_per_head]
322 |       * attn_out_scale:[batch_size, seq_len]
323 |       */
324 |      __global__ void blockDequantizedAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const int32_t *__restrict__ attn,
325 |                                                                   const float *__restrict__ score_scale, const float *__restrict__ v_scale,
326 |                                                                   float *__restrict__ attn_out_scale, const int batch_size,
327 |                                                                   const int head_num, const int seq_len, const int size_per_head);
328 |      // Host-side launcher declaration (same arguments as the kernels plus a stream that defaults to 0).
329 |      void launchDequantizedAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const int32_t *__restrict__ attn,
330 |                                                         const float *__restrict__ score_scale, const float *__restrict__ v_scale,
331 |                                                         float *__restrict__ attn_out_scale, const int batch_size,
332 |                                                         const int head_num, const int seq_len, const int size_per_head,
333 |                                                         cudaStream_t stream = 0);
334 | 
335 |      /** Quantize and transpose an fp32 attention result (warp-per-head variant)
336 |       * grid(seq_len, batch_size) block(32 * head_num)
337 |       * attn_buf:[batch_size, seq_len, head_num, size_per_head]
338 |       * attn:[batch_size, head_num, seq_len, size_per_head]
339 |       * attn_out_scale:[batch_size, seq_len]
340 |       */
341 |      __global__ void warpAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const float *__restrict__ attn,
342 |                                                       float *__restrict__ attn_out_scale, const int batch_size,
343 |                                                       const int head_num, const int seq_len, const int size_per_head);
344 | 
345 |      /** Quantize and transpose an fp32 attention result (block variant with size_per_head threads)
346 |       * grid(seq_len, batch_size) block(size_per_head)
347 |       * attn_buf:[batch_size, seq_len, head_num, size_per_head]
348 |       * attn:[batch_size, head_num, seq_len, size_per_head]
349 |       * attn_out_scale:[batch_size, seq_len]
350 |       */
351 |      __global__ void blockAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const float *__restrict__ attn,
352 |                                                        float *__restrict__ attn_out_scale, const int batch_size,
353 |                                                        const int head_num, const int seq_len, const int size_per_head);
354 |      // Host-side launcher declaration (same arguments as the kernels plus a stream that defaults to 0).
355 |      void launchAttnQuantizedTransposeKernel(int8_t *__restrict__ attn_buf, const float *__restrict__ attn,
356 |                                              float *__restrict__ attn_out_scale, const int batch_size,
357 |                                              const int head_num, const int seq_len, const int size_per_head, cudaStream_t stream = 0);
358 | 
359 |      /** Dequantize, residual connection, resNorm, quantize (primary template; "from_temsor" is the parameter's actual spelling)
360 |       * grid(seq_len * batch_size) block(128)
361 |       * norm_out: [batch_size, seq_len, hidden_units]
362 |       * ffn_tensor: [batch_size, seq_len, hidden_units]
363 |       * from_temsor: [batch_size, seq_len, hidden_units]
364 |       * attn_out: [batch_size, seq_len, hidden_units]
365 |       * attn_out_scale: [batch_size, seq_len]
366 |       * attn_weight_scale: [hidden_units]
367 |       * gamma: [hidden_units]
368 |       * norm_scale: [batch_size, seq_len]
369 |       */
370 |      template <typename DataType>
371 |      __global__ void dequantizedResidualResNormQuantizedKernel(int8_t *__restrict__ norm_out, DataType *__restrict__ ffn_tensor,
372 |                                                                const DataType *__restrict__ from_temsor, const int32_t *__restrict__ attn_out,
373 |                                                                const float *__restrict__ attn_out_scale, const float *__restrict__ attn_weight_scale,
374 |                                                                const DataType *__restrict__ gamma, float *__restrict__ norm_scale,
375 |                                                                const float eps, const int hidden_units);
376 | 
377 |      /** Dequantize, residual connection, resNorm, quantize (explicit specialization for half)
378 |       * grid(seq_len * batch_size) block(128)
379 |       * norm_out: [batch_size, seq_len, hidden_units]
380 |       * ffn_tensor: [batch_size, seq_len, hidden_units]
381 |       * from_temsor: [batch_size, seq_len, hidden_units]
382 |       * attn_out: [batch_size, seq_len, hidden_units]
383 |       * attn_out_scale: [batch_size, seq_len]
384 |       * attn_weight_scale: [hidden_units]
385 |       * gamma: [hidden_units]; norm_scale: [batch_size, seq_len]
386 |       */
387 |      template <>
388 |      __global__ void dequantizedResidualResNormQuantizedKernel(int8_t *__restrict__ norm_out, half *__restrict__ ffn_tensor,
389 |                                                                const half *__restrict__ from_temsor, const int32_t *__restrict__ attn_out,
390 |                                                                const float *__restrict__ attn_out_scale, const float *__restrict__ attn_weight_scale,
391 |                                                                const half *__restrict__ gamma, float *__restrict__ norm_scale,
392 |                                                                const float eps, const int hidden_units);
393 |      // Host-side launcher declaration; takes the row count (rows) in addition to hidden_units. stream defaults to 0.
394 |      template <typename DataType>
395 |      void launchDequantizedResidualResNormQuantized(int8_t *norm_out, DataType *__restrict__ ffn_tensor, const DataType *from_temsor,
396 |                                                     const int32_t *attn_out, const float *attn_out_scale,
397 |                                                     const float *attn_weight_scale, const DataType *gamma,
398 |                                                     float *norm_scale, const float eps, const int rows, const int hidden_units,
399 |                                                     cudaStream_t stream = 0);
400 | 
401 |      /** Dequantize, SiLU, element-wise multiply, quantize
402 |       * grid(nrows) block(128)
403 |       * out_buf: [nrows, hidden_units]
404 |       * w1_ret w3_ret: [nrows, hidden_units]
405 |       * norm_scale: [nrows, ]
406 |       * w1_weight_scale w3_weight_scale: [hidden_units, ]
407 |       * out_scale: [nrows, ]
408 |       */
409 |      __global__ void dequantizedSiluMultifyQuantizedKernel(int8_t *__restrict__ out_buf, const int32_t *__restrict__ w1_ret,
410 |                                                            const float *__restrict__ norm_scale, const float *__restrict__ w1_weight_scale,
411 |                                                            const int32_t *__restrict__ w3_ret, const float *__restrict__ w3_weight_scale,
412 |                                                            float *__restrict__ out_scale, const int hidden_units);
413 |      // Host-side launcher declaration; takes nrows in addition to the kernel arguments. stream defaults to 0.
414 |      void launchDequantizedSiluMultifyQuantized(int8_t *out_buf, const int32_t *w1_ret, const float *norm_scale, const float *w1_weight_scale,
415 |                                                 const int32_t *w3_ret, const float *w3_weight_scale, float *out_scale, const int nrows,
416 |                                                 const int hidden_units, cudaStream_t stream = 0);
417 | 
418 |      /** Dequantize, residual connection ("from_temsor" is the parameter's actual spelling)
419 |       * grid(seq_len * batch_size) block(128)
420 |       * out: [batch_size, seq_len, hidden_units]
421 |       * ffn_tensor: [batch_size, seq_len, hidden_units] (NOTE(review): not a parameter of the declarations below - stale entry?)
422 |       * from_temsor: [batch_size, seq_len, hidden_units]
423 |       * inp: [batch_size, seq_len, hidden_units]
424 |       * inp_scale: [batch_size, seq_len]
425 |       * weight_scale: [hidden_units]
426 |       */
427 |      template <typename DataType>
428 |      __global__ void dequantizedResidualKernel(DataType *__restrict__ out, const DataType *__restrict__ from_temsor,
429 |                                                const int32_t *__restrict__ inp, const float *__restrict__ inp_scale,
430 |                                                const float *__restrict__ weight_scale, const int hidden_units);
431 |      // Explicit specialization for half; same parameter list with DataType = half.
432 |      template <>
433 |      __global__ void dequantizedResidualKernel(half *__restrict__ out, const half *__restrict__ from_temsor,
434 |                                                const int32_t *__restrict__ inp, const float *__restrict__ inp_scale,
435 |                                                const float *__restrict__ weight_scale, const int hidden_units);
436 |      // Host-side launcher declaration; takes nrows in addition to the kernel arguments. stream defaults to 0.
437 |      template <typename DataType>
438 |      void launchDequantizedResidual(DataType *__restrict__ out, const DataType *__restrict__ from_temsor, const int32_t *__restrict__ inp,
439 |                                     const float *__restrict__ inp_scale, const float *__restrict__ weight_scale, const int nrows,
440 |                                     const int hidden_units, cudaStream_t stream = 0);
441 | 
442 | }


--------------------------------------------------------------------------------
/fasterLlama/cuda/decoding_kernels.cu:
--------------------------------------------------------------------------------
  1 | #include "decoding_kernels.cuh"
  2 | #include "utils.h"
  3 | 
  4 | namespace FasterLLaMA
  5 | {
  6 |     /** resNorm: scale every row by the reciprocal of its root-mean-square, then by gamma
  7 |      * launch: grid(batch_size * seq_len), block(128); one block handles one row
  8 |      * output, input: [batch_size, seq_len, hidden_units]
  9 |      * gamma: [hidden_units, ]
 10 |      * output[r][i] = rsqrt(sum_j(input[r][j]^2) / hidden_units + eps) * input[r][i] * gamma[i]
 11 |      */
 12 |     template <typename DataType>
 13 |     __global__ void resNormKernel(DataType *__restrict__ output, const DataType *__restrict__ input,
 14 |                                   const DataType *__restrict__ gamma, const float eps, const int hidden_units)
 15 |     {
 16 |         const int row_start = blockIdx.x * hidden_units;
 17 |         const DataType *row_in = input + row_start;
 18 |         DataType *row_out = output + row_start;
 19 | 
 20 |         // Each thread accumulates the squares of its strided share of the row.
 21 |         float sq_sum = 0.0f;
 22 |         for (int col = threadIdx.x; col < hidden_units; col += blockDim.x)
 23 |             sq_sum += row_in[col] * row_in[col];
 24 |         __syncthreads();
 25 | 
 26 |         // Assumes blockAllReduceSum (utils.h) hands the block-wide sum back to every thread.
 27 |         const float inv_rms = rsqrtf(blockAllReduceSum<float>(sq_sum) / hidden_units + eps);
 28 | 
 29 |         for (int col = threadIdx.x; col < hidden_units; col += blockDim.x)
 30 |         {
 31 |             row_out[col] = (DataType)(inv_rms * row_in[col] * gamma[col]);
 32 |         }
 33 |     }
 34 | 
 35 |     // half specialization of resNorm: the row is read and written as half2 pairs, the arithmetic is done in float.
 36 |     // NOTE(review): only hidden_units / 2 pairs are visited, so an odd hidden_units leaves the last element untouched.
 37 |     template <>
 38 |     __global__ void resNormKernel(half *__restrict__ output, const half *__restrict__ input,
 39 |                                   const half *__restrict__ gamma, const float eps, const int hidden_units)
 40 |     {
 41 |         const int row_start = blockIdx.x * hidden_units;
 42 |         const int pair_cnt = hidden_units >> 1;
 43 |         const half2 *src = (const half2 *)(input + row_start);
 44 |         const half2 *gain = (const half2 *)gamma;
 45 |         half2 *dst = (half2 *)(output + row_start);
 46 | 
 47 |         // Per-thread partial sum of squares over its strided pairs.
 48 |         float sq_sum = 0.0f;
 49 |         for (int p = threadIdx.x; p < pair_cnt; p += blockDim.x)
 50 |         {
 51 |             const float2 x = __half22float2(src[p]);
 52 |             sq_sum += x.x * x.x + x.y * x.y;
 53 |         }
 54 |         __syncthreads();
 55 | 
 56 |         sq_sum = blockAllReduceSum<float>(sq_sum);
 57 |         const float inv_rms = rsqrtf(sq_sum / hidden_units + eps);
 58 | 
 59 |         for (int p = threadIdx.x; p < pair_cnt; p += blockDim.x)
 60 |         {
 61 |             float2 x = __half22float2(src[p]);
 62 |             const float2 g = __half22float2(gain[p]);
 63 |             x.x *= (inv_rms * g.x);
 64 |             x.y *= (inv_rms * g.y);
 65 |             dst[p] = __float22half2_rn(x);
 66 |         }
 67 |     }
 68 | 
 69 |     template <typename DataType>
 70 |     void launchResNormKernel(DataType *output, const DataType *input, const DataType *gamma, const float eps,
 71 |                              const int m, const int n, cudaStream_t stream)
 72 |     {
 73 | #ifndef NDEBUG
 74 |         PRINT_FUNC_NAME_();
 75 | #endif
 76 |         // One 128-thread block per row: m rows of n elements each, enqueued on `stream`.
 77 |         const dim3 rows(m), threads(128);
 78 |         resNormKernel<DataType><<<rows, threads, 0, stream>>>(output, input, gamma, eps, n);
 79 |     }
 80 | 
 81 |     /** precomputeFreqsCis
 82 |      * grid(seq_len)  block(block_size) for size_per_head/2 >= block_size(128); blockIdx.x is the position
 83 |      * freq_cis: [seq_len, size_per_head]; pair i of row p holds (cos, sin) of p * 10000^(-2i / size_per_head)
 84 |      */
 85 |     __global__ void precomputeFreqsCis(float *freq_cis, const int size_per_head)
 86 |     {
 87 |         int offset = blockIdx.x * size_per_head;
 88 |         for (int i = threadIdx.x; i < (size_per_head >> 1); i += blockDim.x)
 89 |         {
 90 |             // theta grows with the position, so use the full-precision functions: the fast intrinsics lose accuracy outside [-pi, pi].
 91 |             const float theta = powf(1e4f, i * (-2.0f) / size_per_head) * blockIdx.x;
 92 |             freq_cis[offset + 2 * i] = cosf(theta);
 93 |             freq_cis[offset + 2 * i + 1] = sinf(theta);
 94 |         }
 95 |     }
 96 | 
 97 |     /**
 98 |      * block(32, 4)   each warp computes one row of freq_cis ([seq_len, size_per_head]); rows >= seq_len are skipped
 99 |      */
100 |     __global__ void warpPrecomputeFreqsCis(float *freq_cis, const int size_per_head, const int seq_len)
101 |     {
102 |         const int row = blockIdx.x * blockDim.y + threadIdx.y;
103 |         int offset = row * size_per_head;
104 |         if (row < seq_len)
105 |         {
106 |             for (int i = threadIdx.x; i < (size_per_head >> 1); i += blockDim.x)
107 |             {
108 |                 // theta grows with the row (position), so use the full-precision functions: the fast intrinsics lose accuracy outside [-pi, pi].
109 |                 const float theta = powf(1e4f, i * (-2.0f) / size_per_head) * row;
110 |                 freq_cis[offset + 2 * i] = cosf(theta);
111 |                 freq_cis[offset + 2 * i + 1] = sinf(theta);
112 |             }
113 |         }
114 |     }
115 | 
116 |     // Fill freq_cis ([seq_len, size_per_head]) on `stream`, choosing the kernel by the number of pairs per row.
117 |     void launchPrecomputeFreqsCis(float *freq_cis, const int size_per_head, const int seq_len, cudaStream_t stream)
118 |     {
119 | #ifndef NDEBUG
120 |         PRINT_FUNC_NAME_();
121 | #endif
122 |         // Wide heads (at least 128 pairs per row): one 128-thread block per position.
123 |         if ((size_per_head / 2) >= 128)
124 |         {
125 |             precomputeFreqsCis<<<dim3(seq_len), dim3(128), 0, stream>>>(freq_cis, size_per_head);
126 |             return;
127 |         }
128 | 
129 |         // Narrow heads: one warp per position, warps_per_block positions packed into each block.
130 |         const int warps_per_block = 128 / 32;
131 |         const int block_cnt = (seq_len + warps_per_block - 1) / warps_per_block;
132 |         const dim3 grid(block_cnt);
133 |         const dim3 block(32, warps_per_block);
134 |         warpPrecomputeFreqsCis<<<grid, block, 0, stream>>>(freq_cis, size_per_head, seq_len);
135 |     }
136 | 
137 |     /**
138 |      * Reset the top-k sampling state: sequence_length[i] = 0 and finished[i] = false for i in [0, batch_size).
139 |      * Indexes by threadIdx.x only (single-block launch); threads stride by blockDim.x so batch_size may exceed the block size.
140 |      */
141 |     __global__ void topKSamplingInitKernel(bool *__restrict__ finished, int *__restrict__ sequence_length, const int batch_size)
142 |     {
143 |         // Block-stride loop instead of one element per thread: entries beyond blockDim.x are still initialised.
144 |         for (int i = threadIdx.x; i < batch_size; i += blockDim.x)
145 |         {
146 |             finished[i] = false;
147 |             sequence_length[i] = 0;
148 |         }
149 |     }
150 | 
151 |     void launchTopKSamplingInitKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
152 |                                       const int batch_size, cudaStream_t stream)
153 |     {
154 | #ifndef NDEBUG
155 |         PRINT_FUNC_NAME_();
156 | #endif
157 |         // Nothing to reset for an empty batch, and a zero-thread block would be an invalid launch configuration.
158 |         if (batch_size <= 0) return;
159 |         topKSamplingInitKernel<<<dim3(1), dim3(min(1024, batch_size)), 0, stream>>>(finished, sequence_length, batch_size);
160 |     }
161 | 
162 |     /**
163 |      * Top-p sampling state initialisation:
164 |      *   sequence_length[b] = 0 and finished[b] = false for every batch entry b
165 |      *   topp_offset_buf = [0, vocab_size, ..., batch_size * vocab_size]  (batch_size + 1 entries)
166 |      *   topp_id_val_buf = [0, 1, ..., vocab_size-1] repeated batch_size times
167 |      */
168 |     __global__ void topPInitializationKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
169 |                                              int *__restrict__ topp_id_val_buf, int *__restrict__ topp_offset_buf,
170 |                                              const int batch_size, const int vocab_size)
171 |     {
172 |         const int lane = threadIdx.x;
173 |         const int total_ids = batch_size * vocab_size;
174 | 
175 |         // The small per-batch arrays are filled by the first block only.
176 |         if (blockIdx.x == 0)
177 |         {
178 |             for (int b = lane; b <= batch_size; b += blockDim.x)
179 |             {
180 |                 topp_offset_buf[b] = b * vocab_size; // batch_size + 1 segment boundaries
181 |                 if (b != batch_size)
182 |                 {
183 |                     finished[b] = false;
184 |                     sequence_length[b] = 0;
185 |                 }
186 |             }
187 |         }
188 | 
189 |         // Every block takes part in filling the id table, stepping by the total number of launched threads.
190 |         const int step = blockDim.x * gridDim.x;
191 |         for (int pos = lane + blockIdx.x * blockDim.x; pos < total_ids; pos += step)
192 |             topp_id_val_buf[pos] = pos % vocab_size;
193 |     }
194 | 
195 |     void launchTopPInitializationKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
196 |                                         int *__restrict__ topp_id_val_buf, int *__restrict__ topp_offset_buf,
197 |                                         const int batch_size, const int vocab_size, cudaStream_t stream)
198 |     {
199 | #ifndef NDEBUG
200 |         PRINT_FUNC_NAME_();
201 | #endif
202 |         const dim3 grid(32), block(512); // fixed launch shape, independent of batch_size and vocab_size
203 |         topPInitializationKernel<<<grid, block, 0, stream>>>(finished, sequence_length, topp_id_val_buf, topp_offset_buf, batch_size, vocab_size);
204 |     }
205 | 
206 |     // Embedding gather: block (token_id = blockIdx.x, batch_id = blockIdx.y) copies one table row, scaled by sqrt(hidden_units).
207 |     template <typename T>
208 |     __global__ void embeddingLookupKernel(T *__restrict__ from_tensor, const T *__restrict__ embedding_table,
209 |                                           const int *__restrict__ word_ids, const int max_len, const int hidden_units)
210 |     {
211 |         const int token_id = blockIdx.x;
212 |         const int batch_id = blockIdx.y;
213 |         const T scale = (T)sqrtf(float(hidden_units));
214 |         for (int col = threadIdx.x; col < hidden_units; col += blockDim.x)
215 |         {
216 |             // word_ids rows have stride max_len; from_tensor rows are ordered [batch_id, token_id] with gridDim.x tokens per batch.
217 |             const int src = word_ids[batch_id * max_len + token_id] * hidden_units + col;
218 |             const int dst = col + token_id * hidden_units + batch_id * gridDim.x * hidden_units;
219 |             from_tensor[dst] = embedding_table[src] * scale;
220 |         }
221 |     }
222 | 
223 |     // half specialization of the embedding gather: same indexing, with the sqrt(hidden_units) scaling done in float.
224 |     template <>
225 |     __global__ void embeddingLookupKernel(half *__restrict__ from_tensor, const half *__restrict__ embedding_table,
226 |                                           const int *__restrict__ word_ids, const int max_len, const int hidden_units)
227 |     {
228 |         const int token_id = blockIdx.x;
229 |         const int batch_id = blockIdx.y;
230 |         const float scale = sqrtf(float(hidden_units));
231 |         for (int col = threadIdx.x; col < hidden_units; col += blockDim.x)
232 |         {
233 |             const int src = word_ids[batch_id * max_len + token_id] * hidden_units + col;
234 |             const int dst = col + token_id * hidden_units + batch_id * gridDim.x * hidden_units;
235 |             // widen to float, scale, then convert back to half
236 |             from_tensor[dst] = __float2half(__half2float(embedding_table[src]) * scale);
237 |         }
238 |     }
239 | 
240 |     template <typename T>
241 |     void launchEmbeddingLookupKernel(T *__restrict__ from_tensor, const T *__restrict__ embedding_table, const int *__restrict__ word_ids,
242 |                                      const int batch_size, const int cur_seq_len, const int max_len, const int hidden_units,
243 |                                      cudaStream_t stream)
244 |     {
245 | #ifndef NDEBUG
246 |         PRINT_FUNC_NAME_();
247 | #endif
248 |         // grid.x walks the cur_seq_len tokens, grid.y the batch; the 256 threads of a block share one embedding row.
249 |         const dim3 grid(cur_seq_len, batch_size), block(256);
250 |         embeddingLookupKernel<T><<<grid, block, 0, stream>>>(from_tensor, embedding_table, word_ids, max_len, hidden_units);
251 |     }
252 | 
253 |     /** Copy logits[:, -1, :] into step_logits and, along the way, handle sequences that have already finished
254 |      * grid(batch_size), block(min(vocab_size, 1024))
255 |      * step_logits: [batch_size, 1, vocab_size]
256 |      * logits: [batch_size, seq_len, vocab_size]
257 |      * finished: [batch_size, 1]
258 |      * end_id: a finished row gets FLT_MAX at end_id and -FLT_MAX everywhere else
259 |      */
260 |     __global__ void updateLogitsWithoutSoftmax(float *__restrict__ step_logits, const float *__restrict__ logits, const int end_id,
261 |                                                const bool *__restrict__ finished, const int seq_len, const int vocab_size)
262 |     {
263 |         const bool is_finished = finished[blockIdx.x];
264 | 
265 |         // 64-bit offsets: batch * seq_len * vocab_size can exceed the 32-bit range for large batches or long sequences.
266 |         const float *last_row = logits + ((size_t)blockIdx.x * seq_len + (seq_len - 1)) * vocab_size;
267 |         float *dst_row = step_logits + (size_t)blockIdx.x * vocab_size;
268 | 
269 |         for (int tid = threadIdx.x; tid < vocab_size; tid += blockDim.x)
270 |         {
271 |             if (is_finished)
272 |                 dst_row[tid] = (tid == end_id) ? FLT_MAX : -FLT_MAX;
273 |             else
274 |                 dst_row[tid] = last_row[tid];
275 |         }
276 |     }
277 |     // Host launcher for updateLogitsWithoutSoftmax: one block per batch row, enqueued on `stream`.
278 |     void launchUpdateLogitsWithoutSoftmax(float *__restrict__ step_logits, const float *__restrict__ logits, const int end_id,
279 |                                           const bool *__restrict__ finished, const int batch_size, const int seq_len,
280 |                                           const int vocab_size, cudaStream_t stream)
281 |     {
282 | #ifndef NDEBUG
283 |         PRINT_FUNC_NAME_();
284 | #endif
285 |         dim3 grid(batch_size);
286 |         dim3 block(min(vocab_size, 1024));
287 |         /* vocab_size is usually large (e.g. 30000), so the block is capped at 1024 threads and the kernel strides over the row. */
288 |         updateLogitsWithoutSoftmax<<<grid, block, 0, stream>>>(step_logits, logits, end_id, finished, seq_len, vocab_size);
289 |     }
290 | 
291 |     /**
292 |      * top-k sampling kernel: thread i chooses the next token of batch row i.
293 |      * grid(1), block(>= batch_size); threads with threadIdx.x >= batch_size do nothing.
294 |      */
295 |     template <typename T>
296 |     __global__ void topKSampling(int *__restrict__ topk_tmp_id_buf, T *__restrict__ topk_tmp_val_buf, int *__restrict__ ids,
297 |                                  int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
298 |                                  const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
299 |                                  const int cur_pos, const int max_prompt_seq_len, const int candidate_num,
300 |                                  const int random_num, const int end_id, const int batch_size, const int vocab_size)
301 |     {
302 |         if (threadIdx.x < batch_size)
303 |         {
304 |             // prompt phase, next_token[:] = prompt_tokens[:, cur_pos]
305 |             if (cur_pos < max_prompt_seq_len && prompt_tokens_mask[threadIdx.x * max_prompt_seq_len + cur_pos])
306 |             {
307 |                 ids[threadIdx.x] = prompt_tokens[threadIdx.x * max_prompt_seq_len + cur_pos];
308 |             }
309 |             else
310 |             {
311 |                 // Candidate 0 of this row serves as the max subtracted before __expf (presumably the largest value; verify against beam_topK_kernel)
312 |                 float max_val = (float)topk_tmp_val_buf[threadIdx.x * candidate_num];
313 |                 // Turn the candidates into unnormalised weights exp(v - max_val), stored back in place, and accumulate their sum.
314 |                 float sum = 0.0f;
315 |                 float tmp_val;
316 |                 for (int i = 0; i < candidate_num; ++i)
317 |                 {
318 |                     tmp_val = __expf(topk_tmp_val_buf[threadIdx.x * candidate_num + i] - max_val);
319 |                     topk_tmp_val_buf[threadIdx.x * candidate_num + i] = tmp_val;
320 |                     sum += tmp_val;
321 |                 }
322 |                 // rand_num = u * sum, with u drawn by curand_uniform; the generator is seeded with random_num and uses threadIdx.x as its sequence number.
323 |                 curandState_t local_state;
324 |                 curand_init(random_num, threadIdx.x, 0, &local_state);
325 |                 float rand_num = curand_uniform(&local_state) * sum;
326 |                 // Default to the last candidate, then scan the weights until the running remainder drops to <= 0.
327 |                 ids[threadIdx.x] = topk_tmp_id_buf[threadIdx.x * candidate_num + candidate_num - 1] % vocab_size; // stored ids are flat (row * vocab_size + token) indices, hence the modulo
328 |                 for (int i = 0; i < candidate_num; i++)
329 |                 {
330 |                     rand_num = rand_num - topk_tmp_val_buf[threadIdx.x * candidate_num + i];
331 |                     if (rand_num <= 0.0f)
332 |                     {
333 |                         ids[threadIdx.x] = topk_tmp_id_buf[threadIdx.x * candidate_num + i] % vocab_size;
334 |                         break;
335 |                     }
336 |                 }
337 |                 // Only this sampling branch touches sequence_length / finished_buf: the length grows for rows not yet finished, then the row is flagged when end_id was drawn.
338 |                 sequence_length[threadIdx.x] = finished_buf[threadIdx.x] ? sequence_length[threadIdx.x] : sequence_length[threadIdx.x] + 1;
339 |                 finished_buf[threadIdx.x] = ids[threadIdx.x] == end_id ? true : false;
340 |             }
341 |         }
342 |     }
343 |     // Per-row top-MAX_K selection: block b scans row b of log_probs (vocab_size entries) and thread 0 writes the MAX_K best (flat index, value) pairs.
344 |     template <typename T, int MAX_K, int THREADBLOCK_SIZE>
345 |     __launch_bounds__(THREADBLOCK_SIZE)
346 |         __global__
347 |         void beam_topK_kernel(const T *__restrict__ log_probs,
348 |                               int *__restrict__ topk_tmp_id_buf,
349 |                               T *__restrict__ topk_tmp_val_buf,
350 |                               const int vocab_size,
351 |                               T diversity_rate)
352 |     {
353 |         typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce; // block-wide reduction over the per-thread TopK partials
354 |         __shared__ typename BlockReduce::TempStorage temp_storage;
355 | 
356 |         int thread_id = threadIdx.x;
357 |         int block_id = blockIdx.x;
358 |         TopK<T, MAX_K> partial;
359 |         // Mark every slot of the per-thread partial as empty (id -1, value -FLT_MAX).
360 | #pragma unroll
361 |         for (int i = 0; i < MAX_K; ++i)
362 |         {
363 |             partial.p[i] = -1;
364 |             partial.u[i] = -FLT_MAX;
365 |         }
366 |         // Each thread folds its strided share of the row into its partial top-k.
367 | #pragma unroll
368 |         for (int elem_id = thread_id; elem_id < vocab_size; elem_id += THREADBLOCK_SIZE)
369 |         {
370 |             int index = elem_id + block_id * vocab_size; // flat index into log_probs; this is the id that gets stored
371 |             partial.insert(log_probs[index], index);
372 |         }
373 |         // Merge all partials with reduce_topk_op; only thread 0 uses the result below.
374 |         TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);
375 | 
376 |         if (thread_id == 0)
377 |         {
378 |             int index = block_id * MAX_K; // start of this row's MAX_K output slots
379 | 
380 | #pragma unroll
381 |             for (int i = 0; i < MAX_K; ++i)
382 |             {
383 |                 topk_tmp_id_buf[index + i] = total.p[i];
384 |                 topk_tmp_val_buf[index + i] = total.u[i] + diversity_rate * (T)i; // rank-proportional offset; the CASE_K macro passes 0.0f
385 |             }
386 |         }
387 |     }
388 |     // Top-k sampling step: runs beam_topK_kernel (via CASE_K) to collect candidate_num candidates per row, then topKSampling to draw one token per row. Both run on `stream`.
389 |     template <typename T>
390 |     void launchTopKSamplingKernel(T *__restrict__ log_probs, int *__restrict__ topk_tmp_id_buf, T *__restrict__ topk_tmp_val_buf,
391 |                                   int *__restrict__ ids, int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
392 |                                   const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
393 |                                   const int cur_pos, const int max_prompt_seq_len, int random_num, const int batch_size,
394 |                                   const int vocab_size, const int candidate_num, const int end_id, cudaStream_t stream)
395 |     {
396 | #ifndef NDEBUG
397 |         PRINT_FUNC_NAME_();
398 | #endif
399 |         int local_block_size = 256;
400 |         switch (candidate_num) // only the compile-time K values listed below are instantiated
401 |         {
402 |             CASE_K(1);
403 |             CASE_K(2);
404 |             CASE_K(4);
405 |         default:
406 |             printf("[ERROR] Topk kernel does not support candidate_num = %d \n", candidate_num);
407 |             exit(1); // unsupported K is a failure: exit with a non-zero status so callers and scripts can detect it
408 |             break;
409 |         }
410 |         assert(batch_size <= 1024); // topKSampling uses a single block with one thread per batch row
411 |         if (batch_size <= 128)
412 |         {
413 |             local_block_size = 128;
414 |         }
415 |         else if (batch_size <= 256)
416 |         {
417 |             local_block_size = 256;
418 |         }
419 |         else if (batch_size <= 512)
420 |         {
421 |             local_block_size = 512;
422 |         }
423 |         else
424 |         {
425 |             local_block_size = 1024;
426 |         }
427 |         topKSampling<T><<<1, local_block_size, 0, stream>>>(topk_tmp_id_buf, topk_tmp_val_buf, ids, sequence_length, finished_buf,
428 |                                                             prompt_tokens, prompt_tokens_mask, cur_pos, max_prompt_seq_len, candidate_num,
429 |                                                             random_num, end_id, batch_size, vocab_size);
430 |     }
431 |     // Copies the last-position logits of batch row blockIdx.x into step_logits (with the finished override) and turns that row into softmax probabilities in place.
432 |     __global__ void updateLogitsKernelWithoutLog(float *__restrict__ step_logits, const float *__restrict__ logits,
433 |                                                  const bool *__restrict__ finished,
434 |                                                  const int seq_len, const int end_id, const int vocab_size)
435 |     {
436 |         int bid = blockIdx.x;
437 |         bool finish = finished[bid];
438 |         int offset = bid * vocab_size; // start of this row in step_logits
439 | 
440 |         float max_val = -1 * FLT_MAX;
441 |         // Pass 1: copy the row (or write the finished pattern) while tracking the per-thread maximum.
442 |         for (int tid = threadIdx.x; tid < vocab_size; tid += blockDim.x)
443 |         {
444 |             int idx = bid * seq_len * vocab_size + (seq_len - 1) * vocab_size + tid; // flat index of logits[bid, seq_len - 1, tid]
445 |             if (finish)
446 |                 step_logits[offset + tid] = (tid == end_id) ? FLT_MAX : -1 * FLT_MAX;
447 |             else
448 |                 step_logits[offset + tid] = logits[idx];
449 |             max_val = max(max_val, step_logits[offset + tid]);
450 |         }
451 |         // presumably leaves the block-wide maximum in every thread; verify against blockAllReduceMax
452 |         max_val = blockAllReduceMax<float>(max_val);
453 |         // Pass 2: exponentiate relative to the maximum and accumulate the per-thread sum.
454 |         float sum_val = 0.0f;
455 |         for (int tid = threadIdx.x; tid < vocab_size; tid += blockDim.x)
456 |         {
457 |             step_logits[offset + tid] = __expf(step_logits[offset + tid] - max_val);
458 |             sum_val += step_logits[offset + tid];
459 |         }
460 |         // presumably leaves the block-wide sum in every thread; verify against blockAllReduceSum
461 |         sum_val = blockAllReduceSum<float>(sum_val);
462 |         // Pass 3: normalise so that the row sums to 1.
463 |         for (int tid = threadIdx.x; tid < vocab_size; tid += blockDim.x)
464 |         {
465 |             step_logits[offset + tid] = (step_logits[offset + tid] / sum_val);
466 |         }
467 |     }
468 |     // Host launcher for updateLogitsKernelWithoutLog: one block per batch row, enqueued on `stream`.
469 |     void launchUpdateLogitsKernelWithoutLog(float *__restrict__ step_logits, const float *__restrict__ logits,
470 |                                             const bool *__restrict__ finished, const int seq_len, const int end_id,
471 |                                             const int batch_size, const int vocab_size, cudaStream_t stream)
472 |     {
473 | #ifndef NDEBUG
474 |         PRINT_FUNC_NAME_();
475 | #endif
476 |         dim3 grid(batch_size);
477 |         dim3 block(min(vocab_size, 1024));
478 |         /* vocab_size is usually large (e.g. 30000), so the block is capped at 1024 threads and the kernel strides over the row. */
479 |         updateLogitsKernelWithoutLog<<<grid, block, 0, stream>>>(step_logits, logits, finished, seq_len, end_id, vocab_size);
480 |     }
481 | 
482 |     /**
483 |      * top-p sampling kernel: thread i draws the next token of batch row i from that row's sorted probabilities.
484 |      * grid(1), block(>= batch_size); threads with threadIdx.x >= batch_size do nothing.
485 |      */
486 |     template <typename T>
487 |     __global__ void topPSampling(const T *__restrict__ sorted_logits_probs, const int *__restrict__ sorted_id_vals,
488 |                                  int *__restrict__ ids, int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
489 |                                  const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
490 |                                  const int cur_pos, const int max_prompt_seq_len, const int batch_size, const int vocab_size,
491 |                                  const int random_num, const float prob_threshold, const int end_id)
492 |     {
493 |         if (threadIdx.x < batch_size)
494 |         {
495 |             // prompt phase, next_token[:] = prompt_tokens[:, cur_pos]
496 |             if (cur_pos < max_prompt_seq_len && prompt_tokens_mask[threadIdx.x * max_prompt_seq_len + cur_pos])
497 |             {
498 |                 ids[threadIdx.x] = prompt_tokens[threadIdx.x * max_prompt_seq_len + cur_pos];
499 |             }
500 |             else
501 |             {
502 |                 int tid = threadIdx.x;
503 |                 curandState_t local_state;
504 |                 curand_init(random_num, tid, 0, &local_state); // seed = random_num, one sequence per batch row
505 |                 float rand_num = curand_uniform(&local_state) * prob_threshold; // probability mass to consume before stopping
506 |                 ids[tid] = sorted_id_vals[tid * vocab_size + vocab_size - 1]; // fallback: last entry of THIS row's segment (previously always read row 0)
507 |                 // Walk this row's segment and pick the first entry at which the remaining mass drops to <= 0.
508 |                 for (int i = tid * vocab_size; i < tid * vocab_size + vocab_size; i++)
509 |                 {
510 |                     rand_num = rand_num - sorted_logits_probs[i];
511 |                     if (rand_num <= 0)
512 |                     {
513 |                         ids[tid] = sorted_id_vals[i];
514 |                         break;
515 |                     }
516 |                 }
517 |                 // Only this sampling branch touches sequence_length / finished_buf: the length grows for rows not yet finished, then the row is flagged when end_id was drawn.
518 |                 sequence_length[tid] = finished_buf[tid] ? sequence_length[tid] : sequence_length[tid] + 1;
519 |                 finished_buf[tid] = ids[tid] == end_id ? true : false;
520 |             }
521 |         }
522 |     }
523 | 
524 |     /**
525 |      * Returns the temporary buffer size (in bytes) that the top-p sort needs, obtained by a size-only call to cub::DeviceSegmentedRadixSort::SortPairsDescending
526 |      */
527 |     size_t getToppSortTempStorageSize(const float *__restrict__ log_probs,
528 |                                       const int *__restrict__ id_vals,
529 |                                       float *__restrict__ sorted_log_probs,
530 |                                       int *__restrict__ sorted_id_vals,
531 |                                       int *__restrict__ topp_offset_buf,
532 |                                       const int batch_size,
533 |                                       const int vocab_size)
534 |     {
535 | #ifndef NDEBUG
536 |         PRINT_FUNC_NAME_();
537 | #endif
538 |         void *d_temp_storage = NULL; // NULL storage pointer: the call below is a size query, no sorting is done
539 |         size_t temp_storage_bytes = 0;
540 |         // Same arguments as the real sort in launchTopPSamplingKernel: batch_size segments delimited by consecutive entries of topp_offset_buf.
541 |         cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage,
542 |                                                            temp_storage_bytes,
543 |                                                            log_probs,
544 |                                                            sorted_log_probs,
545 |                                                            id_vals,
546 |                                                            sorted_id_vals,
547 |                                                            vocab_size * batch_size,
548 |                                                            batch_size,
549 |                                                            topp_offset_buf, topp_offset_buf + 1);
550 |         return temp_storage_bytes;
551 |     }
552 |     // Top-p sampling step: sorts every row of logits_probs in descending order (ids carried along), then launches topPSampling. Both run on `stream`.
553 |     template <typename T>
554 |     void launchTopPSamplingKernel(const T *__restrict__ logits_probs, const int *__restrict__ id_vals, T *__restrict__ sorted_logits_probs,
555 |                                   int *__restrict__ sorted_id_vals, const int *__restrict__ topp_offset_buf, void *__restrict__ temp_storage,
556 |                                   size_t temp_storage_size, bool *__restrict__ finished_buf, const int *__restrict__ prompt_tokens,
557 |                                   const bool *__restrict__ prompt_tokens_mask, const int cur_pos, const int max_prompt_seq_len,
558 |                                   const int random_num, int *__restrict__ output_ids, int *__restrict__ sequence_length, const int end_id,
559 |                                   const int batch_size, const int vocab_size, const float probability_threshold, cudaStream_t stream)
560 |     {
561 | #ifndef NDEBUG
562 |         PRINT_FUNC_NAME_();
563 | #endif
564 |         cub::DeviceSegmentedRadixSort::SortPairsDescending(temp_storage,
565 |                                                            temp_storage_size,
566 |                                                            logits_probs,
567 |                                                            sorted_logits_probs,
568 |                                                            id_vals,
569 |                                                            sorted_id_vals,
570 |                                                            vocab_size * batch_size,
571 |                                                            batch_size,
572 |                                                            topp_offset_buf, topp_offset_buf + 1, 0, static_cast<int>(sizeof(T) * 8), stream);
573 |         // begin_bit / end_bit above cover the full key width (cub's defaults); they are spelled out only to reach the stream argument, so the sort is ordered before topPSampling on the same stream.
574 |         int local_block_size;
575 |         assert(batch_size <= 1024); // topPSampling uses a single block with one thread per batch row
576 |         if (batch_size <= 128)
577 |         {
578 |             local_block_size = 128;
579 |         }
580 |         else if (batch_size <= 256)
581 |         {
582 |             local_block_size = 256;
583 |         }
584 |         else if (batch_size <= 512)
585 |         {
586 |             local_block_size = 512;
587 |         }
588 |         else
589 |         {
590 |             local_block_size = 1024;
591 |         }
592 | 
593 |         topPSampling<<<1, local_block_size, 0, stream>>>(sorted_logits_probs, sorted_id_vals, output_ids, sequence_length,
594 |                                                          finished_buf, prompt_tokens, prompt_tokens_mask, cur_pos, max_prompt_seq_len,
595 |                                                          batch_size, vocab_size, random_num, probability_threshold, end_id);
596 |     }
597 |     // For batch row blockIdx.x, copies sequence_length[row] token ids from word_ids_buf into gen_ids, starting at step prompt_seq_lengths[row] - min_prompt_seq_len.
598 |     __global__ void removePromptTokenKernel(int *__restrict__ gen_ids, const int *__restrict__ word_ids_buf,
599 |                                             const int *__restrict__ sequence_length, const int *__restrict__ prompt_seq_lengths,
600 |                                             const int min_prompt_seq_len, const int batch_size, const int total_len)
601 |     {
602 |         const int offset = prompt_seq_lengths[blockIdx.x] - min_prompt_seq_len; // first step to copy for this row
603 |         for (int tid = threadIdx.x; tid < sequence_length[blockIdx.x]; tid += blockDim.x)
604 |         {
605 |             gen_ids[blockIdx.x * total_len + tid] = word_ids_buf[(offset + tid) * batch_size + blockIdx.x]; // source is indexed [step, row], destination [row, position] with row stride total_len
606 |         }
607 |     }
608 |     // Host launcher for removePromptTokenKernel: batch_size blocks of 256 threads, enqueued on `stream`.
609 |     void launchRemovePromptTokenKernel(int *__restrict__ gen_ids, const int *__restrict__ word_ids_buf, const int *__restrict__ sequence_length,
610 |                                        const int *__restrict__ prompt_seq_lengths, const int min_prompt_seq_len, const int batch_size, const int total_len, cudaStream_t stream)
611 |     {
612 | #ifndef NDEBUG
613 |         PRINT_FUNC_NAME_();
614 | #endif
615 |         removePromptTokenKernel<<<batch_size, 256, 0, stream>>>(gen_ids, word_ids_buf, sequence_length, prompt_seq_lengths, min_prompt_seq_len, batch_size, total_len);
616 |     }
617 |     // Explicit template instantiations of the launcher templates: ResNorm and EmbeddingLookup for float and half.
618 |     template void launchResNormKernel(float *output, const float *input, const float *gamma, const float eps,
619 |                                       const int m, const int n, cudaStream_t stream);
620 | 
621 |     template void launchResNormKernel(half *output, const half *input, const half *gamma, const float eps,
622 |                                       const int m, const int n, cudaStream_t stream);
623 | 
624 |     template void launchEmbeddingLookupKernel(float *__restrict__ from_tensor, const float *__restrict__ embedding_table,
625 |                                               const int *__restrict__ word_ids, const int batch_size, const int cur_seq_len,
626 |                                               const int max_len, const int hidden_units, cudaStream_t stream);
627 |     
628 |     template void launchEmbeddingLookupKernel(half *__restrict__ from_tensor, const half *__restrict__ embedding_table,
629 |                                               const int *__restrict__ word_ids, const int batch_size, const int cur_seq_len,
630 |                                               const int max_len, const int hidden_units, cudaStream_t stream);
631 |     // The two sampling launchers below are instantiated for float only.
632 |     template void launchTopKSamplingKernel(float *__restrict__ log_probs, int *__restrict__ topk_tmp_id_buf,
633 |                                            float *__restrict__ topk_tmp_val_buf, int *__restrict__ ids,
634 |                                            int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
635 |                                            const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
636 |                                            const int cur_pos, const int max_prompt_seq_len, int random_num, const int batch_size,
637 |                                            const int vocab_size, const int candidate_num, const int end_id, cudaStream_t stream);
638 | 
639 |     template void launchTopPSamplingKernel(const float *__restrict__ logits_probs, const int *__restrict__ id_vals,
640 |                                            float *__restrict__ sorted_logits_probs, int *__restrict__ sorted_id_vals,
641 |                                            const int *__restrict__ topp_offset_buf, void *__restrict__ temp_storage,
642 |                                            size_t temp_storage_size,
643 |                                            bool *__restrict__ finished_buf, const int *__restrict__ prompt_tokens,
644 |                                            const bool *__restrict__ prompt_tokens_mask, const int cur_pos, const int max_prompt_seq_len,
645 |                                            const int random_num, int *__restrict__ output_ids, int *__restrict__ sequence_length,
646 |                                            const int end_id,
647 |                                            const int batch_size, const int vocab_size, const float probability_threshold,
648 |                                            cudaStream_t stream);
649 | }


--------------------------------------------------------------------------------
/fasterLlama/cuda/decoding_kernels.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cub/cub.cuh>
  4 | #include "cuda_kernels.cuh"
  5 | 
  6 | namespace FasterLLaMA
  7 | {
  8 |     namespace // unnamed namespace: every translation unit including this header gets its own copy
  9 |     {
 10 |         constexpr int defalut_block_size = 256; // block size used by the CASE_K launch macro; the (misspelled) identifier is referenced there, rename both together
 11 |     }
 12 | 
 13 |     
 14 |      /** resNorm
 15 |       * grid(batch_size * seq_len)  block(128)
 16 |       * output: [batch_size, seq_len, hidden_units]
 17 |       * input: [batch_size, seq_len, hidden_units]
 18 |       * gamma: [hidden_units, ]
 19 |       */
 20 |      template <typename DataType>
 21 |      __global__ void resNormKernel(DataType *__restrict__ output, const DataType *__restrict__ input,
 22 |                                    const DataType *__restrict__ gamma, const float eps, const int hidden_units);
 23 |      // Explicit specialization for half, declared here so that every includer sees it before use.
 24 |      template <>
 25 |      __global__ void resNormKernel(half *__restrict__ output, const half *__restrict__ input,
 26 |                                    const half *__restrict__ gamma, const float eps, const int hidden_units);
 27 |      // Host launcher; m and n are presumably the row count and hidden_units - TODO confirm against the definition. stream defaults to 0.
 28 |      template <typename DataType>
 29 |      void launchResNormKernel(DataType *output, const DataType *input, const DataType *gamma, const float eps,
 30 |                               const int m, const int n, cudaStream_t stream = 0);
 31 | 
 32 |     /** precomputeFreqsCis
 33 |       * grid(seq_len)  block(block_size) for size_per_head/2 >= block_size(128)
 34 |       * freq_cis: [seq_len, size_per_head]
 35 |       */
 36 |      __global__ void precomputeFreqsCis(float *freq_cis, const int size_per_head);
 37 | 
 38 |      /**
 39 |       * block(32, 4): each warp computes one row
 40 |       */
 41 |      __global__ void warpPrecomputeFreqsCis(float *freq_cis, const int size_per_head, const int seq_len);
 42 |      // Host-side entry point for the freq_cis precomputation; stream defaults to 0.
 43 |      void launchPrecomputeFreqsCis(float *freq_cis, const int size_per_head, const int seq_len, cudaStream_t stream = 0);
 44 | 
 45 |     /**
 46 |      * Initialization for top-k sampling: decoding_params.sequence_length is initialized to 0
 47 |      * and finished_buf_ is initialized to false
 48 |      */
 49 |     __global__ void topKSamplingInitKernel(bool *__restrict__ finished, int *__restrict__ sequence_length, const int batch_size);
 50 |     // Host-side entry point for the initialization above; stream defaults to 0.
 51 |     void launchTopKSamplingInitKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
 52 |                                       const int batch_size, cudaStream_t stream = 0);
 53 | 
 54 |     /**
 55 |      * Initialization for top-p sampling: decoding_params.sequence_length is initialized to 0
 56 |      * finished_buf_ is initialized to false
 57 |      * topp_offset_buf is initialized to [0, vocab_size, ..., batch_size * vocab_size]
 58 |      * topp_id_val_buf is initialized to [[0, 1, ..., vocab_size-1], [0, 1, ..., vocab_size-1], ..., [0, 1, ..., vocab_size-1]]
 59 |      */
 60 |     __global__ void topPInitializationKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
 61 |                                              int *__restrict__ topp_id_val_buf, int *__restrict__ topp_offset_buf,
 62 |                                              const int batch_size, const int vocab_size);
 63 |     // Host-side entry point for the initialization above; stream defaults to 0.
 64 |     void launchTopPInitializationKernel(bool *__restrict__ finished, int *__restrict__ sequence_length,
 65 |                                         int *__restrict__ topp_id_val_buf, int *__restrict__ topp_offset_buf,
 66 |                                         const int batch_size, const int vocab_size, cudaStream_t stream = 0);
 67 |     // Embedding lookup kernel, launched with grid(cur_seq_len, batch_size); the parameter list matches the definition and the 5-argument launch (max_len included).
 68 |     template <typename T>
 69 |     __global__ void embeddingLookupKernel(T *__restrict__ from_tensor, const T *__restrict__ embedding_table,
 70 |                                           const int *__restrict__ word_ids, const int max_len, const int hidden_units);
 71 |     // Explicit specialization for half, declared here so that every includer sees it before use.
 72 |     template <>
 73 |     __global__ void embeddingLookupKernel(half *__restrict__ from_tensor, const half *__restrict__ embedding_table,
 74 |                                           const int *__restrict__ word_ids, const int max_len, const int hidden_units);
 75 |     // Host launcher for embeddingLookupKernel<T>; stream defaults to 0.
 76 |     template <typename T>
 77 |     void launchEmbeddingLookupKernel(T *__restrict__ from_tensor, const T *__restrict__ embedding_table, const int *__restrict__ word_ids,
 78 |                                      const int batch_size, const int cur_seq_len, const int max_len, const int hidden_units,
 79 |                                      cudaStream_t stream = 0);
 80 | 
 81 |     /** Copy logits[:, -1, :] into step_logits and, along the way, apply the end-of-sequence (finished) handling
 82 |      * grid(batch_size), block(min(vocab_size, 1024))
 83 |      * step_logits: [batch_size, 1, vocab_size]
 84 |      * logits: [batch_size, seq_len, vocab_size]
 85 |      * finished: [batch_size, 1]
 86 |      */
 87 |     __global__ void updateLogitsWithoutSoftmax(float *__restrict__ step_logits, const float *__restrict__ logits, const int end_id,
 88 |                                                const bool *__restrict__ finished, const int seq_len, const int vocab_size);
 89 |     // Host launcher for the kernel above; stream defaults to 0.
 90 |     void launchUpdateLogitsWithoutSoftmax(float *__restrict__ step_logits, const float *__restrict__ logits, const int end_id,
 91 |                                           const bool *__restrict__ finished, const int batch_size, const int seq_len,
 92 |                                           const int vocab_size, cudaStream_t stream = 0);
 93 |     // Fixed-capacity list of the MAX_K best (value, id) pairs seen so far, best first.
 94 |     template <typename T, int MAX_K>
 95 |     struct TopK
 96 |     {
 97 |         int p[MAX_K]; // ids; -1 marks an empty slot (see init)
 98 |         T u[MAX_K];   // values, parallel to p
 99 |         // Offers one (elem, elem_id) pair: it replaces the last slot when it beats it, then is moved towards the front by one pass of adjacent swaps.
100 |         __device__ __forceinline__ void insert(T elem, int elem_id)
101 |         {
102 |             if (elem > u[MAX_K - 1] || (p[MAX_K - 1] == -1) || ((elem == u[MAX_K - 1]) && (elem_id < p[MAX_K - 1])))
103 |             // if (elem > u[MAX_K-1] || ((elem == u[MAX_K-1]) && (elem_id < p[MAX_K-1])))
104 |             {
105 |                 u[MAX_K - 1] = elem;
106 |                 p[MAX_K - 1] = elem_id;
107 |             }
108 |             // Swap slot k+1 ahead of slot k when it has the larger value, when slot k is empty, or on equal values when it has the smaller id.
109 |             for (int k = MAX_K - 2; k >= 0; --k)
110 |             {
111 |                 if ((u[k + 1] > u[k]) || (p[k] == -1) || ((u[k + 1] == u[k]) && (p[k + 1] < p[k])))
112 |                 // if ((u[k+1] > u[k]) || ((u[k+1] == u[k])&&(p[k+1] < p[k])))
113 |                 {
114 |                     T u2 = u[k];
115 |                     int p2 = p[k];
116 |                     u[k] = u[k + 1];
117 |                     p[k] = p[k + 1];
118 |                     u[k + 1] = u2;
119 |                     p[k + 1] = p2;
120 |                 }
121 |             }
122 |         }
123 |         // Resets every slot to empty: id -1, value -FLT_MAX.
124 |         __device__ __forceinline__ void init()
125 |         {
126 | #pragma unroll
127 |             for (int i = 0; i < MAX_K; i++)
128 |             {
129 |                 p[i] = -1;
130 |                 u[i] = -FLT_MAX;
131 |             }
132 |         }
133 |     };
134 |     // Binary merge of two TopK lists (used as the reduction operator of cub::BlockReduce in beam_topK_kernel).
135 |     template <typename T, int MAX_K>
136 |     __device__ __forceinline__ TopK<T, MAX_K> reduce_topk_op(const TopK<T, MAX_K> &a, const TopK<T, MAX_K> &b)
137 |     {
138 |         TopK<T, MAX_K> res = a; // start from a copy of a ...
139 |         for (int i = 0; i < MAX_K; ++i)
140 |             res.insert(b.u[i], b.p[i]); // ... and offer every entry of b to it
141 |         return res;
142 |     }
143 | 
144 |     /**
145 |      * top-k sampling kernel: thread i chooses the next token of batch row i
146 |      * grid(1), block(>= batch_size)
147 |      */
148 |     template <typename T>
149 |     __global__ void topKSampling(int *__restrict__ topk_tmp_id_buf, T *__restrict__ topk_tmp_val_buf, int *__restrict__ ids,
150 |                                  int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
151 |                                  const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
152 |                                  const int cur_pos, const int max_prompt_seq_len, const int candidate_num,
153 |                                  const int random_num, const int end_id, const int batch_size, const int vocab_size);
154 |     // Per-row top-MAX_K selection kernel; launched by CASE_K with one block of THREADBLOCK_SIZE threads per batch row.
155 |     template <typename T, int MAX_K, int THREADBLOCK_SIZE>
156 |     __launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_topK_kernel(const T *__restrict__ log_probs,
157 |                                                                          int *__restrict__ topk_tmp_id_buf,
158 |                                                                          T *__restrict__ topk_tmp_val_buf,
159 |                                                                          const int vocab_size,
160 |                                                                          T diversity_rate);
161 | // switch-case helper: launches beam_topK_kernel<T, K, defalut_block_size> with diversity_rate 0.0f; expects T, batch_size, stream, log_probs, topk_tmp_id_buf, topk_tmp_val_buf and vocab_size in the expanding scope.
162 | #define CASE_K(K)                                                                                                                                       \
163 |     case K:                                                                                                                                             \
164 |         beam_topK_kernel<T, K, defalut_block_size><<<batch_size, defalut_block_size, 0, stream>>>(log_probs,                                            \
165 |                                                                                                   topk_tmp_id_buf, topk_tmp_val_buf, vocab_size, 0.0f); \
166 |         break;
167 |     // Host launcher for one top-k sampling step (candidate selection followed by topKSampling); stream defaults to 0.
168 |     template <typename T>
169 |     void launchTopKSamplingKernel(T *__restrict__ log_probs, int *__restrict__ topk_tmp_id_buf, T *__restrict__ topk_tmp_val_buf,
170 |                                   int *__restrict__ ids, int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
171 |                                   const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
172 |                                   const int cur_pos, const int max_prompt_seq_len, int random_num, const int batch_size,
173 |                                   const int vocab_size, const int candidate_num, const int end_id, cudaStream_t stream = 0);
174 |     // Copies the last-position logits of each batch row into step_logits and converts the row to softmax probabilities.
175 |     __global__ void updateLogitsKernelWithoutLog(float *__restrict__ step_logits, const float *__restrict__ logits,
176 |                                                  const bool *__restrict__ finished, const int seq_len, const int end_id,
177 |                                                  const int vocab_size);
178 |     // Host launcher for the kernel above; stream defaults to 0.
179 |     void launchUpdateLogitsKernelWithoutLog(float *__restrict__ step_logits, const float *__restrict__ logits,
180 |                                             const bool *__restrict__ finished, const int seq_len, const int end_id,
181 |                                             const int batch_size, const int vocab_size, cudaStream_t stream = 0);
182 | 
183 |     /**
184 |      * top-p sampling kernel (takes the sorted probabilities, their ids and a probability threshold)
185 |      * grid(1), block(batch_size)
186 |      */
187 |     template <typename T>
188 |     __global__ void topPSampling(const T *__restrict__ sorted_logits_probs, const int *__restrict__ sorted_id_vals,
189 |                                  int *__restrict__ ids, int *__restrict__ sequence_length, bool *__restrict__ finished_buf,
190 |                                  const int *__restrict__ prompt_tokens, const bool *__restrict__ prompt_tokens_mask,
191 |                                  const int cur_pos, const int max_prompt_seq_len, const int batch_size, const int vocab_size,
192 |                                  const int random_num, const float prob_threshold, const int end_id);
193 | 
194 |     /**
195 |      * Returns the temporary-storage size needed by the top-p sort, obtained by calling cub::DeviceSegmentedRadixSort::SortPairsDescending.
196 |      */
197 |     size_t getToppSortTempStorageSize(const float *__restrict__ log_probs, const int *__restrict__ id_vals,
198 |                                       float *__restrict__ sorted_log_probs, int *__restrict__ sorted_id_vals,
199 |                                       int *__restrict__ topp_offset_buf, const int batch_size, const int vocab_size);
200 |     // Host launcher for top-p sampling (declaration only); takes the sort scratch (`temp_storage`, `temp_storage_size`) and the threshold; the caller passes cur_pos as `random_num`.
201 |     template <typename T>
202 |     void launchTopPSamplingKernel(const T *__restrict__ logits_probs, const int *__restrict__ id_vals, T *__restrict__ sorted_logits_probs,
203 |                                   int *__restrict__ sorted_id_vals, const int *__restrict__ topp_offset_buf, void *__restrict__ temp_storage,
204 |                                   size_t temp_storage_size, bool *__restrict__ finished_buf, const int *__restrict__ prompt_tokens,
205 |                                   const bool *__restrict__ prompt_tokens_mask, const int cur_pos, const int max_prompt_seq_len,
206 |                                   const int random_num, int *__restrict__ output_ids, int *__restrict__ sequence_length, const int end_id,
207 |                                   const int batch_size, const int vocab_size, const float probability_threshold, cudaStream_t stream = 0);
208 |     // CUDA kernel declaration; presumably the device side of launchRemovePromptTokenKernel (fills gen_ids from word_ids_buf) - TODO confirm against the definition.
209 |     __global__ void removePromptTokenKernel(int *__restrict__ gen_ids, const int *__restrict__ word_ids_buf,
210 |                                             const int *__restrict__ sequence_length, const int *__restrict__ prompt_seq_lengths,
211 |                                             const int min_prompt_seq_len, const int batch_size, const int total_len);
212 |     // Host launcher called at the end of DecodingSampling::forward to turn word_ids_buf into gen_ids with the prompt tokens removed; `stream` defaults to 0.
213 |     void launchRemovePromptTokenKernel(int *__restrict__ gen_ids, const int *__restrict__ word_ids_buf, const int *__restrict__ sequence_length,
214 |                                        const int *__restrict__ prompt_seq_lengths, const int min_prompt_seq_len, const int batch_size,
215 |                                        const int total_len, cudaStream_t stream = 0);
216 | }
217 | 


--------------------------------------------------------------------------------
/fasterLlama/cuda/decoding_sampling.cu:
--------------------------------------------------------------------------------
  1 | #include "decoding_sampling.h"
  2 | #include "decoding_kernels.cuh"
  3 | 
  4 | namespace FasterLLaMA
  5 | {
  6 |     // Validates the sampling mode, sizes every device buffer, allocates them as one slab from `allocator` and carves the slab into typed views.
  7 |     template <OperationType OpType_>
  8 |     DecodingSampling<OpType_>::DecodingSampling(const IAllocator &allocator, const int batch_size,
  9 |                                                 const int max_prompt_len, const int max_gen_len,
 10 |                                                 const int head_num, const int size_per_head,
 11 |                                                 const int vocab_size, const int decoder_layers,
 12 |                                                 const int end_id, const int ffn_hidden_units,
 13 |                                                 const int candidate_num, const float probability_threshold)
 14 |         : allocator_(allocator)
 15 |     {
 16 |         args_.batch_size_ = batch_size;
 17 |         args_.max_prompt_len_ = max_prompt_len;
 18 |         args_.max_gen_len_ = max_gen_len;
 19 |         args_.head_num_ = head_num;
 20 |         args_.size_per_head_ = size_per_head;
 21 |         args_.hidden_units_ = head_num * size_per_head;
 22 |         args_.decoder_layers_ = decoder_layers;
 23 |         args_.vocab_size_ = vocab_size;
 24 |         args_.candidate_num_ = candidate_num;
 25 |         args_.probability_threshold_ = probability_threshold;
 26 |         args_.end_id_ = end_id;
 27 |         args_.ffn_hidden_units_ = ffn_hidden_units;
 28 | 
 29 |         // Exactly one of top-k (candidate_num != 0) or top-p (probability_threshold != 0) must be selected; otherwise the process exits.
 30 |         if (args_.candidate_num_ == 0 && args_.probability_threshold_ == 0.0)
 31 |         {
 32 |             printf("[ERROR] Candidate_num for topk is 0 and probability threshold for top p is 0.0 \n");
 33 |             exit(-1);
 34 |         }
 35 |         else if (args_.candidate_num_ != 0 && args_.probability_threshold_ != 0.0)
 36 |         {
 37 |             printf("[ERROR] Candidate_num for topk is not 0 and probability threshold for top p is not 0.0 \n");
 38 |             exit(-1);
 39 |         }
 40 | #ifndef NDEBUG
 41 |         PRINT_FUNC_NAME_();
 42 | #endif
 43 |         // One decoder object is shared by all layers; its second template argument is fixed to INT8 here.
 44 |         decoder_ = new OpenDecoder<OpType_, OperationType::INT8>(batch_size, max_prompt_len, max_gen_len, head_num, size_per_head, ffn_hidden_units);
 45 | 
 46 |         int from_tensor_size = args_.batch_size_ * args_.max_prompt_len_ * args_.hidden_units_; // type T
 47 |         int decoder_workspace_size = decoder_->getWorkspaceSize();                               // bytes
 48 | #ifndef NDEBUG
 49 |         printf("[FL][INFO] the decoder workspace size: %d MB\n", decoder_workspace_size / 1024 / 1024);
 50 | #endif
 51 |         int decoder_normed_result_buf_size = args_.batch_size_ * args_.max_prompt_len_ * args_.hidden_units_; // type T
 52 |         size_t cache_size = static_cast<size_t>(args_.batch_size_) * (args_.max_prompt_len_ + args_.max_gen_len_) * args_.hidden_units_; // type float, per layer; size_t so the per-layer products below cannot overflow int
 53 | 
 54 |         int logits_buf_size = args_.batch_size_ * args_.max_prompt_len_ * args_.vocab_size_;      // type float
 55 |         int step_logits_buf_size = args_.batch_size_ * args_.vocab_size_;                         // type float
 56 |         int word_ids_buf_size = args_.batch_size_ * (args_.max_prompt_len_ + args_.max_gen_len_); // type int
 57 |         int finished_buf_size = args_.batch_size_;                                                // type bool
 58 | 
 59 |         int topk_ids_buf_size = args_.batch_size_ * args_.candidate_num_; // type int
 60 |         int topk_val_buf_size = args_.batch_size_ * args_.candidate_num_; // type float
 61 |         int topp_id_vals_buf_size = args_.batch_size_ * args_.vocab_size_;
 62 |         int topp_sorted_logits_prob_buf_size = args_.batch_size_ * args_.vocab_size_;
 63 |         int topp_sorted_id_vals_buf_size = args_.batch_size_ * args_.vocab_size_;
 64 | 
 65 |         // Round element counts up (to a multiple of 4, or of 32 for the bool flags) to prevent misaligned addresses in the carved slab.
 66 |         logits_buf_size = (int)(ceil(logits_buf_size / 4.)) * 4;
 67 |         step_logits_buf_size = (int)(ceil(step_logits_buf_size / 4.)) * 4;
 68 |         word_ids_buf_size = (int)(ceil(word_ids_buf_size / 4.)) * 4;
 69 |         finished_buf_size = (int)(ceil(finished_buf_size / 32.)) * 32;
 70 | 
 71 |         topk_ids_buf_size = (int)(ceil(topk_ids_buf_size / 4.)) * 4;
 72 |         topk_val_buf_size = (int)(ceil(topk_val_buf_size / 4.)) * 4;
 73 |         topp_id_vals_buf_size = (int)(ceil(topp_id_vals_buf_size / 4.)) * 4;
 74 |         topp_sorted_logits_prob_buf_size = (int)(ceil(topp_sorted_logits_prob_buf_size / 4.)) * 4;
 75 |         topp_sorted_id_vals_buf_size = (int)(ceil(topp_sorted_id_vals_buf_size / 4.)) * 4;
 76 |         // NOTE(review): the *_buf_ members passed below are not assigned yet (they are carved further down); presumably the helper is a pure size query that never dereferences them - confirm.
 77 |         args_.temp_storage_size_ = getToppSortTempStorageSize(step_logits_buf_, topp_id_vals_buf_, topp_sorted_logits_prob_buf_,
 78 |                                                               topp_sorted_id_vals_buf_, topp_offset_buf_,
 79 |                                                               args_.batch_size_, args_.vocab_size_);
 80 | 
 81 |         int topp_offset_buf_size = args_.batch_size_ + 1;
 82 |         args_.temp_storage_size_ = (int)(ceil(args_.temp_storage_size_ / 4.)) * 4;
 83 |         topp_offset_buf_size = (int)(ceil(topp_offset_buf_size / 4.)) * 4;
 84 |         // Element totals per element type; the float total is size_t because the K and V caches (cache_size * 2 * decoder_layers) can exceed INT_MAX.
 85 |         int datatype_buf_size = from_tensor_size * 2 + decoder_normed_result_buf_size;
 86 |         size_t float_buf_size = cache_size * 2 * args_.decoder_layers_ + logits_buf_size + step_logits_buf_size + topk_val_buf_size +
 87 |                                 topp_sorted_logits_prob_buf_size;
 88 |         int int_buf_size = word_ids_buf_size + topk_ids_buf_size + topp_id_vals_buf_size + topp_sorted_id_vals_buf_size +
 89 |                            topp_offset_buf_size;
 90 | 
 91 | #ifndef NDEBUG
 92 |         size_t d_mem_size = sizeof(DataType_) * datatype_buf_size +
 93 |                             sizeof(float) * float_buf_size +
 94 |                             sizeof(int) * int_buf_size +
 95 |                             sizeof(bool) * finished_buf_size +
 96 |                             sizeof(char) * decoder_workspace_size +
 97 |                             args_.temp_storage_size_;
 98 | 
 99 |         printf("[FL][INFO] the decoding sampling device memory : %zu MB\n", d_mem_size / 1024 / 1024);
100 | #endif
101 | 
102 |         buf_ = reinterpret_cast<void *>(allocator_.malloc(
103 |             sizeof(DataType_) * datatype_buf_size +
104 |             sizeof(float) * float_buf_size +
105 |             sizeof(int) * int_buf_size +
106 |             sizeof(bool) * finished_buf_size +
107 |             sizeof(char) * decoder_workspace_size +
108 |             args_.temp_storage_size_));
109 | 
110 | #ifndef NDEBUG
111 |         printf("device memory for buf_ is mallocated\n");
112 | #endif
113 |         // Carve the slab back to back; every buffer counted in the byte total above appears exactly once below.
114 |         from_tensor_[0] = (DataType_ *)buf_;
115 |         from_tensor_[1] = (DataType_ *)(from_tensor_[0] + from_tensor_size);
116 | 
117 |         /* K V buffer */
118 |         K_cache_ = (float *)(from_tensor_[1] + from_tensor_size);
119 |         V_cache_ = (float *)(K_cache_ + cache_size * args_.decoder_layers_);
120 | 
121 |         decoder_buf_ = (char *)(V_cache_ + cache_size * args_.decoder_layers_);
122 |         decoder_normed_result_buf_ = (DataType_ *)(decoder_buf_ + decoder_workspace_size);
123 |         logits_buf_ = (float *)(decoder_normed_result_buf_ + decoder_normed_result_buf_size);
124 |         step_logits_buf_ = (float *)(logits_buf_ + logits_buf_size);
125 |         word_ids_buf_ = (int *)(step_logits_buf_ + step_logits_buf_size);
126 |         finished_buf_ = (bool *)(word_ids_buf_ + word_ids_buf_size);
127 |         topk_ids_buf_ = (int *)(finished_buf_ + finished_buf_size);
128 |         topk_val_buf_ = (float *)(topk_ids_buf_ + topk_ids_buf_size);
129 |         topp_id_vals_buf_ = (int *)(topk_val_buf_ + topk_val_buf_size);
130 |         topp_sorted_id_vals_buf_ = (int *)(topp_id_vals_buf_ + topp_id_vals_buf_size);
131 |         topp_offset_buf_ = (int *)(topp_sorted_id_vals_buf_ + topp_sorted_id_vals_buf_size);
132 |         topp_sorted_logits_prob_buf_ = (float *)(topp_offset_buf_ + topp_offset_buf_size);
133 |         temp_storage_ = (void *)(topp_sorted_logits_prob_buf_ + topp_sorted_logits_prob_buf_size);
134 |         // Host mirror of finished_buf_, read back once per step in forward().
135 |         h_finished_buf_ = new bool[finished_buf_size];
136 | 
137 |         if (Traits_::OpType == OperationType::FP32)
138 |         {
139 |             cublasAlgo_[0] = CUBLAS_GEMM_DEFAULT;
140 |         }
141 |         else
142 |         {
143 |             cublasAlgo_[0] = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
144 |         }
145 |     }
146 |     // Runs generation: embeds the prompt, then loops one position per step (decoder layers -> norm -> logits GEMM -> sampling) until all rows finish or the length budget ends.
147 |     template <OperationType OpType_>
148 |     void DecodingSampling<OpType_>::forward(const DecoderInitParam<DataType_, int8_t> *param,
149 |                                             DecodingInitParam<DataType_> decoding_params)
150 |     {
151 | #ifndef NDEBUG
152 |         PRINT_FUNC_NAME_();
153 | #endif
154 | 
155 |         if (args_.candidate_num_ != 0)
156 |         {
157 |             /**
158 |              * decoding_params.sequence_length is initialized by 0
159 |              * finished_buf_ is initialized by false
160 |              */
161 |             launchTopKSamplingInitKernel(finished_buf_, decoding_params.sequence_length, args_.batch_size_, decoding_params.stream);
162 |         }
163 |         else if (args_.probability_threshold_ != 0.0)
164 |         {
165 |             /**
166 |              * decoding_params.sequence_length is initialized by 0
167 |              * finished_buf_ is initialized by false
168 |              * topp_offset_buf is initialized by [0, vocab_size, ..., batch_size * vocab_size]
169 |              * topp_id_val_buf is initialized by [[0, 1, ..., vocab_size-1], [0, 1, ..., vocab_size-1], ..., [0, 1, ..., vocab_size-1]]
170 |              */
171 |             launchTopPInitializationKernel(finished_buf_, decoding_params.sequence_length, topp_id_vals_buf_, topp_offset_buf_,
172 |                                            args_.batch_size_, args_.vocab_size_, decoding_params.stream);
173 |         }
174 | 
175 | #ifndef NDEBUG
176 |         cudaDeviceSynchronize();
177 |         CHECK_CUDA_ERROR(cudaGetLastError());
178 | #endif
179 |         // Per-layer K/V cache stride in floats; size_t so that layer * cache_size below cannot overflow int.
180 |         size_t cache_size = static_cast<size_t>(args_.batch_size_) * (args_.max_prompt_len_ + args_.max_gen_len_) * args_.hidden_units_; // type float
181 | 
182 |         int min_prompt_seq_len = min(args_.max_prompt_len_, decoding_params.min_prompt_seq_len);
183 |         int max_prompt_seq_len = decoding_params.max_prompt_seq_len;
184 |         assert(max_prompt_seq_len <= args_.max_prompt_len_);
185 |         int total_seq_len = max_prompt_seq_len + args_.max_gen_len_;
186 | 
187 |         /**
188 |          * init the freq_cis matrix, the freq_cis are only related to size_per_head
189 |          */
190 |         launchPrecomputeFreqsCis(decoding_params.freq_cis, args_.size_per_head_, total_seq_len, decoding_params.stream);
191 | 
192 | #ifndef NDEBUG
193 |         cudaDeviceSynchronize();
194 |         CHECK_CUDA_ERROR(cudaGetLastError());
195 | #endif
196 |         int prev_pos = 0;
197 |         for (int cur_pos = min_prompt_seq_len; cur_pos < total_seq_len; ++cur_pos)
198 |         {
199 |             int cur_seq_len = cur_pos - prev_pos;        // min_prompt_seq_len on the first iteration, 1 afterwards
200 |             int step = cur_pos - min_prompt_seq_len + 1; // 1-based generation step
201 | 
202 |             /**
203 |              * Embedding Lookup
204 |              */
205 |             if (cur_pos == min_prompt_seq_len)
206 |             {
207 | #ifndef NDEBUG
208 |                 printf("[FL][INFO] prompt tokens embedding lookup\n");
209 | #endif
210 |                 // prompt phase, prompt_tokens[:, :cur_pos] is embedded to from_tensor which shape is [batch_size, cur_seq_len, hidden_units]
211 |                 launchEmbeddingLookupKernel(from_tensor_[0], decoding_params.embedding_table, decoding_params.prompt_tokens,
212 |                                             args_.batch_size_, cur_seq_len, decoding_params.max_prompt_seq_len,
213 |                                             args_.hidden_units_, decoding_params.stream);
214 |             }
215 |             else
216 |             {
217 | #ifndef NDEBUG
218 |                 printf("[FL][INFO] step: %d tokens embedding lookup\n", step);
219 | #endif
220 |                 // generation phase, word_ids_buf_ is embedded to from_tensor which shape is [batch_size, hidden_units]
221 |                 launchEmbeddingLookupKernel(from_tensor_[0], decoding_params.embedding_table,
222 |                                             word_ids_buf_ + (step - 2) * args_.batch_size_,
223 |                                             args_.batch_size_, 1, 1, args_.hidden_units_, decoding_params.stream);
224 |             }
225 | 
226 | #ifndef NDEBUG
227 |             cudaDeviceSynchronize();
228 |             CHECK_CUDA_ERROR(cudaGetLastError());
229 | #endif
230 |             // Defaults of 0 keep from_tensor_[out_id] (read after the loop) pointing at the embedding output if there are no decoder layers.
231 |             int from_id = 0, out_id = 0;
232 |             for (int layer = 0; layer < args_.decoder_layers_; ++layer)
233 |             {
234 |                 /*
235 |                   For the first layer (layer-0), from_id is 0. We also stored the embedding lookup
236 |                   result in from_tensor_[0]
237 |                 */
238 |                 from_id = layer & 0x1;
239 |                 out_id = 1 - from_id;
240 | 
241 |                 /*
242 |                   We use one decoder_ object to process multiple decoder layers.
243 | 
244 |                   At the beginning of each decoder layer, we initialize the decoder object
245 |                   with corresponding weights and decoder_buf_.
246 | 
247 |                   The decoder_buf_ is reused.
248 |                 */
249 |                 decoder_->initialize(param[layer], decoder_buf_);
250 | 
251 | #ifndef NDEBUG
252 |                 cudaDeviceSynchronize();
253 |                 CHECK_CUDA_ERROR(cudaGetLastError());
254 | #endif
255 |                 decoder_->forward(from_tensor_[from_id], decoding_params.freq_cis,
256 |                                   K_cache_ + layer * cache_size,
257 |                                   V_cache_ + layer * cache_size,
258 |                                   from_tensor_[out_id], prev_pos, cur_seq_len);
259 | 
260 | #ifndef NDEBUG
261 |                 cudaDeviceSynchronize();
262 |                 CHECK_CUDA_ERROR(cudaGetLastError());
263 | #endif
264 |             }
265 | 
266 |             launchResNormKernel(decoder_normed_result_buf_, from_tensor_[out_id], decoding_params.decodingnorm.gamma,
267 |                                 decoding_params.decodingnorm.eps, args_.batch_size_ * cur_seq_len,
268 |                                 args_.hidden_units_, decoding_params.stream);
269 | 
270 | #ifndef NDEBUG
271 |             cudaDeviceSynchronize();
272 |             CHECK_CUDA_ERROR(cudaGetLastError());
273 | #endif
274 |             // Output projection: normed hidden states (m x k) times the output weight (k x n) into logits_buf_ (m x n, float).
275 |             float alpha = 1.0f;
276 |             float beta = 0.0f;
277 |             int m = args_.batch_size_ * cur_seq_len;
278 |             int k = args_.hidden_units_;
279 |             int n = args_.vocab_size_;
280 | 
281 |             CHECK_CUBLAS_STATUS(cublasGemmEx(decoding_params.cublas_handle,
282 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
283 |                                              n, m, k,
284 |                                              &alpha,
285 |                                              decoding_params.output_weight.kernel, AType_, n,
286 |                                              decoder_normed_result_buf_, BType_, k,
287 |                                              &beta,
288 |                                              logits_buf_, CUDA_R_32F, n,
289 |                                              CUBLAS_COMPUTE_32F,
290 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
291 | 
292 | #ifndef NDEBUG
293 |             cudaDeviceSynchronize();
294 |             CHECK_CUDA_ERROR(cudaGetLastError());
295 | #endif
296 | 
297 |             if (args_.candidate_num_ != 0)
298 |             {
299 |                 // top k sampling
300 |                 // step_logits_buf_ = logits_buf[:, -1, :], and set the logits component corresponding to end_id to the maximum value
301 |                 launchUpdateLogitsWithoutSoftmax(step_logits_buf_, logits_buf_, args_.end_id_, finished_buf_, args_.batch_size_,
302 |                                                  cur_seq_len, args_.vocab_size_, decoding_params.stream);
303 | 
304 | #ifndef NDEBUG
305 |                 cudaDeviceSynchronize();
306 |                 CHECK_CUDA_ERROR(cudaGetLastError());
307 | #endif
308 | 
309 |                 launchTopKSamplingKernel(step_logits_buf_, topk_ids_buf_, topk_val_buf_,
310 |                                          word_ids_buf_ + (step - 1) * args_.batch_size_, decoding_params.sequence_length,
311 |                                          finished_buf_, decoding_params.prompt_tokens, decoding_params.prompt_tokens_mask,
312 |                                          cur_pos, max_prompt_seq_len,
313 |                                          cur_pos, // used as a random seed
314 |                                          args_.batch_size_, args_.vocab_size_, args_.candidate_num_, args_.end_id_, decoding_params.stream);
315 | 
316 | #ifndef NDEBUG
317 |                 cudaDeviceSynchronize();
318 |                 CHECK_CUDA_ERROR(cudaGetLastError());
319 | #endif
320 |             }
321 |             else if (args_.probability_threshold_ != 0.0)
322 |             {
323 |                 // top p sampling
324 |                 // step_logits_buf_ = logits_buf[:, -1, :], set the logits component corresponding to end_id to the maximum value, softmax
325 |                 launchUpdateLogitsKernelWithoutLog(step_logits_buf_, logits_buf_, finished_buf_, cur_seq_len, args_.end_id_,
326 |                                                    args_.batch_size_, args_.vocab_size_, decoding_params.stream);
327 | 
328 | #ifndef NDEBUG
329 |                 cudaDeviceSynchronize();
330 |                 CHECK_CUDA_ERROR(cudaGetLastError());
331 | #endif
332 | 
333 |                 launchTopPSamplingKernel(step_logits_buf_, topp_id_vals_buf_, topp_sorted_logits_prob_buf_, topp_sorted_id_vals_buf_,
334 |                                          topp_offset_buf_, temp_storage_, args_.temp_storage_size_, finished_buf_, decoding_params.prompt_tokens,
335 |                                          decoding_params.prompt_tokens_mask, cur_pos, max_prompt_seq_len,
336 |                                          cur_pos, // used as a random seed
337 |                                          word_ids_buf_ + (step - 1) * args_.batch_size_, decoding_params.sequence_length,
338 |                                          args_.end_id_, args_.batch_size_, args_.vocab_size_, args_.probability_threshold_,
339 |                                          decoding_params.stream);
340 | 
341 | #ifndef NDEBUG
342 |                 cudaDeviceSynchronize();
343 |                 CHECK_CUDA_ERROR(cudaGetLastError());
344 | #endif
345 |             }
346 | 
347 |             prev_pos = cur_pos;
348 | 
349 | #ifndef NDEBUG
350 |             cudaDeviceSynchronize();
351 |             CHECK_CUDA_ERROR(cudaGetLastError());
352 | #endif
353 | 
354 |             // TODO: find a better method to check is_finished.
355 |             // Copy the per-row finished flags to the host; the copy's status is checked instead of being silently dropped.
356 |             CHECK_CUDA_ERROR(cudaMemcpy(h_finished_buf_, finished_buf_, sizeof(bool) * args_.batch_size_, cudaMemcpyDeviceToHost));
357 |             int sum = 0;
358 |             for (int i = 0; i < args_.batch_size_; i++)
359 |             {
360 |                 sum += h_finished_buf_[i] ? 1 : 0;
361 |             }
362 |             if (sum == args_.batch_size_) // every row in the batch is finished
363 |             {
364 |                 printf("the batch stopped\n");
365 |                 break;
366 |             }
367 |         }
368 | 
369 |         /**
370 |          * word_ids_buf_ -> output_ids, remove the token in the prompt section
371 |          */
372 |         launchRemovePromptTokenKernel(decoding_params.output_ids, word_ids_buf_, decoding_params.sequence_length,
373 |                                       decoding_params.prompt_sequence_length, decoding_params.min_prompt_seq_len,
374 |                                       args_.batch_size_, args_.max_prompt_len_ + args_.max_gen_len_, decoding_params.stream);
375 | #ifndef NDEBUG
376 |         cudaDeviceSynchronize();
377 |         CHECK_CUDA_ERROR(cudaGetLastError());
378 | #endif
379 |     }
380 |     // Releases what the constructor acquired: the host flag array, the decoder object and the device slab.
381 |     template <OperationType OpType_>
382 |     DecodingSampling<OpType_>::~DecodingSampling()
383 |     {
384 |         delete[] h_finished_buf_; // host array created with new bool[...] in the constructor
385 |         delete decoder_;          // OpenDecoder created with new in the constructor
386 |         allocator_.free(buf_);    // the single slab obtained from allocator_.malloc
387 |     }
388 |     // Explicit instantiations: this translation unit provides the FP32 and FP16 variants of the sampler and of its init-parameter type.
389 |     template class DecodingSampling<OperationType::FP32>;
390 | 
391 |     template class DecodingSampling<OperationType::FP16>;
392 | 
393 |     template class DecodingInitParam<float>;
394 | 
395 |     template class DecodingInitParam<half>;
396 | }


--------------------------------------------------------------------------------
/fasterLlama/cuda/open_decoder.cu:
--------------------------------------------------------------------------------
  1 | #include "open_decoder.h"
  2 | #include "decoder_kernels.cuh"
  3 | #include "utils.h"
  4 | #include <cstdlib>
  5 | 
  6 | namespace FasterLLaMA
  7 | {
  8 |     template <OperationType OpType_, OperationType QuantizationType>
  9 |     OpenDecoder<OpType_, QuantizationType>::OpenDecoder(int batch_size, int max_prompt_len, int max_gen_len,
 10 |                                                         int head_num, int size_per_head, int ffn_hidden_units)
 11 |         : batch_size_(batch_size), max_prompt_len_(max_prompt_len), max_gen_len_(max_gen_len),
 12 |           head_num_(head_num), size_per_head_(size_per_head), ffn_hidden_units_(ffn_hidden_units)
 13 |     {
 14 | #ifndef NDEBUG
 15 |         PRINT_FUNC_NAME_();
 16 | #endif
 17 |         // Derived sizes: the full sequence budget (prompt + generation) and the model width (heads * per-head size).
 18 |         total_len_ = max_prompt_len_ + max_gen_len_;
 19 |         hidden_units_ = head_num_ * size_per_head_;
 20 |         // All five GEMM algorithm slots start at -1, i.e. CUBLAS_GEMM_DEFAULT.
 21 |         for (int slot = 0; slot != 5; ++slot)
 22 |             cublasAlgo_[slot] = -1;
 23 |     }
 24 |     // Stores one layer's parameters and splits the caller-provided workspace `buf` into this decoder's scratch buffers, laid out back to back.
 25 |     template <OperationType OpType_, OperationType QuantizationType>
 26 |     void OpenDecoder<OpType_, QuantizationType>::initialize(DecoderInitParam<DataType_, weight_DataType_> param, char *buf)
 27 |     {
 28 | #ifndef NDEBUG
 29 |         PRINT_FUNC_NAME_();
 30 | #endif
 31 |         const int token_slots = batch_size_ * max_prompt_len_;                       // one entry per (batch, prompt position)
 32 |         const int hidden_elems = token_slots * head_num_ * size_per_head_;           // elements of a [token, hidden] buffer
 33 |         const int reuse_elems = token_slots * max(ffn_hidden_units_, hidden_units_); // large enough for either width
 34 |         param_ = param;
 35 |         from_tensor_int8_buf_ = reinterpret_cast<int8_t *>(buf);
 36 |         from_tensor_scale_buf_ = reinterpret_cast<float *>(from_tensor_int8_buf_ + hidden_elems);
 37 |         query_buf_ = reinterpret_cast<int32_t *>(from_tensor_scale_buf_ + token_slots);
 38 |         key_buf_ = reinterpret_cast<int32_t *>(query_buf_ + reuse_elems);
 39 |         value_buf_ = reinterpret_cast<int32_t *>(key_buf_ + reuse_elems);
 40 |         query_out_buf_ = reinterpret_cast<float *>(value_buf_ + hidden_elems);
 41 |         key_out_buf_ = reinterpret_cast<float *>(query_out_buf_ + hidden_elems);
 42 |         value_out_fp_buf_ = reinterpret_cast<float *>(key_out_buf_ + hidden_elems);
 43 |         qk_buf_ = reinterpret_cast<float *>(value_out_fp_buf_ + hidden_elems);
 44 |         qkv_buf_ = reinterpret_cast<float *>(qk_buf_ + batch_size_ * head_num_ * max_prompt_len_ * total_len_);
 45 |         ffn_tensor_buf_ = reinterpret_cast<DataType_ *>(qkv_buf_ + hidden_elems);
 46 |         ffn_inter_scale_buf_ = reinterpret_cast<float *>(ffn_tensor_buf_ + hidden_elems);
 47 | #ifndef NDEBUG
 48 |         cudaDeviceSynchronize();
 49 |         CHECK_CUDA_ERROR(cudaGetLastError());
 50 | #endif
 51 |     }
 52 |     // Returns the byte size of the workspace: the sum of the sizes of the scratch buffers that initialize() carves out of `buf`.
 53 |     template <OperationType OpType_, OperationType QuantizationType>
 54 |     int OpenDecoder<OpType_, QuantizationType>::getWorkspaceSize()
 55 |     {
 56 | #ifndef NDEBUG
 57 |         PRINT_FUNC_NAME_();
 58 | #endif
 59 |         const int hidden_elems = batch_size_ * max_prompt_len_ * hidden_units_;
 60 |         const int reuse_elems = batch_size_ * max_prompt_len_ * max(ffn_hidden_units_, hidden_units_);
 61 |         size_t total_bytes = sizeof(int8_t) * hidden_elems;                                    // from_tensor_int8_buf_
 62 |         total_bytes += sizeof(float) * 4 * hidden_elems;                                       // query_out, key_out, value_out_fp, qkv
 63 |         total_bytes += sizeof(int32_t) * (hidden_elems + reuse_elems * 2);                     // value_buf_ plus query_buf_ and key_buf_
 64 |         total_bytes += sizeof(float) * 2 * batch_size_ * max_prompt_len_;                      // the two scale buffers
 65 |         total_bytes += sizeof(DataType_) * hidden_elems;                                       // ffn_tensor_buf_
 66 |         total_bytes += sizeof(float) * batch_size_ * head_num_ * max_prompt_len_ * total_len_; // qk_buf_
 67 |         return static_cast<int>(total_bytes);
 68 |     }
 69 | 
 70 |     /**
 71 |      * key_cache_ value_cache_: cache_size, [batch_size, head_num, total_len_, size_per_head]
 72 |      * freq_cis_: [max_prompt_len_, size_per_head]
 73 |      */
 74 |     template <OperationType OpType_, OperationType QuantizationType>
 75 |     void OpenDecoder<OpType_, QuantizationType>::forward(const DataType_ *from_tensor, const float *freq_cis, float *key_cache_,
 76 |                                                          float *value_cache_, DataType_ *decoder_output, const int start_pos,
 77 |                                                          const int seq_len)
 78 |     {
 79 | #ifndef NDEBUG
 80 |         PRINT_FUNC_NAME_();
 81 | #endif
 82 |         typedef typename weight_Traits_::AlphaType weight_AlphaType;
 83 |         typedef typename qkv_Traits_::AlphaType qkv_AlphaType;
 84 |         const weight_AlphaType weight_alpha = 1;
 85 |         const weight_AlphaType weight_beta = 0;
 86 |         const qkv_AlphaType qkv_alpha = 1.0f;
 87 |         const qkv_AlphaType qkv_beta = 0.0f;
 88 |         try
 89 |         {
 90 |             /* masked multi-head attention */
 91 |             /* ResNorm-Quantized(from_tensor) -> from_tensor_int8_buf_ and from_tensor_scale_buf_ */
 92 |             launchResNormQuantizedKernel<DataType_>(from_tensor_int8_buf_, from_tensor, param_.attn_resnorm.gamma, from_tensor_scale_buf_,
 93 |                                                     param_.attn_resnorm.eps, batch_size_ * seq_len, hidden_units_, param_.stream);
 94 | 
 95 | #ifndef NDEBUG
 96 |             cudaDeviceSynchronize();
 97 |             CHECK_CUDA_ERROR(cudaGetLastError());
 98 | #endif
 99 | 
100 |             /* Q\K\V gemm(from_tensor_int8_buf_) -> query_buf_、key_buf_、value_buf_ */
101 |             int m = batch_size_ * seq_len;
102 |             int n = hidden_units_;
103 |             int k = hidden_units_;
104 | 
105 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
106 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
107 |                                              n, m, k,
108 |                                              &weight_alpha,
109 |                                              param_.attention.query_weight.kernel, weight_Traits_::AType, n,
110 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
111 |                                              &weight_beta,
112 |                                              query_buf_, weight_Traits_::CType, n,
113 |                                              weight_Traits_::computeType,
114 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
115 | 
116 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
117 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
118 |                                              n, m, k,
119 |                                              &weight_alpha,
120 |                                              param_.attention.key_weight.kernel, weight_Traits_::AType, n,
121 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
122 |                                              &weight_beta,
123 |                                              key_buf_, weight_Traits_::CType, n,
124 |                                              weight_Traits_::computeType,
125 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
126 | 
127 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
128 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
129 |                                              n, m, k,
130 |                                              &weight_alpha,
131 |                                              param_.attention.value_weight.kernel, weight_Traits_::AType, n,
132 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
133 |                                              &weight_beta,
134 |                                              value_buf_, weight_Traits_::CType, n,
135 |                                              weight_Traits_::computeType,
136 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
137 | 
138 |             /**
139 |              * Q\K Quantized-rope-Quantized-Transpose
140 |              * query_buf_, key_buf_ -> query_out_buf_, key_out_buf_
141 |              */
142 |             launchQKRoteEmbeddingTranspose(query_out_buf_, key_out_buf_, query_buf_, key_buf_, from_tensor_scale_buf_,
143 |                                            from_tensor_scale_buf_, param_.attention.query_weight.weight_scale,
144 |                                            param_.attention.key_weight.weight_scale,
145 |                                            freq_cis, batch_size_, seq_len, start_pos, total_len_, head_num_, size_per_head_,
146 |                                            param_.stream);
147 | 
148 | #ifndef NDEBUG
149 |             cudaDeviceSynchronize();
150 |             CHECK_CUDA_ERROR(cudaGetLastError());
151 | #endif
152 | 
153 |             /**
154 |              * Dequantized V Transpose
155 |              * value_buf_ -> value_out_fp_buf_
156 |              */
157 |             launchDequantizedVTransposeKernel(value_out_fp_buf_, value_buf_, from_tensor_scale_buf_, param_.attention.value_weight.weight_scale,
158 |                                               batch_size_, seq_len, head_num_, size_per_head_, param_.stream);
159 | 
160 | #ifndef NDEBUG
161 |             cudaDeviceSynchronize();
162 |             CHECK_CUDA_ERROR(cudaGetLastError());
163 | #endif
164 | 
165 |             /**
166 |              * Store K\V in cache
167 |              * k_cache v_cache: [batch_size, head_num, total_len_, size_per_head]
168 |              * store k\v [batch_size, head_num, seq_len, size_per_head] to [batch_size, head_num, start_pos:start_pos+seq_len, size_per_head]
169 |              */
170 |             launchStoreKVcacheKernel(key_cache_, value_cache_, key_out_buf_, value_out_fp_buf_, start_pos, seq_len, batch_size_,
171 |                                      head_num_, total_len_, size_per_head_, param_.stream);
172 | 
173 | #ifndef NDEBUG
174 |             cudaDeviceSynchronize();
175 |             CHECK_CUDA_ERROR(cudaGetLastError());
176 | #endif
177 | 
178 |             // prompt 阶段，此时 qk 乘法为 gemm
179 |             if (seq_len > 1)
180 |             {
181 |                 CHECK_CUBLAS_STATUS(cublasGemmStridedBatchedEx(param_.cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
182 |                                                                seq_len, seq_len, size_per_head_,
183 |                                                                &qkv_alpha,
184 |                                                                key_out_buf_, qkv_Traits_::AType, size_per_head_, seq_len * size_per_head_,
185 |                                                                query_out_buf_, qkv_Traits_::BType, size_per_head_, seq_len * size_per_head_,
186 |                                                                &qkv_beta,
187 |                                                                qk_buf_, qkv_Traits_::CType, seq_len, seq_len * seq_len,
188 |                                                                batch_size_ * head_num_,
189 |                                                                qkv_Traits_::computeType,
190 |                                                                static_cast<cublasGemmAlgo_t>(cublasAlgo_[1])));
191 |             }
192 |             else
193 |             { // generation 阶段，此时 qk 乘法为 gemv
194 |                 CHECK_CUBLAS_STATUS(cublasSgemvStridedBatched(param_.cublas_handle, CUBLAS_OP_T,
195 |                                                               seq_len + start_pos, size_per_head_,
196 |                                                               &qkv_alpha,
197 |                                                               key_cache_, size_per_head_, total_len_ * size_per_head_,
198 |                                                               query_out_buf_, 1, size_per_head_,
199 |                                                               &qkv_beta,
200 |                                                               qk_buf_, 1, size_per_head_,
201 |                                                               batch_size_ * head_num_));
202 |             }
203 | 
204 |             /**
205 |              * softmax
206 |              */
207 |             launchBlockSoftmaxKernel(qk_buf_, param_.attn_mask + start_pos * total_len_, batch_size_, head_num_, seq_len,
208 |                                      seq_len + start_pos, total_len_, rsqrtf(static_cast<float>(size_per_head_)), param_.stream);
209 | 
210 | #ifndef NDEBUG
211 |             cudaDeviceSynchronize();
212 |             CHECK_CUDA_ERROR(cudaGetLastError());
213 | #endif
214 | 
215 |             // prompt 阶段，此时 qk*v 乘法为 gemm
216 |             if (seq_len > 1)
217 |             {
218 |                 CHECK_CUBLAS_STATUS(cublasGemmStridedBatchedEx(param_.cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
219 |                                                                size_per_head_, seq_len, seq_len,
220 |                                                                &qkv_alpha,
221 |                                                                value_out_fp_buf_, qkv_Traits_::AType, size_per_head_, seq_len * size_per_head_,
222 |                                                                qk_buf_, qkv_Traits_::BType, seq_len, seq_len * seq_len,
223 |                                                                &qkv_beta,
224 |                                                                qkv_buf_, qkv_Traits_::CType, size_per_head_, seq_len * size_per_head_,
225 |                                                                batch_size_ * head_num_,
226 |                                                                qkv_Traits_::computeType,
227 |                                                                static_cast<cublasGemmAlgo_t>(cublasAlgo_[1])));
228 |             }
229 |             else
230 |             { // generation 阶段，此时 qk*v 乘法为 gemv
231 |                 CHECK_CUBLAS_STATUS(cublasSgemvStridedBatched(param_.cublas_handle, CUBLAS_OP_N,
232 |                                                               size_per_head_, seq_len + start_pos,
233 |                                                               &qkv_alpha,
234 |                                                               value_cache_, size_per_head_, total_len_ * size_per_head_,
235 |                                                               qk_buf_, 1, size_per_head_,
236 |                                                               &qkv_beta,
237 |                                                               qkv_buf_, 1, size_per_head_,
238 |                                                               batch_size_ * head_num_));
239 |             }
240 | 
241 |             /**
242 |              * quantized qkv to int8 and transpose from [batch_size, head_num, seq_len, size_per_head]
243 |              * to [batch_size, seq_len, hidden_units]
244 |              */
245 |             launchAttnQuantizedTransposeKernel(from_tensor_int8_buf_, qkv_buf_, from_tensor_scale_buf_, batch_size_, head_num_, seq_len,
246 |                                                size_per_head_, param_.stream);
247 | 
248 | #ifndef NDEBUG
249 |             cudaDeviceSynchronize();
250 |             CHECK_CUDA_ERROR(cudaGetLastError());
251 | #endif
252 | 
253 |             /**
254 |              * project gemm, Reuse the query_buf_ as attn_out_buf_
255 |              */
256 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
257 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
258 |                                              n, m, k,
259 |                                              &weight_alpha,
260 |                                              param_.attention.attention_output_weight.kernel, weight_Traits_::AType, n,
261 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
262 |                                              &weight_beta,
263 |                                              query_buf_, weight_Traits_::CType, n,
264 |                                              weight_Traits_::computeType,
265 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
266 | 
267 |             /**
268 |              * query_buf_ -> dequantized & add Residual -> ffn_tensor_buf_, DataType_, [batch_size, seq_len, hidden_units]
269 |              * ffn_tensor_buf_ -> resNorm & quantized -> from_tensor_int8_buf_, int8, [batch_size, seq_len, hidden_units]
270 |              */
271 |             launchDequantizedResidualResNormQuantized<DataType_>(from_tensor_int8_buf_, ffn_tensor_buf_, from_tensor, query_buf_,
272 |                                                                  from_tensor_scale_buf_, param_.attention.attention_output_weight.weight_scale,
273 |                                                                  param_.ffn_resnorm.gamma, from_tensor_scale_buf_, param_.ffn_resnorm.eps,
274 |                                                                  batch_size_ * seq_len, hidden_units_, param_.stream);
275 | 
276 | #ifndef NDEBUG
277 |             cudaDeviceSynchronize();
278 |             CHECK_CUDA_ERROR(cudaGetLastError());
279 | #endif
280 | 
281 |             n = ffn_hidden_units_;
282 |             /**
283 |              * w1 gemm, Reuse the query_buf_ as w1_buf_
284 |              */
285 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
286 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
287 |                                              n, m, k,
288 |                                              &weight_alpha,
289 |                                              param_.ffn.w1_weight.kernel, weight_Traits_::AType, n,
290 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
291 |                                              &weight_beta,
292 |                                              query_buf_, weight_Traits_::CType, n,
293 |                                              weight_Traits_::computeType,
294 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
295 | 
296 |             /**
297 |              * w3 gemm, Reuse the key_buf_ as w3_buf_
298 |              */
299 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
300 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
301 |                                              n, m, k,
302 |                                              &weight_alpha,
303 |                                              param_.ffn.w3_weight.kernel, weight_Traits_::AType, n,
304 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
305 |                                              &weight_beta,
306 |                                              key_buf_, weight_Traits_::CType, n,
307 |                                              weight_Traits_::computeType,
308 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
309 | 
310 |             /**
311 |              * dequantized query_buf_ to w1_out
312 |              * dequantized key_buf__ & silu to w3_out
313 |              * pointwise-multiply (w1_out, w3_out) to w13_out
314 |              * quantized w13_out to from_tensor_int8_buf_, ffn_inter_scale_buf_
315 |              */
316 |             launchDequantizedSiluMultifyQuantized(from_tensor_int8_buf_, query_buf_, from_tensor_scale_buf_, param_.ffn.w1_weight.weight_scale,
317 |                                                   key_buf_, param_.ffn.w3_weight.weight_scale, ffn_inter_scale_buf_,
318 |                                                   batch_size_ * seq_len, ffn_hidden_units_, param_.stream);
319 | 
320 | #ifndef NDEBUG
321 |             cudaDeviceSynchronize();
322 |             CHECK_CUDA_ERROR(cudaGetLastError());
323 | #endif
324 | 
325 |             k = ffn_hidden_units_;
326 |             n = hidden_units_;
327 |             /**
328 |              * w2 gemm, Reuse the value_buf_ as w2_buf_
329 |              */
330 |             CHECK_CUBLAS_STATUS(cublasGemmEx(param_.cublas_handle,
331 |                                              CUBLAS_OP_N, CUBLAS_OP_N,
332 |                                              n, m, k,
333 |                                              &weight_alpha,
334 |                                              param_.ffn.w2_weight.kernel, weight_Traits_::AType, n,
335 |                                              from_tensor_int8_buf_, weight_Traits_::BType, k,
336 |                                              &weight_beta,
337 |                                              value_buf_, weight_Traits_::CType, n,
338 |                                              weight_Traits_::computeType,
339 |                                              static_cast<cublasGemmAlgo_t>(cublasAlgo_[0])));
340 | 
341 |             /**
342 |              * dequantized value_buf_ to w2_out used ffn_inter_scale_buf_ and weight_scale
343 |              * add Residual: w2_out + ffn_tensor_buf_ -> decoder_output
344 |              */
345 |             launchDequantizedResidual<DataType_>(decoder_output, ffn_tensor_buf_, value_buf_, ffn_inter_scale_buf_,
346 |                                                  param_.ffn.w2_weight.weight_scale, batch_size_ * seq_len, hidden_units_, param_.stream);
347 | 
348 | #ifndef NDEBUG
349 |             cudaDeviceSynchronize();
350 |             CHECK_CUDA_ERROR(cudaGetLastError());
351 | #endif
352 |         }
353 | 
354 |         catch (std::runtime_error &error)
355 |         {
356 |             throw error;
357 |         }
358 |     }
359 | 
360 |     template <OperationType OpType_, OperationType QuantizationType>
361 |     OpenDecoder<OpType_, QuantizationType>::~OpenDecoder()
362 |     {
363 |         // Detach every scratch pointer; nothing is released here.
364 |         // NOTE(review): the buffers presumably live in the workspace handed to initialize() -- confirm.
365 | 
366 |         // int8_t / int32_t buffers
367 |         from_tensor_int8_buf_ = nullptr;
368 |         query_buf_ = key_buf_ = value_buf_ = nullptr;
369 | 
370 |         // float buffers
371 |         from_tensor_scale_buf_ = ffn_inter_scale_buf_ = nullptr;
372 |         query_out_buf_ = key_out_buf_ = value_out_fp_buf_ = nullptr;
373 |         qk_buf_ = qkv_buf_ = nullptr;
374 |         ffn_tensor_buf_ = nullptr; // DataType_ buffer
375 |     }
376 | 
377 |     template class OpenDecoder<OperationType::FP32, OperationType::INT8>; // explicit instantiation: OpType_ = FP32, QuantizationType = INT8
378 | 
379 |     template class OpenDecoder<OperationType::FP16, OperationType::INT8>; // explicit instantiation: OpType_ = FP16, QuantizationType = INT8
380 | 
381 |     template class DecoderInitParam<float, int8_t>; // explicit instantiation: T = float, WeightType = int8_t
382 | 
383 |     template class DecoderInitParam<half, int8_t>; // explicit instantiation: T = half, WeightType = int8_t
384 | 
385 | } // namespace FasterLLaMA


--------------------------------------------------------------------------------
/fasterLlama/cuda/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <stdio.h>
 3 | #include <iostream>
 4 | #include <cublas_v2.h>
 5 | #include <cstdlib>
 6 | 
 7 | // Maps a cublasStatus_t value to the spelling of its enumerator.
 8 | // Any value that is not listed in the table below yields "<unknown>".
 9 | // The returned pointer refers to a string literal.
10 | static const char *_cudaGetErrorEnum(cublasStatus_t error)
11 | {
12 |     // One row per recognised status: the enumerator and its name as text.
13 |     static const struct
14 |     {
15 |         cublasStatus_t status;
16 |         const char *name;
17 |     } kStatusNames[] = {
18 |         {CUBLAS_STATUS_SUCCESS, "CUBLAS_STATUS_SUCCESS"},
19 |         {CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS_STATUS_NOT_INITIALIZED"},
20 |         {CUBLAS_STATUS_ALLOC_FAILED, "CUBLAS_STATUS_ALLOC_FAILED"},
21 |         {CUBLAS_STATUS_INVALID_VALUE, "CUBLAS_STATUS_INVALID_VALUE"},
22 |         {CUBLAS_STATUS_ARCH_MISMATCH, "CUBLAS_STATUS_ARCH_MISMATCH"},
23 |         {CUBLAS_STATUS_MAPPING_ERROR, "CUBLAS_STATUS_MAPPING_ERROR"},
24 |         {CUBLAS_STATUS_EXECUTION_FAILED, "CUBLAS_STATUS_EXECUTION_FAILED"},
25 |         {CUBLAS_STATUS_INTERNAL_ERROR, "CUBLAS_STATUS_INTERNAL_ERROR"},
26 |         {CUBLAS_STATUS_NOT_SUPPORTED, "CUBLAS_STATUS_NOT_SUPPORTED"},
27 |         {CUBLAS_STATUS_LICENSE_ERROR, "CUBLAS_STATUS_LICENSE_ERROR"},
28 |     };
29 |     const size_t kStatusCount = sizeof(kStatusNames) / sizeof(kStatusNames[0]);
30 | 
31 |     // Linear scan: the table is tiny and this only runs on the error path.
32 |     for (size_t idx = 0; idx < kStatusCount; ++idx)
33 |     {
34 |         if (kStatusNames[idx].status == error)
35 |         {
36 |             return kStatusNames[idx].name;
37 |         }
38 |     }
39 | 
40 |     // No row matched the given status value.
41 |     return "<unknown>";
42 | }
43 | // Writes "[FL][CALL] ", the enclosing function's name (__FUNCTION__), a space and std::endl to std::cout.
44 | #define PRINT_FUNC_NAME_()                                              \
45 |     do                                                                  \
46 |     {                                                                   \
47 |         std::cout << "[FL][CALL] " << __FUNCTION__ << " " << std::endl; \
48 |     } while (0)
49 | // Evaluates a CUDA runtime call; on any status other than cudaSuccess prints a report to stdout and calls exit(1).
50 | #define CHECK_CUDA_ERROR(call)                                                          \
51 |     do                                                                                  \
52 |     {                                                                                   \
53 |         const cudaError_t cudaStatus_ = (call);                                         \
54 |         if (cudaSuccess != cudaStatus_)                                                 \
55 |         {                                                                               \
56 |             printf("CUDA Error:\n"                                                      \
57 |                    "    File:   %s\n"                                                   \
58 |                    "    Line:   %d\n"                                                   \
59 |                    "    Error code:     %d\n"                                           \
60 |                    "    Error text:     %s\n",                                          \
61 |                    __FILE__, __LINE__, cudaStatus_, cudaGetErrorString(cudaStatus_));   \
62 |             exit(1);                                                                    \
63 |         }                                                                               \
64 |     } while (0)
65 | // Evaluates a cuBLAS call; on any status other than CUBLAS_STATUS_SUCCESS prints a report to stdout and calls exit(1).
66 | #define CHECK_CUBLAS_STATUS(call)                                 \
67 |     do                                                            \
68 |     {                                                             \
69 |         const cublasStatus_t statusCode = (call);                 \
70 |         if (statusCode != CUBLAS_STATUS_SUCCESS)                  \
71 |         {                                                         \
72 |             printf("cuBLAS Error:\n");                            \
73 |             printf("    File:   %s\n", __FILE__);                 \
74 |             printf("    Line:   %d\n", __LINE__);                 \
75 |             printf("    Status code:     %d\n", statusCode);      \
76 |             printf("    Error text:     %s\n",                    \
77 |                    _cudaGetErrorEnum(statusCode));                \
78 |             exit(1);                                              \
79 |         }                                                         \
80 |     } while (0)
81 | 


--------------------------------------------------------------------------------
/fasterLlama/decoding_sampling.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "common.h"
  4 | #include "open_decoder.h"
  5 | #include "allocator.h"
  6 | 
  7 | namespace FasterLLaMA
  8 | {
  9 |     template <typename T>
 10 |     class DecodingInitParam // parameter bundle passed by value to DecodingSampling::forward()
 11 |     {
 12 |     public:
 13 |         T *embedding_table; // NOTE(review): presumably the token embedding matrix -- shape not visible here, confirm
 14 |         float *freq_cis;    // NOTE(review): presumably the table forwarded to OpenDecoder::forward() as freq_cis -- confirm
 15 | 
 16 |         // number of prompt tokens of each sequence in the batch, [batch_size]
 17 |         int *prompt_sequence_length;
 18 |         int min_prompt_seq_len;
 19 |         int max_prompt_seq_len;
 20 |         // prompt token ids, [batch_size, max_prompt_seq_len]
 21 |         int *prompt_tokens;
 22 |         // [batch_size, max_prompt_seq_len]; 0 for a pad token, 1 otherwise.
 23 |         bool *prompt_tokens_mask;
 24 | 
 25 |         ResNormWeight<T> decodingnorm;
 26 | 
 27 |         DenseWeight<T, T> output_weight; // both template arguments are T, i.e. this weight is not stored as a separate quantized type
 28 | 
 29 |         int *output_ids;      // NOTE(review): presumably written by forward() -- confirm
 30 |         int *sequence_length; // NOTE(review): presumably written by forward() -- confirm
 31 |         cublasHandle_t cublas_handle;
 32 |         cudaStream_t stream;
 33 |     };
 34 | 
 35 |     struct TransformerArguments // base of DecodingArguments; every field is a plain int
 36 |     {
 37 |         int batch_size_;
 38 |         int seq_len_;
 39 |         int head_num_;
 40 |         int size_per_head_;
 41 |         int hidden_units_; // NOTE(review): presumably head_num_ * size_per_head_ -- confirm where it is assigned
 42 |         int ffn_hidden_units_;
 43 |     };
 44 | 
 45 |     struct DecodingArguments : public TransformerArguments // base of DecodingSamplingArguments and DecodingBeamsearchArguments
 46 |     {
 47 |         int decoder_layers_;
 48 |         int vocab_size_;
 49 |         int start_id_;
 50 |         // index of the end-of-sequence (EOS) token
 51 |         int end_id_;
 52 |         int max_prompt_len_;
 53 |         int max_gen_len_;
 54 |     };
 55 | 
 56 |     struct DecodingSamplingArguments : public DecodingArguments // argument set stored by DecodingSampling (member args_)
 57 |     {
 58 |         // the k for top-k sampling
 59 |         int candidate_num_;
 60 |         // the p for top-p sampling
 61 |         float probability_threshold_;
 62 |         size_t temp_storage_size_; // NOTE(review): presumably the size of DecodingSampling::temp_storage_ -- confirm
 63 |     };
 64 | 
 65 |     struct DecodingBeamsearchArguments : public DecodingArguments // beam-search counterpart of DecodingSamplingArguments
 66 |     {
 67 |         int beam_width_;
 68 |         size_t temp_storage_size_;
 69 |         float beam_search_diversity_rate_;
 70 |     };
 71 | 
 72 |     template <OperationType OpType_>
 73 |     class DecodingSampling // decoding driver: holds an OpenDecoder<OpType_, OperationType::INT8> pointer plus scratch-buffer pointers
 74 |     {
 75 |     private:
 76 |         typedef DecoderTransformerTraits<OpType_> Traits_;
 77 |         typedef typename Traits_::DataType DataType_;
 78 |         const IAllocator &allocator_; // stored as a reference, so the allocator must outlive this object
 79 |         struct DecodingSamplingArguments args_;
 80 | 
 81 |         const cublasComputeType_t computeType_ = Traits_::computeType;
 82 |         const cudaDataType_t AType_ = Traits_::AType;
 83 |         const cudaDataType_t BType_ = Traits_::BType;
 84 |         const cudaDataType_t CType_ = Traits_::CType;
 85 |         int cublasAlgo_[1] = {20}; // NOTE(review): 20 is presumably a cublasGemmAlgo_t value -- confirm which algorithm
 86 | 
 87 |         OpenDecoder<OpType_, OperationType::INT8> *decoder_;
 88 |         float *K_cache_;
 89 |         float *V_cache_;
 90 |         DataType_ *from_tensor_[2];
 91 |         char *decoder_buf_; // NOTE(review): presumably the workspace passed to OpenDecoder::initialize(param, buf) -- confirm
 92 |         DataType_ *decoder_normed_result_buf_;
 93 |         float *logits_buf_;
 94 |         float *step_logits_buf_;
 95 |         int *word_ids_buf_;
 96 |         bool *finished_buf_;
 97 |         int *topk_ids_buf_;
 98 |         float *topk_val_buf_;
 99 |         void *buf_;
100 |         bool *h_finished_buf_; // NOTE(review): the h_ prefix suggests a host-side copy of finished_buf_ -- confirm
101 |         // is initialized by [[0, 1, ..., vocab_size-1], [0, 1, ..., vocab_size-1], ..., [0, 1, ..., vocab_size-1]]
102 |         int *topp_id_vals_buf_;
103 |         float *topp_sorted_logits_prob_buf_;
104 |         int *topp_sorted_id_vals_buf_;
105 |         // is initialized by [0, vocab_size, ..., batch_size * vocab_size]
106 |         int *topp_offset_buf_;
107 | 
108 |         void *temp_storage_;
109 | 
110 |     public:
111 |         DecodingSampling(const IAllocator &allocator, const int batch_size,
112 |                          const int max_prompt_len, const int max_gen_len,
113 |                          const int head_num, const int size_per_head,
114 |                          const int vocab_size, const int decoder_layers,
115 |                          const int end_id, const int ffn_hidden_units,
116 |                          const int candidate_num = 0, const float probability_threshold = 0.0); // sampling knobs default to 0 / 0.0
117 | 
118 |         void forward(const DecoderInitParam<DataType_, int8_t> *param,
119 |                      DecodingInitParam<DataType_> decoding_params); // param is a pointer (NOTE(review): presumably one entry per decoder layer -- confirm); decoding_params is taken by value
120 |        
121 | 
122 |         virtual ~DecodingSampling();
123 |     };
124 | 
125 | } // namespace FasterLLaMA
126 | 


--------------------------------------------------------------------------------
/fasterLlama/lib/libfldecoderkernel.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caiwanxianhust/FasterLLaMA/05d995c04e9498c435f97e5de92daa3ff64d0f47/fasterLlama/lib/libfldecoderkernel.so


--------------------------------------------------------------------------------
/fasterLlama/lib/libfldecodersampling.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caiwanxianhust/FasterLLaMA/05d995c04e9498c435f97e5de92daa3ff64d0f47/fasterLlama/lib/libfldecodersampling.so


--------------------------------------------------------------------------------
/fasterLlama/lib/libfldecodingkernel.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caiwanxianhust/FasterLLaMA/05d995c04e9498c435f97e5de92daa3ff64d0f47/fasterLlama/lib/libfldecodingkernel.so


--------------------------------------------------------------------------------
/fasterLlama/lib/libflopendecoder.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caiwanxianhust/FasterLLaMA/05d995c04e9498c435f97e5de92daa3ff64d0f47/fasterLlama/lib/libflopendecoder.so


--------------------------------------------------------------------------------
/fasterLlama/open_decoder.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.h"
 4 | 
 5 | namespace FasterLLaMA
 6 | {
 7 |     
 8 |     template <typename T, typename WeightType>
 9 |     class DecoderInitParam // weights, mask and CUDA handles consumed by OpenDecoder (stored there as param_)
10 |     {
11 |     public:
12 |         /* weights for transformer */
13 |         ResNormWeight<T> attn_resnorm;
14 |         AttentionWeight<T, WeightType> attention;
15 | 
16 |         float *attn_mask; // OpenDecoder::forward() offsets this by start_pos * total_len_ before the softmax
17 | 
18 |         ResNormWeight<T> ffn_resnorm;
19 |         FFNWeight<T, WeightType> ffn;
20 | 
21 |         cublasHandle_t cublas_handle;
22 |         cudaStream_t stream;
23 |     };
24 | 
25 |     
26 |     template <OperationType OpType_, OperationType QuantizationType>
27 |     class OpenDecoder // runs the attention + FFN pipeline in forward(); OpType_ picks the activation type, QuantizationType the weight type
28 |     {
29 |     private:
30 |         typedef DecoderTransformerTraits<OpType_> Traits_;
31 |         typedef DecoderTransformerTraits<OperationType::FP32> qkv_Traits_; // traits of the attention score/value GEMMs: always FP32, independent of OpType_
32 |         typedef DecoderTransformerTraits<QuantizationType> weight_Traits_; // traits of the weight GEMMs (the cublasGemmEx calls in forward())
33 | 
34 |         typedef typename Traits_::DataType DataType_;
35 |         typedef typename weight_Traits_::DataType weight_DataType_;
36 |         DecoderInitParam<DataType_, weight_DataType_> param_;
37 | 
38 |         int cublasAlgo_[5]; // algorithm ids; forward() casts entries to cublasGemmAlgo_t
39 | 
40 |         int batch_size_;
41 |         int max_prompt_len_;
42 |         int max_gen_len_;
43 |         int total_len_; // NOTE(review): presumably max_prompt_len_ + max_gen_len_ -- confirm in the constructor
44 |         int head_num_;
45 |         int size_per_head_;
46 |         int hidden_units_;
47 |         int ffn_hidden_units_;
48 | 
49 |         /*  buf_size = batch_size * max_prompt_len_ * head_num * size_per_head
50 |             cache_size = batch_size * head_num * total_len_ * size_per_head
51 |          */
52 |         int8_t *from_tensor_int8_buf_; // buf_size, [batch_size * seq_len, head_num * size_per_head]
53 |         float *from_tensor_scale_buf_; // batch_size * max_prompt_len_, [batch_size, seq_len]
54 |         int32_t *query_buf_;           // [batch_size * seq_len, max(hidden_units, ffn_hidden_units)]
55 |         int32_t *key_buf_;             // [batch_size * seq_len, max(hidden_units, ffn_hidden_units)]
56 |         int32_t *value_buf_;           // buf_size, [batch_size * seq_len, head_num * size_per_head]
57 |         float *query_out_buf_;         // buf_size, [batch_size, head_num, seq_len, size_per_head]
58 |         float *key_out_buf_;           // buf_size, [batch_size, head_num, seq_len, size_per_head]
59 |         float *value_out_fp_buf_;      // buf_size, [batch_size, head_num, seq_len, size_per_head]
60 |         float *qk_buf_;                // [batch_size * head_num, seq_len, total_len_]
61 |         float *qkv_buf_;               // buf_size, [batch_size * head_num, seq_len, size_per_head]
62 |         DataType_ *ffn_tensor_buf_;    // buf_size, [batch_size, seq_len, head_num * size_per_head]
63 |         float *ffn_inter_scale_buf_;   // batch_size * max_prompt_len_, [batch_size, seq_len]
64 | 
65 |     public:
66 |         OpenDecoder(int batch_size, int max_prompt_len, int max_gen_len, int head_num, int size_per_head, int ffn_hidden_units);
67 | 
68 |         void initialize(DecoderInitParam<DataType_, weight_DataType_> param, char *buf); // NOTE(review): presumably stores param and carves the *_buf_ pointers out of buf -- confirm
69 | 
70 |         int getWorkspaceSize(); // NOTE(review): presumably the size of the buffer initialize() expects; the unit is not visible here
71 | 
72 |         /**
73 |          * key_cache_ value_cache_: cache_size, [batch_size, head_num, total_len_, size_per_head]
74 |          * freq_cis_: [max_prompt_len_, size_per_head]
75 |          */
76 |         void forward(const DataType_ *from_tensor, const float *freq_cis, float *key_cache_, float *value_cache_,
77 |                      DataType_ *decoder_output, const int start_pos, const int seq_len);
78 | 
79 |         ~OpenDecoder(); // only resets the buffer pointers to nullptr; it frees nothing
80 |     };
81 | 
82 | } // namespace FasterLLaMA
83 | 


--------------------------------------------------------------------------------
/samples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Sample executables exercising the FasterLLaMA decoding library.
 2 | 
 3 | # Header search paths required by the sample sources.
 4 | set(SAMPLES_HEADER_DIR
 5 |     "${PROJECT_SOURCE_DIR}/fasterLlama"
 6 |     "${PROJECT_SOURCE_DIR}/fasterLlama/cuda"
 7 | )
 8 | 
 9 | # Directory searched for fldecodersampling at link time. Kept directory-scoped
10 | # (and ahead of the targets) so that no newer CMake command is required.
11 | link_directories(
12 |     "${PROJECT_SOURCE_DIR}/fasterLlama/lib"
13 | )
14 | 
15 | message(STATUS "Assign include directories (include_directories=${SAMPLES_HEADER_DIR})")
16 | 
17 | # fl_add_sample(<target> <source>)
18 | #   Builds one sample executable from a single source file, applying the
19 | #   include paths, C++14 requirement and link libraries shared by all samples.
20 | #   Everything is PRIVATE: nothing links against an executable.
21 | function(fl_add_sample target_name source_file)
22 |     add_executable(${target_name} "${source_file}")
23 |     target_include_directories(${target_name} PRIVATE ${SAMPLES_HEADER_DIR})
24 |     target_compile_features(${target_name} PRIVATE cxx_std_14)
25 |     target_link_libraries(${target_name} PRIVATE -lcublas -lcudart fldecodersampling)
26 | endfunction()
27 | 
28 | fl_add_sample(fasterllama_fp32 llama_fp32.cu)
29 | fl_add_sample(fasterllama_fp16 llama_fp16.cu)
30 | 


--------------------------------------------------------------------------------
/samples/llama_fp16.cu:
--------------------------------------------------------------------------------
  1 | #include "decoding_sampling.h"
  2 | #include <cstdio>
  3 | #include <curand_kernel.h>
  4 | #include <cstdlib>
  5 | #include <cuda_runtime.h>
  6 | #include <cfloat>
  7 | #include <cuda_fp16.h>
  8 | #include <utils.h>
  9 | 
 10 | template <typename T>
 11 | void printVecInVec(const T *clusters, const int nrows, const int ncols, const int end_row, const int end_col, const char *str)
 12 | {
 13 |     // Prints the leading end_row x end_col corner of a row-major matrix whose row stride is ncols (nrows is not used).
 14 |     printf("%s:\n[\n", str);
 15 |     for (int row = 0; row < end_row; ++row)
 16 |     {
 17 |         const T *row_ptr = clusters + row * ncols;
 18 |         printf("[");
 19 |         for (int col = 0; col < end_col; ++col)
 20 |             printf("%g  ", static_cast<float>(row_ptr[col]));
 21 |         printf("]\n");
 22 |     }
 23 |     printf("]\n");
 24 | }
 25 | // half specialisation: checks the requested window against the matrix size, then prints it row by row.
 26 | template <>
 27 | void printVecInVec(const half *clusters, const int nrows, const int ncols, const int end_row, const int end_col, const char *str)
 28 | {
 29 |     printf("%s:\n[\n", str);
 30 |     // A window of exactly nrows x ncols is valid; only a larger one would read past the matrix.
 31 |     const bool in_bounds = (end_row <= nrows) && (end_col <= ncols);
 32 |     if (!in_bounds)
 33 |         printf("invalid arguments!!!\nend_row > nrows or end_col > ncols\n");
 34 |     for (int i = 0; in_bounds && i < end_row; ++i) // no element is read when the window is out of range
 35 |     {
 36 |         printf("[");
 37 |         for (int j = 0; j < end_col; ++j)
 38 |             printf("%g  ", __half2float(clusters[i * ncols + j]));
 39 |         printf("]\n");
 40 |     }
 41 |     printf("]\n");
 42 | }
 43 | 
 44 | template <typename T>
 45 | void device_malloc(T **ptr, int size)
 46 | { // Allocates size elements on the device and fills them from host rand() values scaled into [0, 0.02) before the cast to T.
 47 |     CHECK_CUDA_ERROR(cudaMalloc(reinterpret_cast<void **>(ptr), sizeof(T) * size));
 48 |     T *const host = new T[size];
 49 |     for (T *it = host; it != host + size; ++it)
 50 |         *it = (T)((float)rand() / (RAND_MAX + 1.0) * 0.02);
 51 |     CHECK_CUDA_ERROR(cudaMemcpy(*ptr, host, sizeof(T) * size, cudaMemcpyHostToDevice));
 52 |     delete[] host;
 53 | }
 54 | // half specialization: same allocation and fill as the generic version, but each value is converted with __float2half.
 55 | template <>
 56 | void device_malloc(half **ptr, int size)
 57 | {
 58 |     CHECK_CUDA_ERROR(cudaMalloc((void **)ptr, sizeof(half) * size));
 59 |     half *tmp = new half[size]; // host staging buffer
 60 |     for (half *slot = tmp; slot != tmp + size; ++slot)
 61 |         *slot = __float2half((float)rand() / (RAND_MAX + 1.0) * 0.02);
 62 |     CHECK_CUDA_ERROR(cudaMemcpy(*ptr, tmp, sizeof(half) * size, cudaMemcpyHostToDevice));
 63 |     delete[] tmp;
 64 | }
 65 | // Releases a device allocation with cudaFree and hands the result to CHECK_CUDA_ERROR.
 66 | template <typename T>
 67 | void device_free(T *ptr)
 68 | {
 69 |     CHECK_CUDA_ERROR(cudaFree(ptr));
 70 |     return;
 71 | }
 72 | // Block blockIdx.x fills one row of a length x length mask: 0 where row > column, -FLT_MAX cast to T elsewhere.
 73 | template <typename T>
 74 | __global__ void initAttnMaskKernel(T *attn_mask, const int length)
 75 | {
 76 |     const int row = blockIdx.x;
 77 |     T *const row_out = attn_mask + row * length; // first element of this block's row
 78 |     const int step = blockDim.x;
 79 |     for (int col = threadIdx.x; col < length; col += step)
 80 |         row_out[col] = (row > col) ? 0.0f : (T)(-1 * FLT_MAX); // NOTE(review): strict '>' also masks the diagonal - confirm intended
 81 | }
 82 | // One thread per element: writes min_val + (curand draw modulo (max_val - min_val)) into each of the first length entries of mat.
 83 | __global__ void initIntVecKernel(int *mat, const int length, const int max_val, const int min_val)
 84 | {
 85 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 86 |     if (gid >= length)
 87 |         return; // threads past the end of the vector write nothing
 88 |     curandState_t rng;
 89 |     curand_init(0, gid, 0, &rng); // seed 0, sequence number = element index, offset 0
 90 |     const int span = max_val - min_val;
 91 |     const int offset = static_cast<int>(curand(&rng) % span);
 92 |     mat[gid] = offset + min_val;
 93 | }
 94 | // Allocates all device buffers via device_malloc, runs one DecodingSampling::forward pass, prints timing and token ids, then releases every resource.
 95 | template <typename T>
 96 | void decoding_sample(const int batch_size, const int candidate_num, const float probability_threshold, const int head_num,
 97 |                      const int size_per_head, const int vocab_size, const int max_prompt_len, const int max_gen_len,
 98 |                      const int decoder_layers, const int ffn_hidden_units)
 99 | {
100 |     const int hidden_units = head_num * size_per_head;
101 |     const int total_len = max_prompt_len + max_gen_len; // prompt tokens plus generated tokens
102 |     const int end_id = 2;
103 |     // One cuBLAS handle bound to one stream; both are stored in every layer's params and in decoding_params.
104 |     cublasHandle_t cublasHandle;
105 |     CHECK_CUBLAS_STATUS(cublasCreate(&cublasHandle));
106 | 
107 |     cudaStream_t stream;
108 |     CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
109 |     CHECK_CUBLAS_STATUS(cublasSetStream(cublasHandle, stream));
110 | 
111 |     FasterLLaMA::Allocator<FasterLLaMA::AllocatorType::CUDA> allocator(0);
112 |     FasterLLaMA::DecoderInitParam<T, int8_t> *param = new FasterLLaMA::DecoderInitParam<T, int8_t>[decoder_layers];
113 |     // Per-layer buffers: T-typed norm gammas, int8_t kernels and float weight_scale vectors.
114 |     for (int i = 0; i < decoder_layers; i++)
115 |     {
116 |         param[i].stream = stream;
117 |         param[i].cublas_handle = cublasHandle;
118 | 
119 |         T *d_attn_resnorm_gamma;
120 |         int8_t *d_self_Q_kernel, *d_self_K_kernel, *d_self_V_kernel, *d_self_output_kernel;
121 |         float *d_self_Q_kernel_scale, *d_self_K_kernel_scale, *d_self_V_kernel_scale, *d_self_output_kernel_scale;
122 |         float *d_attn_mask; // [total_len, total_len]
123 |         T *d_ffn_resnorm_gamma;
124 |         int8_t *d_ffn_kernel1, *d_ffn_kernel2, *d_ffn_kernel3;
125 |         float *d_ffn_kernel1_scale, *d_ffn_kernel2_scale, *d_ffn_kernel3_scale;
126 | 
127 |         device_malloc(&d_attn_resnorm_gamma, hidden_units);
128 | 
129 |         device_malloc(&d_self_Q_kernel, hidden_units * hidden_units);
130 |         device_malloc(&d_self_K_kernel, hidden_units * hidden_units);
131 |         device_malloc(&d_self_V_kernel, hidden_units * hidden_units);
132 |         device_malloc(&d_self_output_kernel, hidden_units * hidden_units);
133 |         device_malloc(&d_self_Q_kernel_scale, hidden_units);
134 |         device_malloc(&d_self_K_kernel_scale, hidden_units);
135 |         device_malloc(&d_self_V_kernel_scale, hidden_units);
136 |         device_malloc(&d_self_output_kernel_scale, hidden_units);
137 | 
138 |         // attn_mask is 0 in the lower triangle and -FLT_MAX elsewhere (filled by initAttnMaskKernel); built once and shared by all layers
139 |         if (i == 0)
140 |         {
141 |             device_malloc(&d_attn_mask, total_len * total_len);
142 |             initAttnMaskKernel<<<total_len, 256, 0, stream>>>(d_attn_mask, total_len);
143 |             CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // surface execution errors instead of discarding the status
144 |             CHECK_CUDA_ERROR(cudaGetLastError());
145 |             param[i].attn_mask = d_attn_mask;
146 |         }
147 |         else
148 |         {
149 |             param[i].attn_mask = param[i - 1].attn_mask; // reuse the buffer allocated for layer 0
150 |         }
151 | 
152 |         device_malloc(&d_ffn_resnorm_gamma, hidden_units);
153 | 
154 |         device_malloc(&d_ffn_kernel1, ffn_hidden_units * hidden_units);
155 |         device_malloc(&d_ffn_kernel1_scale, ffn_hidden_units);
156 |         device_malloc(&d_ffn_kernel2, ffn_hidden_units * hidden_units);
157 |         device_malloc(&d_ffn_kernel2_scale, hidden_units);
158 |         device_malloc(&d_ffn_kernel3, ffn_hidden_units * hidden_units);
159 |         device_malloc(&d_ffn_kernel3_scale, ffn_hidden_units);
160 | 
161 |         param[i].attn_resnorm.gamma = d_attn_resnorm_gamma;
162 |         param[i].attn_resnorm.eps = 1e-5f;
163 |         param[i].attention.query_weight.kernel = d_self_Q_kernel;
164 |         param[i].attention.query_weight.weight_scale = d_self_Q_kernel_scale;
165 |         param[i].attention.key_weight.kernel = d_self_K_kernel;
166 |         param[i].attention.key_weight.weight_scale = d_self_K_kernel_scale;
167 |         param[i].attention.value_weight.kernel = d_self_V_kernel;
168 |         param[i].attention.value_weight.weight_scale = d_self_V_kernel_scale;
169 |         param[i].attention.attention_output_weight.kernel = d_self_output_kernel;
170 |         param[i].attention.attention_output_weight.weight_scale = d_self_output_kernel_scale;
171 |         param[i].ffn_resnorm.gamma = d_ffn_resnorm_gamma;
172 |         param[i].ffn_resnorm.eps = 1e-5f;
173 |         param[i].ffn.w1_weight.kernel = d_ffn_kernel1;
174 |         param[i].ffn.w1_weight.weight_scale = d_ffn_kernel1_scale;
175 |         param[i].ffn.w2_weight.kernel = d_ffn_kernel2;
176 |         param[i].ffn.w2_weight.weight_scale = d_ffn_kernel2_scale;
177 |         param[i].ffn.w3_weight.kernel = d_ffn_kernel3;
178 |         param[i].ffn.w3_weight.weight_scale = d_ffn_kernel3_scale;
179 |     }
180 |     // Buffers that are not tied to a single decoder layer.
181 |     FasterLLaMA::DecodingInitParam<T> decoding_params;
182 | 
183 |     T *d_embedding_table;
184 |     float *d_freq_cis;
185 |     int *d_prompt_sequence_length;
186 |     int *d_prompt_tokens;
187 |     bool *d_prompt_tokens_mask;
188 |     T *d_decoding_resnorm_gamma;
189 |     T *d_output_weight_kernel;
190 |     int *d_output_ids;
191 |     int *d_sequence_lengths;
192 | 
193 |     device_malloc(&d_embedding_table, hidden_units * vocab_size);
194 |     device_malloc(&d_freq_cis, total_len * size_per_head);
195 |     device_malloc(&d_prompt_sequence_length, batch_size);
196 |     device_malloc(&d_prompt_tokens, max_prompt_len * batch_size);
197 |     device_malloc(&d_prompt_tokens_mask, max_prompt_len * batch_size);
198 |     device_malloc(&d_decoding_resnorm_gamma, hidden_units);
199 |     device_malloc(&d_output_weight_kernel, hidden_units * vocab_size);
200 |     device_malloc(&d_output_ids, batch_size * total_len);
201 |     device_malloc(&d_sequence_lengths, batch_size);
202 |     // Host-side prompt lengths and validity mask; copied to the device right after this loop.
203 |     int *h_prompt_sequence_length = new int[batch_size];
204 |     bool *h_prompt_tokens_mask = new bool[max_prompt_len * batch_size];
205 |     int min_prompt_seq_len = INT_MAX;
206 |     int max_prompt_seq_len = -1;
207 |     for (int i = 0; i < batch_size; i++)
208 |     {
209 |         h_prompt_sequence_length[i] = max_prompt_len - batch_size + (rand() % batch_size) + 1; // in [max_prompt_len - batch_size + 1, max_prompt_len]
210 |         min_prompt_seq_len = min(min_prompt_seq_len, h_prompt_sequence_length[i]);
211 |         max_prompt_seq_len = max(max_prompt_seq_len, h_prompt_sequence_length[i]);
212 |         for (int j = 0; j < max_prompt_len; ++j)
213 |         {
214 |             h_prompt_tokens_mask[i * max_prompt_len + j] = (j < h_prompt_sequence_length[i]); // true for positions inside the prompt
215 |         }
216 |     }
217 | 
218 |     CHECK_CUDA_ERROR(cudaMemcpy(d_prompt_sequence_length, h_prompt_sequence_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice));
219 |     CHECK_CUDA_ERROR(cudaMemcpy(d_prompt_tokens_mask, h_prompt_tokens_mask, sizeof(bool) * max_prompt_len * batch_size, cudaMemcpyHostToDevice));
220 |     // Prompt token ids in [3, vocab_size) are generated on the device; this launch does not name a stream.
221 |     int block_size = 256;
222 |     int grid_size = (max_prompt_len * batch_size + block_size - 1) / block_size;
223 |     initIntVecKernel<<<grid_size, block_size>>>(d_prompt_tokens, max_prompt_len * batch_size, vocab_size, 3);
224 |     CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // surface execution errors instead of discarding the status
225 |     CHECK_CUDA_ERROR(cudaGetLastError());
226 | 
227 |     int *h_prompt_tokens = new int[max_prompt_len * batch_size];
228 |     CHECK_CUDA_ERROR(cudaMemcpy(h_prompt_tokens, d_prompt_tokens, sizeof(int) * max_prompt_len * batch_size, cudaMemcpyDeviceToHost));
229 |     printVecInVec(h_prompt_tokens, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens");
230 | 
231 |     decoding_params.cublas_handle = cublasHandle;
232 |     decoding_params.stream = stream;
233 |     decoding_params.embedding_table = d_embedding_table;
234 |     decoding_params.freq_cis = d_freq_cis;
235 |     decoding_params.prompt_sequence_length = d_prompt_sequence_length;
236 |     decoding_params.prompt_tokens = d_prompt_tokens;
237 |     decoding_params.prompt_tokens_mask = d_prompt_tokens_mask;
238 |     decoding_params.decodingnorm.gamma = d_decoding_resnorm_gamma;
239 |     decoding_params.output_weight.kernel = d_output_weight_kernel;
240 |     decoding_params.output_ids = d_output_ids;
241 |     decoding_params.sequence_length = d_sequence_lengths;
242 |     decoding_params.min_prompt_seq_len = min_prompt_seq_len;
243 |     decoding_params.max_prompt_seq_len = max_prompt_seq_len;
244 | 
245 |     const FasterLLaMA::OperationType type = sizeof(T) == sizeof(float) ? FasterLLaMA::OperationType::FP32 : FasterLLaMA::OperationType::FP16;
246 | 
247 |     FasterLLaMA::DecodingSampling<type> *decoding = new FasterLLaMA::DecodingSampling<type>(allocator, batch_size, max_prompt_len,
248 |                                                                                                 max_gen_len, head_num, size_per_head,
249 |                                                                                                 vocab_size, decoder_layers,
250 |                                                                                                 end_id, ffn_hidden_units, candidate_num,
251 |                                                                                                 probability_threshold);
252 |     // Bracket the forward pass with two CUDA events to measure its elapsed time.
253 |     cudaEvent_t start, stop;
254 |     CHECK_CUDA_ERROR(cudaEventCreate(&start));
255 |     CHECK_CUDA_ERROR(cudaEventCreate(&stop));
256 |     CHECK_CUDA_ERROR(cudaEventRecord(start));
257 |     cudaEventQuery(start); // result is not checked; NOTE(review): the purpose of this query is not evident from this file
258 | 
259 |     decoding->forward(param, decoding_params);
260 | 
261 |     CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // wait for the forward pass and report any failure it produced
262 | 
263 |     CHECK_CUDA_ERROR(cudaEventRecord(stop));
264 |     CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
265 |     float elapsedTime;
266 |     CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
267 |     CHECK_CUDA_ERROR(cudaEventDestroy(start));
268 |     CHECK_CUDA_ERROR(cudaEventDestroy(stop));
269 | 
270 |     printf("Time = %g ms.\n", elapsedTime);
271 |     printf("[INFO] batch_size %d topk %d topp %f head_num %d size_per_head %d max_prompt_len %d max_gen_len %d decoder_layers"
272 |            " %d vocab_size %d FL-CPP-decoding-sampling-time %.2f ms\n",
273 |            batch_size, candidate_num, probability_threshold, head_num, size_per_head, max_prompt_len, max_gen_len, decoder_layers,
274 |            vocab_size, elapsedTime);
275 | 
276 |     int *h_word_ids = new int[batch_size * total_len];
277 |     CHECK_CUDA_ERROR(cudaMemcpy(h_word_ids, decoding_params.output_ids, sizeof(int) * batch_size * total_len, cudaMemcpyDeviceToHost));
278 | 
279 |     int *h_seq_lengths = new int[batch_size];
280 |     CHECK_CUDA_ERROR(cudaMemcpy(h_seq_lengths, d_sequence_lengths, sizeof(int) * batch_size, cudaMemcpyDeviceToHost));
281 | 
282 |     printVecInVec(h_seq_lengths, 1, batch_size, 1, batch_size, "h_seq_lengths");
283 | 
284 |     printf("word_ids:\n[\n");
285 |     for (int i = 0; i < batch_size; ++i)
286 |     {
287 |         printf("[");
288 |         for (int j = 0; j < h_seq_lengths[i]; ++j) // print only the first h_seq_lengths[i] ids of row i
289 |         {
290 |             printf("%d\t", h_word_ids[i * total_len + j]);
291 |         }
292 |         printf("]\n");
293 |     }
294 |     printf("]\n");
295 | 
296 |     printVecInVec(h_prompt_tokens, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens");
297 | 
298 |     printVecInVec(h_prompt_tokens_mask, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens_mask");
299 |     // Every layer points at the same mask buffer, so it is freed exactly once, outside the loop.
300 |     device_free(param[0].attn_mask);
301 |     for (int i = 0; i < decoder_layers; ++i)
302 |     {
303 |         device_free(param[i].attn_resnorm.gamma);
304 |         device_free(param[i].attention.query_weight.kernel);
305 |         device_free(param[i].attention.query_weight.weight_scale);
306 |         device_free(param[i].attention.key_weight.kernel);
307 |         device_free(param[i].attention.key_weight.weight_scale);
308 |         device_free(param[i].attention.value_weight.kernel);
309 |         device_free(param[i].attention.value_weight.weight_scale);
310 |         device_free(param[i].attention.attention_output_weight.kernel);
311 |         device_free(param[i].attention.attention_output_weight.weight_scale);
312 |         device_free(param[i].ffn_resnorm.gamma);
313 |         device_free(param[i].ffn.w1_weight.kernel);
314 |         device_free(param[i].ffn.w1_weight.weight_scale);
315 |         device_free(param[i].ffn.w2_weight.kernel);
316 |         device_free(param[i].ffn.w2_weight.weight_scale);
317 |         device_free(param[i].ffn.w3_weight.kernel);
318 |         device_free(param[i].ffn.w3_weight.weight_scale);
319 |     }
320 | 
321 |     device_free(decoding_params.embedding_table);
322 |     device_free(decoding_params.freq_cis);
323 |     device_free(decoding_params.prompt_sequence_length);
324 |     device_free(decoding_params.prompt_tokens);
325 |     device_free(decoding_params.prompt_tokens_mask);
326 |     device_free(decoding_params.decodingnorm.gamma);
327 |     device_free(decoding_params.output_weight.kernel);
328 |     device_free(decoding_params.output_ids);
329 |     device_free(decoding_params.sequence_length);
330 |     delete[] param; // host-side arrays follow
331 |     delete[] h_prompt_sequence_length;
332 |     delete[] h_prompt_tokens_mask;
333 |     delete[] h_word_ids;
334 |     delete[] h_seq_lengths;
335 |     delete[] h_prompt_tokens;
336 |     delete decoding; // released before the cuBLAS handle and stream are destroyed
337 |     CHECK_CUBLAS_STATUS(cublasDestroy(cublasHandle)); // pairs with cublasCreate at the top of this function
338 |     CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); // pairs with cudaStreamCreate at the top of this function
339 | }
340 | // Entry point of the FP16 sample: seeds rand(), prints the name of device 0 and runs decoding_sample<half> with fixed sizes.
341 | int main()
342 | {
343 |     srand(0); // fixed seed for the host-side rand() calls
344 |     cudaDeviceProp prop;
345 |     CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0));
346 |     printf("Device %s\n", prop.name);
347 | 
348 |     const int batch_size = 4;
349 |     const int candidate_num = 0; // printed as "topk" by decoding_sample
350 |     const float probability_threshold = 0.8f; // printed as "topp" by decoding_sample
351 |     const int head_num = 8;
352 |     const int size_per_head = 128;
353 |     const int vocab_size = 200;
354 |     const int max_prompt_len = 16;
355 |     const int max_gen_len = 16;
356 |     const int decoder_layers = 4;
357 |     const int ffn_hidden_units = 128;
358 | 
359 |     decoding_sample<half>(batch_size, candidate_num, probability_threshold, head_num, size_per_head, vocab_size,
360 |                           max_prompt_len, max_gen_len, decoder_layers, ffn_hidden_units);
361 | 
362 |     return 0;
363 | }
364 | 


--------------------------------------------------------------------------------
/samples/llama_fp32.cu:
--------------------------------------------------------------------------------
  1 | #include "decoding_sampling.h"
  2 | #include <cstdio>
  3 | #include <curand_kernel.h>
  4 | #include <cstdlib>
  5 | #include <cuda_runtime.h>
  6 | #include <cfloat>
  7 | #include <cuda_fp16.h>
  8 | #include <utils.h>
  9 | // Prints the leading end_row x end_col window of a row-major matrix whose row stride is ncols, each value cast to float.
 10 | template <typename T>
 11 | void printVecInVec(const T *clusters, const int nrows, const int ncols, const int end_row, const int end_col, const char *str)
 12 | {
 13 |     (void)nrows; // part of the signature but never read by this generic version
 14 |     printf("%s:\n[\n", str);
 15 |     for (int r = 0; r < end_row; ++r)
 16 |     {
 17 |         const T *row_begin = clusters + r * ncols; // first element of row r
 18 |         printf("[");
 19 |         for (int c = 0; c < end_col; ++c)
 20 |             printf("%g  ", static_cast<float>(row_begin[c]));
 21 |         printf("]\n");
 22 |     }
 23 |     printf("]\n");
 24 | }
 25 | // half specialization: converts each element with __half2float, warns on an oversized window and never reads outside nrows x ncols.
 26 | template <>
 27 | void printVecInVec(const half *clusters, const int nrows, const int ncols, const int end_row, const int end_col, const char *str)
 28 | {
 29 |     printf("%s:\n[\n", str);
 30 |     if (end_row > nrows || end_col > ncols) // end == extent is a valid full print; only a strictly larger window is an error
 31 |         printf("invalid arguments!!!\nend_row > nrows or end_col > ncols\n");
 32 |     for (int i = 0; i < end_row && i < nrows; ++i) // clamp rows so an oversized request cannot read out of bounds
 33 |     {
 34 |         printf("[");
 35 |         for (int j = 0; j < end_col && j < ncols; ++j) // clamp columns for the same reason
 36 |         {
 37 |             printf("%g  ", __half2float(clusters[i * ncols + j]));
 38 |         }
 39 |         printf("]\n");
 40 |     }
 41 |     printf("]\n");
 42 | }
 43 | // Allocates size elements of T on the device and fills them from a host buffer of small rand()-derived values (scaled by 0.02, then cast to T).
 44 | template <typename T>
 45 | void device_malloc(T **ptr, int size)
 46 | {
 47 |     CHECK_CUDA_ERROR(cudaMalloc((void **)ptr, sizeof(T) * size));
 48 |     T *tmp = new T[size]; // host staging buffer
 49 |     for (T *slot = tmp; slot != tmp + size; ++slot)
 50 |         *slot = (T)((float)rand() / (RAND_MAX + 1.0) * 0.02); // types such as int8_t and int truncate this to 0
 51 |     CHECK_CUDA_ERROR(cudaMemcpy(*ptr, tmp, sizeof(T) * size, cudaMemcpyHostToDevice));
 52 |     delete[] tmp;
 53 | }
 54 | // half specialization: same allocation and fill as the generic version, but each value is converted with __float2half.
 55 | template <>
 56 | void device_malloc(half **ptr, int size)
 57 | {
 58 |     CHECK_CUDA_ERROR(cudaMalloc((void **)ptr, sizeof(half) * size));
 59 |     half *tmp = new half[size]; // host staging buffer
 60 |     for (half *slot = tmp; slot != tmp + size; ++slot)
 61 |         *slot = __float2half((float)rand() / (RAND_MAX + 1.0) * 0.02);
 62 |     CHECK_CUDA_ERROR(cudaMemcpy(*ptr, tmp, sizeof(half) * size, cudaMemcpyHostToDevice));
 63 |     delete[] tmp;
 64 | }
 65 | // Releases a device allocation with cudaFree and hands the result to CHECK_CUDA_ERROR.
 66 | template <typename T>
 67 | void device_free(T *ptr)
 68 | {
 69 |     CHECK_CUDA_ERROR(cudaFree(ptr));
 70 |     return;
 71 | }
 72 | // Block blockIdx.x fills one row of a length x length mask: 0 where row > column, -FLT_MAX cast to T elsewhere.
 73 | template <typename T>
 74 | __global__ void initAttnMaskKernel(T *attn_mask, const int length)
 75 | {
 76 |     const int row = blockIdx.x;
 77 |     T *const row_out = attn_mask + row * length; // first element of this block's row
 78 |     const int step = blockDim.x;
 79 |     for (int col = threadIdx.x; col < length; col += step)
 80 |         row_out[col] = (row > col) ? 0.0f : (T)(-1 * FLT_MAX); // NOTE(review): strict '>' also masks the diagonal - confirm intended
 81 | }
 82 | // One thread per element: writes min_val + (curand draw modulo (max_val - min_val)) into each of the first length entries of mat.
 83 | __global__ void initIntVecKernel(int *mat, const int length, const int max_val, const int min_val)
 84 | {
 85 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 86 |     if (gid >= length)
 87 |         return; // threads past the end of the vector write nothing
 88 |     curandState_t rng;
 89 |     curand_init(0, gid, 0, &rng); // seed 0, sequence number = element index, offset 0
 90 |     const int span = max_val - min_val;
 91 |     const int offset = static_cast<int>(curand(&rng) % span);
 92 |     mat[gid] = offset + min_val;
 93 | }
 94 | // Allocates all device buffers via device_malloc, runs one DecodingSampling::forward pass, prints timing and token ids, then releases every resource.
 95 | template <typename T>
 96 | void decoding_sample(const int batch_size, const int candidate_num, const float probability_threshold, const int head_num,
 97 |                      const int size_per_head, const int vocab_size, const int max_prompt_len, const int max_gen_len,
 98 |                      const int decoder_layers, const int ffn_hidden_units)
 99 | {
100 |     const int hidden_units = head_num * size_per_head;
101 |     const int total_len = max_prompt_len + max_gen_len; // prompt tokens plus generated tokens
102 |     const int end_id = 2;
103 |     // One cuBLAS handle bound to one stream; both are stored in every layer's params and in decoding_params.
104 |     cublasHandle_t cublasHandle;
105 |     CHECK_CUBLAS_STATUS(cublasCreate(&cublasHandle));
106 | 
107 |     cudaStream_t stream;
108 |     CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
109 |     CHECK_CUBLAS_STATUS(cublasSetStream(cublasHandle, stream));
110 | 
111 |     FasterLLaMA::Allocator<FasterLLaMA::AllocatorType::CUDA> allocator(0);
112 |     FasterLLaMA::DecoderInitParam<T, int8_t> *param = new FasterLLaMA::DecoderInitParam<T, int8_t>[decoder_layers];
113 |     // Per-layer buffers: T-typed norm gammas, int8_t kernels and float weight_scale vectors.
114 |     for (int i = 0; i < decoder_layers; i++)
115 |     {
116 |         param[i].stream = stream;
117 |         param[i].cublas_handle = cublasHandle;
118 | 
119 |         T *d_attn_resnorm_gamma;
120 |         int8_t *d_self_Q_kernel, *d_self_K_kernel, *d_self_V_kernel, *d_self_output_kernel;
121 |         float *d_self_Q_kernel_scale, *d_self_K_kernel_scale, *d_self_V_kernel_scale, *d_self_output_kernel_scale;
122 |         float *d_attn_mask; // [total_len, total_len]
123 |         T *d_ffn_resnorm_gamma;
124 |         int8_t *d_ffn_kernel1, *d_ffn_kernel2, *d_ffn_kernel3;
125 |         float *d_ffn_kernel1_scale, *d_ffn_kernel2_scale, *d_ffn_kernel3_scale;
126 | 
127 |         device_malloc(&d_attn_resnorm_gamma, hidden_units);
128 | 
129 |         device_malloc(&d_self_Q_kernel, hidden_units * hidden_units);
130 |         device_malloc(&d_self_K_kernel, hidden_units * hidden_units);
131 |         device_malloc(&d_self_V_kernel, hidden_units * hidden_units);
132 |         device_malloc(&d_self_output_kernel, hidden_units * hidden_units);
133 |         device_malloc(&d_self_Q_kernel_scale, hidden_units);
134 |         device_malloc(&d_self_K_kernel_scale, hidden_units);
135 |         device_malloc(&d_self_V_kernel_scale, hidden_units);
136 |         device_malloc(&d_self_output_kernel_scale, hidden_units);
137 | 
138 |         // attn_mask is 0 in the lower triangle and -FLT_MAX elsewhere (filled by initAttnMaskKernel); built once and shared by all layers
139 |         if (i == 0)
140 |         {
141 |             device_malloc(&d_attn_mask, total_len * total_len);
142 |             initAttnMaskKernel<<<total_len, 256, 0, stream>>>(d_attn_mask, total_len);
143 |             CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // surface execution errors instead of discarding the status
144 |             CHECK_CUDA_ERROR(cudaGetLastError());
145 |             param[i].attn_mask = d_attn_mask;
146 |         }
147 |         else
148 |         {
149 |             param[i].attn_mask = param[i - 1].attn_mask; // reuse the buffer allocated for layer 0
150 |         }
151 | 
152 |         device_malloc(&d_ffn_resnorm_gamma, hidden_units);
153 | 
154 |         device_malloc(&d_ffn_kernel1, ffn_hidden_units * hidden_units);
155 |         device_malloc(&d_ffn_kernel1_scale, ffn_hidden_units);
156 |         device_malloc(&d_ffn_kernel2, ffn_hidden_units * hidden_units);
157 |         device_malloc(&d_ffn_kernel2_scale, hidden_units);
158 |         device_malloc(&d_ffn_kernel3, ffn_hidden_units * hidden_units);
159 |         device_malloc(&d_ffn_kernel3_scale, ffn_hidden_units);
160 | 
161 |         param[i].attn_resnorm.gamma = d_attn_resnorm_gamma;
162 |         param[i].attn_resnorm.eps = 1e-5f;
163 |         param[i].attention.query_weight.kernel = d_self_Q_kernel;
164 |         param[i].attention.query_weight.weight_scale = d_self_Q_kernel_scale;
165 |         param[i].attention.key_weight.kernel = d_self_K_kernel;
166 |         param[i].attention.key_weight.weight_scale = d_self_K_kernel_scale;
167 |         param[i].attention.value_weight.kernel = d_self_V_kernel;
168 |         param[i].attention.value_weight.weight_scale = d_self_V_kernel_scale;
169 |         param[i].attention.attention_output_weight.kernel = d_self_output_kernel;
170 |         param[i].attention.attention_output_weight.weight_scale = d_self_output_kernel_scale;
171 |         param[i].ffn_resnorm.gamma = d_ffn_resnorm_gamma;
172 |         param[i].ffn_resnorm.eps = 1e-5f;
173 |         param[i].ffn.w1_weight.kernel = d_ffn_kernel1;
174 |         param[i].ffn.w1_weight.weight_scale = d_ffn_kernel1_scale;
175 |         param[i].ffn.w2_weight.kernel = d_ffn_kernel2;
176 |         param[i].ffn.w2_weight.weight_scale = d_ffn_kernel2_scale;
177 |         param[i].ffn.w3_weight.kernel = d_ffn_kernel3;
178 |         param[i].ffn.w3_weight.weight_scale = d_ffn_kernel3_scale;
179 |     }
180 |     // Buffers that are not tied to a single decoder layer.
181 |     FasterLLaMA::DecodingInitParam<T> decoding_params;
182 | 
183 |     T *d_embedding_table;
184 |     float *d_freq_cis;
185 |     int *d_prompt_sequence_length;
186 |     int *d_prompt_tokens;
187 |     bool *d_prompt_tokens_mask;
188 |     T *d_decoding_resnorm_gamma;
189 |     T *d_output_weight_kernel;
190 |     int *d_output_ids;
191 |     int *d_sequence_lengths;
192 | 
193 |     device_malloc(&d_embedding_table, hidden_units * vocab_size);
194 |     device_malloc(&d_freq_cis, total_len * size_per_head);
195 |     device_malloc(&d_prompt_sequence_length, batch_size);
196 |     device_malloc(&d_prompt_tokens, max_prompt_len * batch_size);
197 |     device_malloc(&d_prompt_tokens_mask, max_prompt_len * batch_size);
198 |     device_malloc(&d_decoding_resnorm_gamma, hidden_units);
199 |     device_malloc(&d_output_weight_kernel, hidden_units * vocab_size);
200 |     device_malloc(&d_output_ids, batch_size * total_len);
201 |     device_malloc(&d_sequence_lengths, batch_size);
202 |     // Host-side prompt lengths and validity mask; copied to the device right after this loop.
203 |     int *h_prompt_sequence_length = new int[batch_size];
204 |     bool *h_prompt_tokens_mask = new bool[max_prompt_len * batch_size];
205 |     int min_prompt_seq_len = INT_MAX;
206 |     int max_prompt_seq_len = -1;
207 |     for (int i = 0; i < batch_size; i++)
208 |     {
209 |         h_prompt_sequence_length[i] = max_prompt_len - batch_size + (rand() % batch_size) + 1; // in [max_prompt_len - batch_size + 1, max_prompt_len]
210 |         min_prompt_seq_len = min(min_prompt_seq_len, h_prompt_sequence_length[i]);
211 |         max_prompt_seq_len = max(max_prompt_seq_len, h_prompt_sequence_length[i]);
212 |         for (int j = 0; j < max_prompt_len; ++j)
213 |         {
214 |             h_prompt_tokens_mask[i * max_prompt_len + j] = (j < h_prompt_sequence_length[i]); // true for positions inside the prompt
215 |         }
216 |     }
217 | 
218 |     CHECK_CUDA_ERROR(cudaMemcpy(d_prompt_sequence_length, h_prompt_sequence_length, sizeof(int) * batch_size, cudaMemcpyHostToDevice));
219 |     CHECK_CUDA_ERROR(cudaMemcpy(d_prompt_tokens_mask, h_prompt_tokens_mask, sizeof(bool) * max_prompt_len * batch_size, cudaMemcpyHostToDevice));
220 |     // Prompt token ids in [3, vocab_size) are generated on the device; this launch does not name a stream.
221 |     int block_size = 256;
222 |     int grid_size = (max_prompt_len * batch_size + block_size - 1) / block_size;
223 |     initIntVecKernel<<<grid_size, block_size>>>(d_prompt_tokens, max_prompt_len * batch_size, vocab_size, 3);
224 |     CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // surface execution errors instead of discarding the status
225 |     CHECK_CUDA_ERROR(cudaGetLastError());
226 | 
227 |     int *h_prompt_tokens = new int[max_prompt_len * batch_size];
228 |     CHECK_CUDA_ERROR(cudaMemcpy(h_prompt_tokens, d_prompt_tokens, sizeof(int) * max_prompt_len * batch_size, cudaMemcpyDeviceToHost));
229 |     printVecInVec(h_prompt_tokens, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens");
230 | 
231 |     decoding_params.cublas_handle = cublasHandle;
232 |     decoding_params.stream = stream;
233 |     decoding_params.embedding_table = d_embedding_table;
234 |     decoding_params.freq_cis = d_freq_cis;
235 |     decoding_params.prompt_sequence_length = d_prompt_sequence_length;
236 |     decoding_params.prompt_tokens = d_prompt_tokens;
237 |     decoding_params.prompt_tokens_mask = d_prompt_tokens_mask;
238 |     decoding_params.decodingnorm.gamma = d_decoding_resnorm_gamma;
239 |     decoding_params.output_weight.kernel = d_output_weight_kernel;
240 |     decoding_params.output_ids = d_output_ids;
241 |     decoding_params.sequence_length = d_sequence_lengths;
242 |     decoding_params.min_prompt_seq_len = min_prompt_seq_len;
243 |     decoding_params.max_prompt_seq_len = max_prompt_seq_len;
244 | 
245 |     const FasterLLaMA::OperationType type = sizeof(T) == sizeof(float) ? FasterLLaMA::OperationType::FP32 : FasterLLaMA::OperationType::FP16;
246 | 
247 |     FasterLLaMA::DecodingSampling<type> *decoding = new FasterLLaMA::DecodingSampling<type>(allocator, batch_size, max_prompt_len,
248 |                                                                                                 max_gen_len, head_num, size_per_head,
249 |                                                                                                 vocab_size, decoder_layers,
250 |                                                                                                 end_id, ffn_hidden_units, candidate_num,
251 |                                                                                                 probability_threshold);
252 |     // Bracket the forward pass with two CUDA events to measure its elapsed time.
253 |     cudaEvent_t start, stop;
254 |     CHECK_CUDA_ERROR(cudaEventCreate(&start));
255 |     CHECK_CUDA_ERROR(cudaEventCreate(&stop));
256 |     CHECK_CUDA_ERROR(cudaEventRecord(start));
257 |     cudaEventQuery(start); // result is not checked; NOTE(review): the purpose of this query is not evident from this file
258 | 
259 |     decoding->forward(param, decoding_params);
260 | 
261 |     CHECK_CUDA_ERROR(cudaDeviceSynchronize()); // wait for the forward pass and report any failure it produced
262 | 
263 |     CHECK_CUDA_ERROR(cudaEventRecord(stop));
264 |     CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
265 |     float elapsedTime;
266 |     CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
267 |     CHECK_CUDA_ERROR(cudaEventDestroy(start));
268 |     CHECK_CUDA_ERROR(cudaEventDestroy(stop));
269 | 
270 |     printf("Time = %g ms.\n", elapsedTime);
271 |     printf("[INFO] batch_size %d topk %d topp %f head_num %d size_per_head %d max_prompt_len %d max_gen_len %d decoder_layers"
272 |            " %d vocab_size %d FL-CPP-decoding-sampling-time %.2f ms\n",
273 |            batch_size, candidate_num, probability_threshold, head_num, size_per_head, max_prompt_len, max_gen_len, decoder_layers,
274 |            vocab_size, elapsedTime);
275 | 
276 |     int *h_word_ids = new int[batch_size * total_len];
277 |     CHECK_CUDA_ERROR(cudaMemcpy(h_word_ids, decoding_params.output_ids, sizeof(int) * batch_size * total_len, cudaMemcpyDeviceToHost));
278 | 
279 |     int *h_seq_lengths = new int[batch_size];
280 |     CHECK_CUDA_ERROR(cudaMemcpy(h_seq_lengths, d_sequence_lengths, sizeof(int) * batch_size, cudaMemcpyDeviceToHost));
281 | 
282 |     printVecInVec(h_seq_lengths, 1, batch_size, 1, batch_size, "h_seq_lengths");
283 | 
284 |     printf("word_ids:\n[\n");
285 |     for (int i = 0; i < batch_size; ++i)
286 |     {
287 |         printf("[");
288 |         for (int j = 0; j < h_seq_lengths[i]; ++j) // print only the first h_seq_lengths[i] ids of row i
289 |         {
290 |             printf("%d\t", h_word_ids[i * total_len + j]);
291 |         }
292 |         printf("]\n");
293 |     }
294 |     printf("]\n");
295 | 
296 |     printVecInVec(h_prompt_tokens, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens");
297 | 
298 |     printVecInVec(h_prompt_tokens_mask, batch_size, max_prompt_len, batch_size, max_prompt_len, "h_prompt_tokens_mask");
299 |     // Every layer points at the same mask buffer, so it is freed exactly once, outside the loop.
300 |     device_free(param[0].attn_mask);
301 |     for (int i = 0; i < decoder_layers; ++i)
302 |     {
303 |         device_free(param[i].attn_resnorm.gamma);
304 |         device_free(param[i].attention.query_weight.kernel);
305 |         device_free(param[i].attention.query_weight.weight_scale);
306 |         device_free(param[i].attention.key_weight.kernel);
307 |         device_free(param[i].attention.key_weight.weight_scale);
308 |         device_free(param[i].attention.value_weight.kernel);
309 |         device_free(param[i].attention.value_weight.weight_scale);
310 |         device_free(param[i].attention.attention_output_weight.kernel);
311 |         device_free(param[i].attention.attention_output_weight.weight_scale);
312 |         device_free(param[i].ffn_resnorm.gamma);
313 |         device_free(param[i].ffn.w1_weight.kernel);
314 |         device_free(param[i].ffn.w1_weight.weight_scale);
315 |         device_free(param[i].ffn.w2_weight.kernel);
316 |         device_free(param[i].ffn.w2_weight.weight_scale);
317 |         device_free(param[i].ffn.w3_weight.kernel);
318 |         device_free(param[i].ffn.w3_weight.weight_scale);
319 |     }
320 | 
321 |     device_free(decoding_params.embedding_table);
322 |     device_free(decoding_params.freq_cis);
323 |     device_free(decoding_params.prompt_sequence_length);
324 |     device_free(decoding_params.prompt_tokens);
325 |     device_free(decoding_params.prompt_tokens_mask);
326 |     device_free(decoding_params.decodingnorm.gamma);
327 |     device_free(decoding_params.output_weight.kernel);
328 |     device_free(decoding_params.output_ids);
329 |     device_free(decoding_params.sequence_length);
330 |     delete[] param; // host-side arrays follow
331 |     delete[] h_prompt_sequence_length;
332 |     delete[] h_prompt_tokens_mask;
333 |     delete[] h_word_ids;
334 |     delete[] h_seq_lengths;
335 |     delete[] h_prompt_tokens;
336 |     delete decoding; // released before the cuBLAS handle and stream are destroyed
337 |     CHECK_CUBLAS_STATUS(cublasDestroy(cublasHandle)); // pairs with cublasCreate at the top of this function
338 |     CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); // pairs with cudaStreamCreate at the top of this function
339 | }
340 | // Entry point of the FP32 sample: seeds rand(), prints the name of device 0 and runs decoding_sample<float> with fixed sizes.
341 | int main()
342 | {
343 |     srand(0); // fixed seed for the host-side rand() calls
344 |     cudaDeviceProp prop;
345 |     CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0));
346 |     printf("Device %s\n", prop.name);
347 | 
348 |     const int batch_size = 4;
349 |     const int candidate_num = 0; // printed as "topk" by decoding_sample
350 |     const float probability_threshold = 0.8f; // printed as "topp" by decoding_sample
351 |     const int head_num = 8;
352 |     const int size_per_head = 128;
353 |     const int vocab_size = 200;
354 |     const int max_prompt_len = 16;
355 |     const int max_gen_len = 16;
356 |     const int decoder_layers = 4;
357 |     const int ffn_hidden_units = 128;
358 | 
359 |     decoding_sample<float>(batch_size, candidate_num, probability_threshold, head_num, size_per_head, vocab_size,
360 |                            max_prompt_len, max_gen_len, decoder_layers, ffn_hidden_units);
361 | 
362 |     return 0;
363 | }
364 | 


--------------------------------------------------------------------------------