├── README_zh-CN.md ├── README.md ├── requirements ├── runtime.txt ├── build.txt └── test.txt ├── requirements.txt ├── src └── turbomind │ ├── CMakeLists.txt │ ├── utils │ ├── CMakeLists.txt │ ├── macro.h │ ├── parser.h │ ├── parser.cc │ └── tensor.h │ ├── kernels │ ├── gemm │ │ ├── gpu_metric.h │ │ ├── simt.h │ │ ├── tuner │ │ │ ├── cache_utils.h │ │ │ ├── stopping_criterion.h │ │ │ ├── sampler.h │ │ │ ├── cache_utils.cu │ │ │ ├── stats.h │ │ │ ├── params.h │ │ │ ├── stopping_criterion.cc │ │ │ ├── measurer.h │ │ │ ├── measurer.cu │ │ │ ├── sampler.cu │ │ │ └── params.cc │ │ ├── test │ │ │ ├── reference.h │ │ │ ├── quantization.h │ │ │ ├── models.h │ │ │ ├── test_utils.h │ │ │ ├── quantization.cu │ │ │ ├── gemm_bench.cu │ │ │ ├── gemm_test.cu │ │ │ └── reference.cu │ │ ├── dispatch_cache.h │ │ ├── arch.h │ │ ├── predicate.h │ │ ├── registry.h │ │ ├── cast.h │ │ ├── registry.cu │ │ ├── kernel │ │ │ ├── f16_u4g128_f16_tnt_sm75_simt.cu │ │ │ ├── f16_u4g128_f16_tnt_sm75_s16816.cu │ │ │ ├── f16_u4g128_f16_tnt_sm90_s16816.cu │ │ │ ├── f16_u4g128_f16_tnt_sm70_s884.cu │ │ │ └── f16_u4g128_f16_tnt_sm80_s16816.cu │ │ ├── arch │ │ │ ├── mma_simt.h │ │ │ ├── mma_sm80.h │ │ │ ├── mma_sm70.h │ │ │ ├── smem_copy_simt.h │ │ │ ├── config_sm70_s884.h │ │ │ ├── config_sm75_s16816.h │ │ │ ├── config_sm80_s16816.h │ │ │ ├── config_simt.h │ │ │ ├── smem_copy_sm70.h │ │ │ ├── operand_sm70_s884.h │ │ │ └── operand_simt.h │ │ ├── format.h │ │ ├── gemm.h │ │ ├── operand.h │ │ ├── CMakeLists.txt │ │ ├── desc.h │ │ ├── iterator.h │ │ ├── unpack.cu │ │ ├── cta_map.h │ │ ├── kernel.h │ │ ├── utils.h │ │ ├── thread_group_map.h │ │ ├── transform.h │ │ ├── gpu_metric.cu │ │ └── types.h │ └── core │ │ ├── pipe_iter.h │ │ ├── math.h │ │ ├── meta.h │ │ ├── sub_byte_ptr.h │ │ ├── sync.h │ │ ├── common.h │ │ ├── data_type.h │ │ ├── smem.h │ │ ├── array.h │ │ ├── layout.h │ │ └── thread_map.h │ └── api │ └── python │ ├── linear.h │ └── CMakeLists.txt ├── MANIFEST.in ├── turbomind ├── __init__.py ├── version.py └── utils.py ├── generate.sh ├── .gitignore ├── .github ├── ISSUE_TEMPLATE │ ├── 3-documentation.yml │ ├── 1-feature-request.yml │ └── 2-bug-report.yml ├── workflows │ ├── lint.yml │ ├── windows-x64-gpu.yml │ ├── linux-x64-gpu.yml │ ├── pypi.yml │ └── cuda11.8-whl-release.yml └── md-link-config.json ├── example ├── generate.py ├── module.py ├── test_linear.py └── modeling_turbomind.py ├── .pre-commit-config.yaml └── .clang-format /README_zh-CN.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # turbomind 2 | -------------------------------------------------------------------------------- /requirements/runtime.txt: -------------------------------------------------------------------------------- 1 | torch 2 | -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | pybind11<=2.13.1 2 | setuptools 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/runtime.txt 3 | -------------------------------------------------------------------------------- /src/turbomind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(utils) 2 | add_subdirectory(kernels/gemm) 3 | add_subdirectory(api/python) 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include turbomind/lib/*.so 3 | include turbomind/lib/*.so* 4 | include turbomind/lib/*.dll 5 | include turbomind/lib/*.pyd 6 | -------------------------------------------------------------------------------- /turbomind/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | from .linear import Linear 4 | 5 | __all__ = ['Linear'] 6 | -------------------------------------------------------------------------------- /src/turbomind/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | add_library(parser STATIC parser.cc) 4 | set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | allure-pytest 2 | coverage 3 | pynvml 4 | pytest 5 | pytest-assume 6 | pytest-order 7 | pytest-rerunfailures 8 | pytest-sugar 9 | pytest-xdist 10 | pyyaml 11 | -------------------------------------------------------------------------------- /src/turbomind/utils/macro.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__) 4 | 5 | #define __PRETTY_FUNCTION__ __FUNCSIG__ 6 | 7 | #endif 8 | 9 | typedef unsigned int uint; 10 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/gpu_metric.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | 7 | namespace turbomind::gemm { 8 | 9 | // bytes / second 10 | float MeasureL2CacheThroughput(); 11 | 12 | // fused multiply-add / second 13 | float MeasureMmaThroughput(int proble_size = 16384); 14 | 15 | } // namespace turbomind::gemm 16 | -------------------------------------------------------------------------------- /generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | WORKSPACE_PATH=$(dirname "$(readlink -f "$0")") 3 | 4 | builder="-G Ninja" 5 | 6 | if [ "$1" == "make" ]; then 7 | builder="" 8 | fi 9 | 10 | cmake ${builder} .. \ 11 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 12 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ 13 | -DCMAKE_INSTALL_PREFIX=${WORKSPACE_PATH}/install \ 14 | -DCMAKE_CUDA_FLAGS="-lineinfo" \ 15 | -DUSE_NVTX=ON \ 16 | -DBUILD_TEST=OFF 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::gemm::simt { 6 | 7 | // constexpr int OP_M = 2; 8 | // constexpr int OP_N = 16; 9 | // constexpr int OP_K = 4; 10 | 11 | // constexpr int OP_M = 4; 12 | // constexpr int OP_N = 8; 13 | // constexpr int OP_K = 8; 14 | 15 | constexpr int OP_M = 1; 16 | constexpr int OP_N = 32; 17 | constexpr int OP_K = 8; 18 | 19 | } // namespace turbomind::gemm::simt 20 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/cache_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | namespace turbomind::gemm { 8 | 9 | class CacheFlushing { 10 | public: 11 | static void flush(cudaStream_t stream = {}); 12 | 13 | private: 14 | CacheFlushing(); 15 | void operator()(cudaStream_t stream) const; 16 | 17 | uint32_t* buffer_; 18 | size_t size_; 19 | }; 20 | 21 | } // namespace turbomind::gemm 22 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stopping_criterion.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/stats.h" 4 | #include 5 | 6 | namespace turbomind::gemm { 7 | 8 | class StoppingCriterion { 9 | public: 10 | virtual ~StoppingCriterion() = default; 11 | virtual bool should_stop(const Stats& stats) = 0; 12 | }; 13 | 14 | std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms); 15 | 16 | } // namespace turbomind::gemm 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/pipe_iter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | template 8 | struct PipeIter { 9 | static constexpr int kMaxStep = Stages * Step; 10 | 11 | int r = 0; 12 | int w = kMaxStep - Step; 13 | 14 | __inline__ __device__ PipeIter& operator++() 15 | { 16 | w = r; 17 | r += Step; 18 | if (r == kMaxStep) { 19 | r -= kMaxStep; 20 | } 21 | return *this; 22 | } 23 | }; 24 | 25 | } // namespace turbomind 26 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/reference.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | class Reference { 12 | public: 13 | Reference(); 14 | ~Reference(); 15 | 16 | void set_stream(cudaStream_t stream); 17 | 18 | void gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc); 19 | 20 | private: 21 | cublasHandle_t handle_; 22 | }; 23 | 24 | } // namespace turbomind::gemm 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | 35 | .cache 36 | /build 37 | 38 | # Byte-compiled / optimized / DLL files 39 | __pycache__/ 40 | .vscode/ 41 | 42 | # Distribution / packaging 43 | .eggs/ 44 | wheels/ 45 | *.egg-info/ 46 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to the documentation. 3 | labels: "kind/doc,status/unconfirmed" 4 | title: "[Docs] " 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description the issue. 12 | validations: 13 | required: true 14 | 15 | - type: textarea 16 | attributes: 17 | label: Suggest a potential alternative/fix 18 | description: > 19 | Tell us how we could improve the documentation in this regard. 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! 24 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/sampler.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/desc.h" 6 | #include "src/turbomind/kernels/gemm/tuner/measurer.h" 7 | 8 | #include 9 | 10 | namespace turbomind::gemm { 11 | 12 | class Sampler { 13 | public: 14 | explicit Sampler(Measurer& measurer, int k_clusters): measurer_{measurer}, k_clusters_{k_clusters} {} 15 | 16 | std::vector Run(std::vector specs, const Launcher& launcher, cudaStream_t stream); 17 | 18 | private: 19 | Measurer& measurer_; 20 | int k_clusters_; 21 | }; 22 | 23 | } // namespace turbomind::gemm 24 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/cache_utils.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h" 4 | 5 | namespace turbomind::gemm { 6 | 7 | CacheFlushing::CacheFlushing() 8 | { 9 | cudaDeviceProp props{}; 10 | cudaGetDeviceProperties(&props, 0); 11 | 12 | size_ = props.l2CacheSize; 13 | 14 | cudaMalloc(&buffer_, size_); 15 | } 16 | 17 | void CacheFlushing::flush(cudaStream_t stream) 18 | { 19 | thread_local CacheFlushing inst{}; 20 | inst(stream); 21 | } 22 | 23 | void CacheFlushing::operator()(cudaStream_t stream) const 24 | { 25 | cudaMemsetAsync(buffer_, 0, size_, stream); 26 | } 27 | 28 | } // namespace turbomind::gemm 29 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/math.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | #include 7 | 8 | namespace turbomind { 9 | 10 | template 11 | TM_HOST_DEVICE constexpr T ceil_div(T a, T b) 12 | { 13 | return (a + b - 1) / b; 14 | } 15 | 16 | template 17 | TM_HOST_DEVICE constexpr T round_up(T a, T b) 18 | { 19 | return (a + b - 1) / b * b; 20 | } 21 | 22 | template 23 | TM_HOST_DEVICE constexpr T log2(T x) 24 | { 25 | T n = 0; 26 | while (x != 1) { 27 | x /= 2; 28 | ++n; 29 | } 30 | return n; 31 | } 32 | 33 | // static_assert(log2(65536) == 16); 34 | // static_assert(log2(32) == 5); 35 | // static_assert(log2(1) == 0); 36 | 37 | } // namespace turbomind 38 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/dispatch_cache.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/desc.h" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | class DispatchCache { 12 | public: 13 | DispatchCache(std::vector kernels); 14 | 15 | ~DispatchCache(); 16 | 17 | std::optional LowerBound(const GemmDesc& desc) const; 18 | 19 | std::optional Find(const GemmDesc& desc) const; 20 | 21 | bool Insert(const GemmDesc& desc, const LaunchSpec& spec); 22 | 23 | int Export(std::ostream& os) const; 24 | 25 | int Import(std::istream& is); 26 | 27 | private: 28 | struct Impl; 29 | std::unique_ptr impl_; 30 | }; 31 | 32 | } // namespace turbomind::gemm 33 | -------------------------------------------------------------------------------- /src/turbomind/utils/parser.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include 4 | #include 5 | 6 | namespace turbomind { 7 | 8 | std::vector> ParseArgsList(const std::string& str); 9 | 10 | std::vector ParseListOrTuple(const std::string& str); 11 | 12 | inline void Parse(int& value, const std::string& str) 13 | { 14 | value = std::stoi(str); 15 | } 16 | 17 | inline void Parse(float& value, const std::string& str) 18 | { 19 | value = std::stof(str); 20 | } 21 | 22 | template 23 | void Parse(std::vector& xs, const std::string& str) 24 | { 25 | const auto ss = ParseListOrTuple(str); 26 | for (const auto& s : ss) { 27 | xs.emplace_back(); 28 | Parse(xs.back(), s); 29 | } 30 | } 31 | 32 | } // namespace turbomind 33 | -------------------------------------------------------------------------------- /turbomind/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Tuple 3 | 4 | __version__ = '0.0.2' 5 | short_version = __version__ 6 | 7 | 8 | def parse_version_info(version_str: str) -> Tuple: 9 | """Parse version from a string. 10 | 11 | Args: 12 | version_str (str): A string represents a version info. 13 | 14 | Returns: 15 | tuple: A sequence of integer and string represents version. 16 | """ 17 | _version_info = [] 18 | for x in version_str.split('.'): 19 | if x.isdigit(): 20 | _version_info.append(int(x)) 21 | elif x.find('rc') != -1: 22 | patch_version = x.split('rc') 23 | _version_info.append(int(patch_version[0])) 24 | _version_info.append(f'rc{patch_version[1]}') 25 | return tuple(_version_info) 26 | 27 | 28 | version_info = parse_version_info(__version__) 29 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/quantization.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/types.h" 4 | #include 5 | #include 6 | 7 | #pragma once 8 | 9 | namespace turbomind::gemm { 10 | 11 | template 12 | void Quantize(const thrust::universal_vector& x, 13 | int m, 14 | int k, 15 | Order order, 16 | int group_size, 17 | thrust::universal_vector& x_p, // pseudo-quantized 18 | thrust::universal_vector& x_q, // quantized ushort 19 | thrust::universal_vector& x_u, // scales & zeros (always m-major) 20 | cudaStream_t stream); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/turbomind/api/python/linear.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | #include "src/turbomind/utils/tensor.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace turbomind { 13 | 14 | enum class WeightType : int 15 | { 16 | kFP32, 17 | kFP16, 18 | kFP8, // not supported yet 19 | kBF16, 20 | kINT8, 21 | kINT4 22 | }; 23 | 24 | class Linear { 25 | public: 26 | Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size); 27 | void post_init(std::shared_ptr qweight, const Tensor& scales, const Tensor& qzeros, bool simt); 28 | void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr); 29 | ~Linear() {} 30 | 31 | static void clearWorkspaces(); 32 | 33 | private: 34 | struct Impl; 35 | std::shared_ptr impl_; 36 | }; 37 | }; // namespace turbomind 38 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stats.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include 4 | 5 | namespace turbomind::gemm { 6 | 7 | class Stats { 8 | public: 9 | Stats(): count_{}, mean_{}, m2_{} {} 10 | 11 | float mean() const noexcept 12 | { 13 | return mean_; 14 | } 15 | 16 | float sum() const noexcept 17 | { 18 | return mean_ * count_; 19 | } 20 | 21 | int count() const noexcept 22 | { 23 | return count_; 24 | } 25 | 26 | float get_variance() const noexcept 27 | { 28 | return count_ < 2 ? std::numeric_limits::quiet_NaN() : m2_ / count_; 29 | } 30 | 31 | void add_sample(float x) noexcept 32 | { 33 | ++count_; 34 | float delta = x - mean_; 35 | mean_ += delta / count_; 36 | float delta2 = x - mean_; 37 | m2_ += delta * delta2; 38 | } 39 | 40 | private: 41 | int count_; 42 | float mean_; 43 | float m2_; 44 | }; 45 | 46 | } // namespace turbomind::gemm 47 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/meta.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | template 8 | struct basic_type { 9 | using type = T; 10 | }; 11 | 12 | template 13 | constexpr basic_type type_c{}; 14 | 15 | template 16 | struct constant { 17 | using type = constant; 18 | using value_type = decltype(v); 19 | 20 | static constexpr value_type value = v; 21 | 22 | constexpr value_type operator()() const noexcept 23 | { 24 | return v; 25 | } 26 | constexpr operator value_type() const noexcept 27 | { 28 | return v; 29 | } 30 | }; 31 | 32 | template 33 | struct pair { 34 | }; 35 | 36 | template 37 | constexpr auto first(pair) 38 | { 39 | return u; 40 | } 41 | 42 | template 43 | constexpr auto second(pair) 44 | { 45 | return v; 46 | } 47 | 48 | template 49 | struct triplet { 50 | }; 51 | 52 | } // namespace turbomind 53 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/models.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | static const std::vector> config{ 10 | {11008 * 2, 4096}, {4096, 11008}, {12288, 4096}, {4096, 4096}, // llama2-7b 11 | {14336 * 2, 4096}, {4096, 14336}, {6144, 4096}, {4096, 4096}, // llama3-8b / internlm2.5-7b 12 | {16384 * 2, 6144}, {6144, 16384}, {8192, 6144}, {6144, 6144}, // internlm2-20b 13 | {13696 * 2, 4096}, {4096, 13696}, {4608, 4096}, {4096, 4096}, // glm4-9b 14 | {18944 * 2, 3584}, {3584, 18944}, {4608, 3584}, {3584, 3584}, // qwen2-7b 15 | {20480 * 2, 7168}, {7168, 20480}, {9216, 7168}, {7168, 7168}, // yi-34b 16 | {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192}, // llama2-70b / llama3-70b 17 | {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192} // qwen2-72b-instruct-awq 18 | }; 19 | // {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192}, // qwen2-72b 20 | -------------------------------------------------------------------------------- /src/turbomind/api/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | project(_turbomind_ext) 5 | 6 | find_package(pybind11 CONFIG) 7 | if(NOT pybind11_FOUND) 8 | execute_process(COMMAND "pybind11-config" "--cmakedir" 9 | RESULT_VARIABLE _COMMAND_SUCCESS 10 | OUTPUT_VARIABLE pybind11_DIR 11 | OUTPUT_STRIP_TRAILING_WHITESPACE) 12 | find_package(pybind11 CONFIG) 13 | endif() 14 | 15 | pybind11_add_module(${PROJECT_NAME} bind.cpp linear.cc) 16 | target_link_libraries(${PROJECT_NAME} PRIVATE gemm2) 17 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) 18 | 19 | set(_INSTALL_CUDA_RPATH 20 | "\$ORIGIN" 21 | "\$ORIGIN/../../nvidia/nccl/lib/" 22 | "\$ORIGIN/../../nvidia/cuda_runtime/lib/" 23 | "\$ORIGIN/../../nvidia/cublas/lib/" 24 | "\$ORIGIN/../../nvidia/curand/lib/" 25 | ) 26 | set_target_properties(${PROJECT_NAME} PROPERTIES 27 | BUILD_RPATH "\$ORIGIN" 28 | INSTALL_RPATH "${_INSTALL_CUDA_RPATH}" 29 | ) 30 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v2 10 | - name: Set up Python 3.8 11 | uses: actions/setup-python@v2 12 | with: 13 | python-version: 3.8 14 | - name: Install pre-commit hook 15 | run: | 16 | python -m pip install pre-commit 17 | pre-commit install 18 | - name: Linting 19 | run: pre-commit run --all-files 20 | - name: Format c/cuda codes with clang-format 21 | uses: DoozyX/clang-format-lint-action@v0.13 22 | with: 23 | source: src 24 | extensions: h,c,cpp,hpp,cu,cuh,cc 25 | clangFormatVersion: 11 26 | style: file 27 | - name: Check markdown link 28 | uses: gaurav-nelson/github-action-markdown-link-check@v1 29 | with: 30 | use-quiet-mode: 'yes' 31 | use-verbose-mode: 'yes' 32 | config-file: '.github/md-link-config.json' 33 | file-path: './README.md, ./LICENSE, ./README_zh-CN.md' 34 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/params.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | struct TuningParams { 12 | // Split-k params 13 | int max_splits = 8; 14 | int max_waves = 10; 15 | 16 | // Swizzling params 17 | std::vector swizzle{3}; 18 | 19 | // Sampling params 20 | float top_k = 0; 21 | int clusters = 5; 22 | int min_iter = 1; 23 | int max_iter = 10; 24 | float max_time = 1.f; 25 | 26 | std::vector seq; 27 | }; 28 | 29 | // example 30 | // max_splits=8,top_splits=5,max_waves=16,top_k=10,swizzle=[2,3,4],clusters=5,max_iter=10,min_iter=1,max_time=10.0 31 | void ParseTuningParams(TuningParams& params, const std::string& str); 32 | 33 | // example 34 | // 16-16-128,256-128-1024,8192 35 | std::vector ParseTuningSequence(const std::string& str); 36 | 37 | std::vector GenerateTuningSequence(const std::vector>& generators); 38 | 39 | std::vector> GetDefaultTuningGenerators(); 40 | 41 | } // namespace turbomind::gemm 42 | -------------------------------------------------------------------------------- /.github/md-link-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignorePatterns": [ 3 | { 4 | "pattern": "^https://www.reddit.com/" 5 | }, 6 | { 7 | "pattern": "^https://developer.nvidia.com/" 8 | }, 9 | { 10 | "pattern": "^https://docs.openvino.ai/" 11 | }, 12 | { 13 | "pattern": "^https://developer.android.com/" 14 | }, 15 | { 16 | "pattern": "^https://developer.qualcomm.com/" 17 | }, 18 | { 19 | "pattern": "^http://localhost" 20 | }, 21 | { 22 | "pattern": "^https://twitter.com" 23 | }, 24 | { 25 | "pattern": "^https://platform.openai.com" 26 | }, 27 | { 28 | "pattern": "^http://0.0.0.0" 29 | } 30 | ], 31 | "httpHeaders": [ 32 | { 33 | "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"], 34 | "headers": { 35 | "Accept-Encoding": "zstd, br, gzip, deflate" 36 | } 37 | } 38 | ], 39 | "timeout": "20s", 40 | "retryOn429": true, 41 | "retryCount": 5, 42 | "fallbackRetryDelay": "30s", 43 | "aliveStatusCodes": [200, 206, 429] 44 | } 45 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stopping_criterion.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" 4 | #include 5 | 6 | namespace turbomind::gemm { 7 | 8 | namespace stopping_criterions { 9 | 10 | class Optimistic: public StoppingCriterion { 11 | public: 12 | Optimistic(int min_iter, int max_iter, float max_ms) 13 | { 14 | min_iter_ = std::max(min_iter, 1); 15 | max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits::max(); 16 | max_ms_ = max_ms > 0 ? max_ms : std::numeric_limits::infinity(); 17 | } 18 | bool should_stop(const Stats& stats) override 19 | { 20 | return stats.count() >= min_iter_ && (stats.count() >= max_iter_ || stats.sum() >= max_ms_); 21 | } 22 | 23 | private: 24 | int min_iter_; 25 | int max_iter_; 26 | float max_ms_; 27 | }; 28 | 29 | } // namespace stopping_criterions 30 | 31 | std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms) 32 | { 33 | return std::make_unique(min_iter, max_iter, max_ms); 34 | } 35 | 36 | } // namespace turbomind::gemm 37 | -------------------------------------------------------------------------------- /src/turbomind/utils/parser.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace turbomind { 9 | 10 | std::vector> ParseArgsList(const std::string& str) 11 | { 12 | const std::regex regex(R"((\w+)=([^,\[\(]+|\[.*\]|\(.*\)))"); 13 | 14 | std::sregex_iterator beg(str.begin(), str.end(), regex); 15 | std::sregex_iterator end{}; 16 | 17 | std::vector> ret; 18 | for (auto it = beg; it != end; ++it) { 19 | std::smatch match = *it; 20 | ret.emplace_back(match[1], match[2]); 21 | } 22 | 23 | return ret; 24 | } 25 | 26 | std::vector ParseListOrTuple(const std::string& str) 27 | { 28 | const std::regex regex(R"([,\[\]\(\)]+)"); 29 | 30 | std::vector ret; 31 | std::copy_if(std::sregex_token_iterator(str.begin(), str.end(), regex, -1), 32 | std::sregex_token_iterator{}, 33 | std::back_inserter(ret), 34 | [](const std::string& s) { return !s.empty(); }); 35 | 36 | return ret; 37 | } 38 | 39 | } // namespace turbomind 40 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::gemm { 6 | 7 | // tags for dispatching & conditional codegen 8 | 9 | template 10 | struct Arch { 11 | static constexpr bool is_compatible(int arch) 12 | { 13 | return Begin <= arch && (End == -1 || arch < End); 14 | } 15 | }; 16 | 17 | struct Sm70: Arch<700, 750> { 18 | static constexpr int value = 700; 19 | }; 20 | 21 | struct Sm75: Arch<750, 800> { 22 | static constexpr int value = 750; 23 | }; 24 | 25 | struct Sm80: Arch<800, 900> { 26 | static constexpr int value = 800; 27 | }; 28 | 29 | struct Sm90: Arch<900> { 30 | static constexpr int value = 900; 31 | }; 32 | 33 | inline bool is_arch_compatible(int karch, int darch) 34 | { 35 | switch (karch) { 36 | case 700: 37 | return Sm70::is_compatible(darch); 38 | case 750: 39 | return Sm75::is_compatible(darch); 40 | case 800: 41 | return Sm80::is_compatible(darch); 42 | case 900: 43 | return Sm90::is_compatible(darch); 44 | default: 45 | return false; 46 | } 47 | } 48 | 49 | } // namespace turbomind::gemm 50 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/predicate.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | 8 | namespace turbomind::gemm { 9 | 10 | template 11 | struct Predicate { 12 | 13 | static constexpr int kSizeC = AlignedC ? 1 : C; 14 | 15 | static_assert(S * kSizeC <= 32); 16 | 17 | static constexpr bool is_active = true; 18 | 19 | uint32_t pred_{}; 20 | 21 | __device__ int operator()(int s, int c) const 22 | { 23 | return (pred_ & (1 << (s * kSizeC + c))) != 0; 24 | } 25 | 26 | __device__ void set(int s, int c) 27 | { 28 | pred_ |= (1 << (s * kSizeC + c)); 29 | } 30 | 31 | __device__ void clear() 32 | { 33 | pred_ = 0; 34 | } 35 | }; 36 | 37 | template 38 | struct Predicate { 39 | 40 | static constexpr bool is_active = false; 41 | 42 | __device__ constexpr std::integral_constant operator()(int, int) const 43 | { 44 | return {}; 45 | } 46 | 47 | __device__ void set(int, int) {} 48 | 49 | __device__ void clear() 50 | { 51 | // pred_ = 0; 52 | } 53 | }; 54 | 55 | } // namespace turbomind::gemm 56 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/registry.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/kernel_impl.h" 6 | #include 7 | 8 | namespace turbomind::gemm { 9 | 10 | class Registry { 11 | public: 12 | explicit Registry(std::shared_ptr device_prop); 13 | 14 | template 15 | [[maybe_unused]] bool Add() 16 | { 17 | return Add(std::make_unique>()); 18 | } 19 | 20 | [[nodiscard]] const std::vector& kernels() const 21 | { 22 | return ptrs_; 23 | } 24 | 25 | private: 26 | bool Add(std::unique_ptr kernel); 27 | 28 | void f16_u4g128_f16_tnt_sm70_s884(); 29 | void f16_u4g128_f16_tnt_sm75_simt(); 30 | void f16_u4g128_f16_tnt_sm75_s16816(); 31 | void f16_u4g128_f16_tnt_sm80_s16816(); 32 | void f16_u4g128_f16_tnt_sm90_s16816(); 33 | 34 | void u4g128_f16_f16_nnn_sm80_s16816(); 35 | 36 | private: 37 | std::shared_ptr device_prop_; 38 | int arch_; 39 | std::vector> kernels_; 40 | std::vector ptrs_; 41 | }; 42 | 43 | } // namespace turbomind::gemm 44 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/measurer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/desc.h" 4 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace turbomind::gemm { 11 | 12 | struct Measurement { 13 | cudaError_t status; 14 | int sample_count; 15 | float mean; 16 | float variance; 17 | }; 18 | 19 | using Launcher = std::function; 20 | 21 | class Measurer { 22 | public: 23 | Measurer(std::unique_ptr stop_criterion); 24 | 25 | ~Measurer(); 26 | 27 | std::vector 28 | Measure(const std::vector& specs, const Launcher& launcher, cudaStream_t stream); 29 | 30 | private: 31 | Measurement MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); 32 | 33 | std::pair ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); 34 | 35 | private: 36 | cudaEvent_t ev_beg_; 37 | cudaEvent_t ev_end_; 38 | std::unique_ptr stop_criterion_; 39 | }; 40 | 41 | } // namespace turbomind::gemm 42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[Feature] " 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | We strongly appreciate you creating a PR to implement this feature [here](https://github.com/InternLM/lmdeploy/pulls)! 10 | If you need our help, please fill in as much of the following form as you're able to. 11 | 12 | **The less clear the description, the longer it will take to solve it.** 13 | - type: textarea 14 | attributes: 15 | label: Motivation 16 | description: | 17 | A clear and concise description of the motivation of the feature. 18 | Ex1. It is inconvenient when \[....\]. 19 | validations: 20 | required: true 21 | - type: textarea 22 | attributes: 23 | label: Related resources 24 | description: | 25 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 26 | - type: textarea 27 | attributes: 28 | label: Additional context 29 | description: | 30 | Add any other context or screenshots about the feature request here. 31 | If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. 32 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/sub_byte_ptr.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/data_type.h" 6 | 7 | namespace turbomind { 8 | 9 | template 10 | struct SubBytePtr { 11 | 12 | constexpr SubBytePtr() = default; 13 | 14 | constexpr __host__ __device__ explicit SubBytePtr(T* ptr): ptr_((char*)ptr) {} 15 | 16 | constexpr __host__ __device__ SubBytePtr(char* ptr): ptr_(ptr) {} 17 | 18 | __device__ T& operator[](int i) 19 | { 20 | return *reinterpret_cast(ptr_ + i * bitsof / bitsof); 21 | } 22 | 23 | friend __device__ SubBytePtr operator+(const SubBytePtr a, int n) 24 | { 25 | return SubBytePtr{a.ptr_ + n * bitsof / bitsof}; 26 | } 27 | 28 | friend __device__ SubBytePtr operator+(int n, const SubBytePtr a) 29 | { 30 | return a + n; 31 | } 32 | 33 | friend __device__ bool operator==(const SubBytePtr& a, const SubBytePtr& b) 34 | { 35 | return a.ptr_ == b.ptr_; 36 | } 37 | 38 | __device__ explicit operator T*() const 39 | { 40 | return (T*)ptr_; 41 | } 42 | 43 | char* ptr_; 44 | }; 45 | 46 | template 47 | struct get_pointer_type_t % 8 != 0>> { 48 | using type = SubBytePtr; 49 | }; 50 | 51 | } // namespace turbomind 52 | -------------------------------------------------------------------------------- /example/generate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from modeling_turbomind import TurbomindForCausalLM 3 | from transformers import AutoTokenizer, TextStreamer 4 | 5 | quant_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ' 6 | 7 | # Load model 8 | model = TurbomindForCausalLM.from_quantized(quant_path) 9 | tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True) 10 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 11 | 12 | prompt = "You're standing on the surface of the Earth. "\ 13 | 'You walk one mile south, one mile west and one mile north. '\ 14 | 'You end up exactly where you started. Where are you?' 15 | 16 | chat = [ 17 | { 18 | 'role': 'system', 19 | 'content': 'You are a concise assistant that helps answer questions.' 20 | }, 21 | { 22 | 'role': 'user', 23 | 'content': prompt 24 | }, 25 | ] 26 | 27 | terminators = [ 28 | tokenizer.eos_token_id, 29 | tokenizer.convert_tokens_to_ids('<|eot_id|>') 30 | ] 31 | 32 | tokens = tokenizer.apply_chat_template(chat, return_tensors='pt') 33 | tokens = tokens.to(torch.device('cuda')) 34 | 35 | # Generate output 36 | generation_output = model.generate(tokens, 37 | streamer=streamer, 38 | max_new_tokens=64, 39 | eos_token_id=terminators) 40 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/test_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/utils/macro.h" 6 | #include 7 | #include 8 | #include 9 | 10 | namespace turbomind { 11 | 12 | template 13 | void Compare(const T* src, 14 | const T* ref, 15 | size_t stride, 16 | int dims, 17 | int bsz, 18 | bool show = false, 19 | float rtol = 1e-2, 20 | float atol = 1e-4); 21 | 22 | template 23 | std::vector 24 | FastCompare(const T* src, const T* ref, int dims, int bsz, cudaStream_t stream, float rtol = 1e-2, float atol = 1e-4); 25 | 26 | void LoadBinary(const std::string& path, size_t size, void* dst); 27 | 28 | class RNG { 29 | public: 30 | RNG(); 31 | ~RNG(); 32 | void GenerateUInt(uint* out, size_t count); 33 | 34 | template 35 | void GenerateUniform(T* out, size_t count, float scale = 1.f, float shift = 0.f); 36 | 37 | template 38 | void GenerateNormal(T* out, size_t count, float scale = 1.f, float shift = 0.f); 39 | 40 | cudaStream_t stream() const; 41 | 42 | void set_stream(cudaStream_t stream); 43 | 44 | private: 45 | struct Impl; 46 | std::unique_ptr impl_; 47 | }; 48 | 49 | } // namespace turbomind 50 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/cast.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/core/data_type.h" 4 | #include 5 | 6 | namespace turbomind { 7 | 8 | void extend_to_u8(uint8_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {}); 9 | 10 | void extend_to_u16(uint16_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {}); 11 | 12 | void compact_to_u4(uint4_t* dst, const uint8_t* src, size_t n, cudaStream_t st = {}); 13 | 14 | void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st = {}); 15 | 16 | void fuse_scales_and_zeros(half* fused, const half* scales, half* zeros, size_t n, cudaStream_t st = {}); 17 | 18 | template 19 | void interleave_output_dims_impl(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st); 20 | 21 | template 22 | inline void interleave_output_dims(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st) 23 | { 24 | auto dispatch = [&](auto u) { 25 | using U = decltype(u); 26 | return interleave_output_dims_impl((U*)fused, (const U*)a, (const U*)b, m, k, st); 27 | }; 28 | if constexpr (bitsof == 8) { 29 | return dispatch(uint8_t{}); 30 | } 31 | else if constexpr (bitsof == 16) { 32 | return dispatch(uint16_t{}); 33 | } 34 | else if constexpr (bitsof == 32) { 35 | return dispatch(uint32_t{}); 36 | } 37 | } 38 | 39 | } // namespace turbomind 40 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/sync.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | __inline__ __device__ int sem_fetch(int* lock, bool pred) 8 | { 9 | int state{}; 10 | if (pred) { 11 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 12 | asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); 13 | #else 14 | asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); 15 | #endif 16 | } 17 | return state; 18 | } 19 | 20 | __inline__ __device__ void sem_wait(int* lock, int status, bool pred) 21 | { 22 | int state = 0; 23 | while (__syncthreads_and(state != status)) { 24 | state = sem_fetch(lock, pred); 25 | } 26 | 27 | __syncthreads(); // memory fence 28 | } 29 | 30 | __inline__ __device__ void sem_wait_many(int* lock, int count, bool pred) 31 | { 32 | int state = 0; 33 | while (__syncthreads_count(state) != count) { 34 | state = sem_fetch(lock, pred); 35 | } 36 | 37 | __syncthreads(); // memory fence 38 | } 39 | 40 | __inline__ __device__ void sem_post(int* lock, int status, bool pred) 41 | { 42 | __syncthreads(); // memory fence 43 | 44 | if (pred) { 45 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 46 | asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); 47 | #else 48 | asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); 49 | #endif 50 | } 51 | } 52 | 53 | } // namespace turbomind 54 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/registry.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch.h" 4 | #include "src/turbomind/kernels/gemm/registry.h" 5 | 6 | namespace turbomind::gemm { 7 | 8 | Registry::Registry(std::shared_ptr device_prop): 9 | device_prop_{std::move(device_prop)}, arch_{device_prop_->major * 100 + device_prop_->minor * 10} 10 | { 11 | f16_u4g128_f16_tnt_sm70_s884(); 12 | f16_u4g128_f16_tnt_sm75_simt(); 13 | f16_u4g128_f16_tnt_sm75_s16816(); 14 | f16_u4g128_f16_tnt_sm80_s16816(); 15 | f16_u4g128_f16_tnt_sm90_s16816(); 16 | 17 | u4g128_f16_f16_nnn_sm80_s16816(); 18 | } 19 | 20 | bool Registry::Add(std::unique_ptr kernel) 21 | { 22 | if (!is_arch_compatible(kernel->arch(), arch_)) { 23 | return false; 24 | } 25 | if ((int)device_prop_->sharedMemPerBlockOptin < kernel->smem_size()) { 26 | return false; 27 | } 28 | // std::cout << "register: " << kernel->name() // 29 | // << ", shared: " << (kernel->smem_size() >> 10) << " KB" // 30 | // << ", regs: " << kernel->desc().attr.numRegs // 31 | // << ", local: " << (float)kernel->desc().attr.localSizeBytes << " bytes" // 32 | // << ", max_active_ctas: " << kernel->desc().max_active_ctas << " \n"; 33 | 34 | kernels_.push_back(std::move(kernel)); 35 | ptrs_.push_back(kernels_.back().get()); 36 | return true; 37 | } 38 | 39 | } // namespace turbomind::gemm 40 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) 6 | #define TURBOMIND_ARCH_SM70 1 7 | #else 8 | #define TURBOMIND_ARCH_SM70 0 9 | #endif 10 | 11 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) 12 | #define TURBOMIND_ARCH_SM75 1 13 | #else 14 | #define TURBOMIND_ARCH_SM75 0 15 | #endif 16 | 17 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) 18 | #define TURBOMIND_ARCH_SM80 1 19 | #else 20 | #define TURBOMIND_ARCH_SM80 0 21 | #endif 22 | 23 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) 24 | #define TURBOMIND_ARCH_SM90 1 25 | #else 26 | #define TURBOMIND_ARCH_SM90 0 27 | #endif 28 | 29 | #if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) 30 | #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) 31 | #define PRAGMA_UNROLL _Pragma("unroll") 32 | #define PRAGMA_NO_UNROLL _Pragma("unroll 1") 33 | #else 34 | #define PRAGMA_UNROLL #pragma unroll 35 | #define PRAGMA_NO_UNROLL #pragma unroll 1 36 | #endif 37 | #else 38 | #define PRAGMA_UNROLL 39 | #define PRAGMA_NO_UNROLL 40 | #endif 41 | 42 | #if defined(__CUDACC__) 43 | #define TM_HOST_DEVICE __forceinline__ __host__ __device__ 44 | #define TM_DEVICE __forceinline__ __device__ 45 | #define TM_HOST __forceinline__ __host__ 46 | #else 47 | #define TM_HOST_DEVICE inline 48 | #define TM_DEVICE inline 49 | #define TM_HOST inline 50 | #endif 51 | 52 | constexpr int WARP_SIZE = 32; 53 | 54 | #ifndef uint 55 | using uint = unsigned int; 56 | #endif 57 | 58 | #ifndef ushort 59 | using ushort = unsigned short int; 60 | #endif 61 | -------------------------------------------------------------------------------- /turbomind/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | from typing import List 4 | 5 | import torch 6 | 7 | 8 | def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: 9 | assert x.dtype == torch.int32 10 | xs = [] 11 | for _ in range(8): 12 | xs.append((x & 15).to(dtype)) 13 | x = x >> 4 14 | return xs 15 | 16 | 17 | def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor: 18 | xs = get_u4_slices(x, torch.uint8) 19 | order = [0, 4, 1, 5, 2, 6, 3, 7] 20 | ys = [xs[i] for i in order] 21 | return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1) 22 | 23 | 24 | def process_awq_gemm(x: torch.Tensor, kind: str): 25 | if x.dtype == torch.int32: 26 | x = unpack_awq_gemm(x) 27 | if kind in ['qweight', 'qzeros', 'scales']: 28 | x = x.t() 29 | return x 30 | 31 | 32 | def process_gptq(x: torch.Tensor, kind: str): 33 | if x.dtype == torch.int32: 34 | xs = get_u4_slices(x, torch.uint8) 35 | if kind == 'qweight': # (k/8,n) 36 | x = torch.stack(xs, dim=1).view(-1, x.size(-1)) 37 | else: # 'qzeros' (k/g,n/8) 38 | x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1 39 | if kind in ['qweight', 'qzeros', 'scales']: 40 | x = x.t() 41 | return x 42 | 43 | 44 | def pack_u4_row(x: torch.Tensor) -> torch.Tensor: 45 | assert x.dtype == torch.uint8 46 | xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1) 47 | a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device) 48 | for t in reversed(xs): 49 | a = (a << 4) | t 50 | return a.squeeze(dim=-1) 51 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/data_type.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #if ENABLE_BF16 10 | #include 11 | #endif 12 | 13 | namespace turbomind { 14 | 15 | struct uint1_t { 16 | }; 17 | struct uint2_t { 18 | }; 19 | struct uint3_t { 20 | }; 21 | struct uint4_t { 22 | }; 23 | struct uint5_t { 24 | }; 25 | struct uint6_t { 26 | }; 27 | 28 | template 29 | struct bitsof_t: std::integral_constant { 30 | }; 31 | 32 | template<> 33 | struct bitsof_t: std::integral_constant { 34 | }; 35 | 36 | template<> 37 | struct bitsof_t: std::integral_constant { 38 | }; 39 | 40 | template<> 41 | struct bitsof_t: std::integral_constant { 42 | }; // 2 + 1 43 | 44 | template<> 45 | struct bitsof_t: std::integral_constant { 46 | }; 47 | 48 | template<> 49 | struct bitsof_t: std::integral_constant { 50 | }; // 4 + 1 51 | 52 | template<> 53 | struct bitsof_t: std::integral_constant { 54 | }; // 4 + 2 55 | 56 | template 57 | inline constexpr bitsof_t bitsof{}; 58 | 59 | struct fp8 { 60 | char v; 61 | }; 62 | struct fp8_e4m3: fp8 { 63 | }; 64 | struct fp8_e5m2: fp8 { 65 | }; 66 | 67 | namespace detail { 68 | 69 | struct __uint4_t { 70 | uint32_t x; 71 | }; 72 | 73 | } // namespace detail 74 | 75 | template 76 | struct get_pointer_type_t { 77 | using type = T*; 78 | }; 79 | 80 | template 81 | using get_pointer_type = typename get_pointer_type_t::type; 82 | 83 | } // namespace turbomind 84 | -------------------------------------------------------------------------------- /.github/workflows/windows-x64-gpu.yml: -------------------------------------------------------------------------------- 1 | name: windows-x64-gpu 2 | on: 3 | push: 4 | paths: 5 | - '.github/workflows/windows-x64-gpu.yml' 6 | - 'src/**' 7 | - 'CMakeLists.txt' 8 | pull_request: 9 | paths: 10 | - '.github/workflows/windows-x64-gpu.yml' 11 | - 'src/**' 12 | - 'CMakeLists.txt' 13 | concurrency: 14 | group: windows-x64-gpu-${{ github.ref }} 15 | cancel-in-progress: true 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | build: 21 | strategy: 22 | matrix: 23 | cudaver: [11.8.0, 12.1.0] 24 | name: cuda-${{ matrix.cudaver }} 25 | runs-on: windows-latest 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v3 29 | - name: Set up python 30 | uses: actions/setup-python@v4 31 | with: 32 | python-version: '3.8' 33 | - name: Install python packages 34 | run: | 35 | pip install -r requirements/build.txt 36 | pip install wheel 37 | - name: Setup CUDA Toolkit 38 | id: cuda-toolkit 39 | shell: pwsh 40 | run: ./builder/windows/setup_cuda.ps1 41 | env: 42 | INPUT_CUDA_VERSION: ${{ matrix.cudaver }} 43 | - name: Build wheel 44 | run: | 45 | $env:BUILD_TEST="ON" 46 | mkdir build 47 | cd build 48 | ..\builder\windows\generate.ps1 49 | cmake --build . --config Release -- /m /v:q 50 | if (-Not $?) { 51 | echo "build failed" 52 | exit 1 53 | } 54 | cmake --install . --config Release 55 | cd .. 56 | rm build -Force -Recurse 57 | python setup.py bdist_wheel -d build/wheel 58 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/quantization.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/test/quantization_impl.h" 4 | 5 | namespace turbomind::gemm { 6 | 7 | template void Quantize(const thrust::universal_vector& x, 8 | int m, 9 | int k, 10 | Order order, 11 | int group_size, 12 | thrust::universal_vector& x_p, // pseudo-quantized 13 | thrust::universal_vector& x_q, // quantized ushort 14 | thrust::universal_vector& x_u, // scales & zeros (always m-major) 15 | cudaStream_t stream); 16 | 17 | template void Quantize(const thrust::universal_vector& x, 18 | int m, 19 | int k, 20 | Order order, 21 | int group_size, 22 | thrust::universal_vector& x_p, // pseudo-quantized 23 | thrust::universal_vector& x_q, // quantized ushort 24 | thrust::universal_vector& x_u, // scales & zeros (always m-major) 25 | cudaStream_t stream); 26 | 27 | } // namespace turbomind::gemm 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/flake8 3 | rev: 4.0.1 4 | hooks: 5 | - id: flake8 6 | - repo: https://github.com/PyCQA/isort 7 | rev: 5.11.5 8 | hooks: 9 | - id: isort 10 | - repo: https://github.com/pre-commit/mirrors-yapf 11 | rev: v0.32.0 12 | hooks: 13 | - id: yapf 14 | name: yapf 15 | description: 'Formatter for Python code' 16 | entry: yapf 17 | language: python 18 | args: ['-i', '--style={based_on_style: pep8, column_limit: 79}'] 19 | 20 | - repo: https://github.com/pre-commit/pre-commit-hooks 21 | rev: v4.2.0 22 | hooks: 23 | - id: trailing-whitespace 24 | - id: check-yaml 25 | - id: end-of-file-fixer 26 | - id: requirements-txt-fixer 27 | - id: double-quote-string-fixer 28 | - id: check-merge-conflict 29 | - id: fix-encoding-pragma 30 | args: ["--remove"] 31 | - id: mixed-line-ending 32 | args: ["--fix=lf"] 33 | - repo: https://github.com/executablebooks/mdformat 34 | rev: 0.7.9 35 | hooks: 36 | - id: mdformat 37 | args: ["--number"] 38 | additional_dependencies: 39 | - mdformat-openmmlab 40 | - mdformat_frontmatter 41 | - linkify-it-py 42 | - repo: https://github.com/codespell-project/codespell 43 | rev: v2.1.0 44 | hooks: 45 | - id: codespell 46 | args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"] 47 | 48 | - repo: https://github.com/myint/docformatter 49 | rev: v1.4 50 | hooks: 51 | - id: docformatter 52 | args: ["--in-place", "--wrap-descriptions", "79"] 53 | 54 | - repo: https://github.com/open-mmlab/pre-commit-hooks 55 | rev: v0.2.0 56 | hooks: 57 | - id: check-copyright 58 | args: ["turbomind"] 59 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch/config_simt.h" 4 | #include "src/turbomind/kernels/gemm/operand.h" 5 | #include "src/turbomind/kernels/gemm/registry.h" 6 | #include "src/turbomind/kernels/gemm/transform.h" 7 | #include "src/turbomind/kernels/gemm/types.h" 8 | 9 | namespace turbomind::gemm { 10 | 11 | void Registry::f16_u4g128_f16_tnt_sm75_simt() 12 | { 13 | using namespace simt; 14 | 15 | using S = cache_policy::Stream; 16 | using D = cache_policy::Default; 17 | 18 | { // quant B 19 | using Operand_A = typename GetOperand::Operand; 20 | using Operand_B_U4 = typename GetOperand::Operand; 21 | using Operand_V = typename GetOperand::Operand; 22 | 23 | using C = Sm75_Simt; 31 | 32 | // clang-format off 33 | Add>(); 34 | Add>(); 35 | Add>(); 36 | Add>(); 37 | Add>(); 38 | Add>(); 39 | Add>(); 40 | Add>(); 41 | // clang-format on 42 | } 43 | } 44 | 45 | } // namespace turbomind::gemm 46 | -------------------------------------------------------------------------------- /example/module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | 5 | def get_named_linears(module): 6 | return { 7 | name: m 8 | for name, m in module.named_modules() if isinstance(m, nn.Linear) 9 | } 10 | 11 | 12 | def get_op_by_name(module, op_name): 13 | # get the op by its name relative to the module 14 | for name, m in module.named_modules(): 15 | if name == op_name: 16 | return m 17 | raise ValueError(f'Cannot find op {op_name} in module {module}') 18 | 19 | 20 | def set_op_by_name(layer, name, new_module): 21 | levels = name.split('.') 22 | if len(levels) > 1: 23 | mod_ = layer 24 | for l_idx in range(len(levels) - 1): 25 | if levels[l_idx].isdigit(): 26 | mod_ = mod_[int(levels[l_idx])] 27 | else: 28 | mod_ = getattr(mod_, levels[l_idx]) 29 | setattr(mod_, levels[-1], new_module) 30 | else: 31 | setattr(layer, name, new_module) 32 | 33 | 34 | def get_op_name(module, op): 35 | # get the name of the op relative to the module 36 | for name, m in module.named_modules(): 37 | if m is op: 38 | return name 39 | raise ValueError(f'Cannot find op {op} in module {module}') 40 | 41 | 42 | def append_str_prefix(x, prefix): 43 | if isinstance(x, str): 44 | return prefix + x 45 | elif isinstance(x, tuple): 46 | return tuple([append_str_prefix(y, prefix) for y in x]) 47 | elif isinstance(x, list): 48 | return [append_str_prefix(y, prefix) for y in x] 49 | else: 50 | return x 51 | 52 | 53 | def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): 54 | if modules_to_not_convert is None: 55 | return linear_layers 56 | 57 | filtered_layers = {} 58 | for name, linear_layer in linear_layers.items(): 59 | if not any(key in name for key in modules_to_not_convert): 60 | filtered_layers[name] = linear_layer 61 | return filtered_layers 62 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/mma_simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include "src/turbomind/kernels/gemm/desc.h" 8 | #include "src/turbomind/kernels/gemm/simt.h" 9 | 10 | namespace turbomind::gemm { 11 | 12 | template 13 | struct MMA_SIMT { 14 | static constexpr int M = simt::OP_M; 15 | static constexpr int N = simt::OP_N; 16 | static constexpr int K = simt::OP_K; 17 | 18 | static constexpr int kThreadCount = 32; 19 | 20 | static constexpr auto kOpClass = OpClass::kSIMT; 21 | 22 | using FragA = Array; 23 | using FragB = Array; 24 | using FragC = Array; 25 | 26 | using OffsetC = Array; 27 | using FragC_ = FragC[1]; 28 | 29 | __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) 30 | { 31 | PRAGMA_UNROLL 32 | for (int k = 0; k < K; ++k) { 33 | d[0] = c[0] + float(a[k]) * float(b[k]); 34 | } 35 | 36 | // PRAGMA_UNROLL 37 | // for (int k = 0; k < K; ++k) { 38 | // d[0] = c[0] + float(a[k] * b[k]); 39 | // } 40 | 41 | // T acc{}; 42 | // PRAGMA_UNROLL 43 | // for (int k = 0; k < K; ++k) { 44 | // acc += a[k] * b[k]; 45 | // } 46 | // d[0] = c[0] + float(acc); 47 | } 48 | 49 | __device__ static constexpr OffsetC static_offset_C() 50 | { 51 | return {}; 52 | } 53 | 54 | __device__ static int2 thread_offset_C() // -> (m,n) 55 | { 56 | const int lane_id = threadIdx.x % WARP_SIZE; 57 | return {lane_id / N, lane_id % N}; 58 | } 59 | 60 | __device__ static void ReshapeC(const FragC& c, FragC_& c_) 61 | { 62 | c_[0] = c; 63 | } 64 | 65 | __device__ static int get_group_id(int thread_idx) 66 | { 67 | return thread_idx / WARP_SIZE; 68 | } 69 | }; 70 | 71 | } // namespace turbomind::gemm 72 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch/config_sm75_s16816.h" 4 | #include "src/turbomind/kernels/gemm/operand.h" 5 | #include "src/turbomind/kernels/gemm/registry.h" 6 | #include "src/turbomind/kernels/gemm/transform.h" 7 | #include "src/turbomind/kernels/gemm/types.h" 8 | 9 | namespace turbomind::gemm { 10 | 11 | void Registry::f16_u4g128_f16_tnt_sm75_s16816() 12 | { 13 | using namespace sm75_s16816; 14 | 15 | { // fp x u4 16 | using C = Sm75_s16816, 17 | Transform_Default, 18 | VoidOperand, 19 | Operand_B_Pack, 20 | Transform_HMMA_16816<1, 0>, 21 | Operand_UV_Pack, 22 | kRowMajor, 23 | half>; 24 | 25 | using S = cache_policy::Stream; 26 | using D = cache_policy::Default; 27 | 28 | // clang-format off 29 | Add>(); 30 | Add>(); 31 | Add>(); 32 | Add>(); 33 | Add>(); 34 | Add>(); 35 | Add>(); 36 | Add>(); 37 | Add>(); 38 | Add>(); 39 | Add>(); 40 | // clang-format on 41 | } 42 | } 43 | 44 | } // namespace turbomind::gemm 45 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/format.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | 7 | namespace turbomind::gemm { 8 | 9 | template 10 | struct Converter { 11 | }; 12 | 13 | template 14 | struct Converter { 15 | template 16 | __device__ Array operator()(Array x) 17 | { 18 | return x; 19 | } 20 | }; 21 | 22 | template<> 23 | struct Converter { 24 | 25 | static __device__ Array pack(const Array& vi) 26 | { 27 | Array ui = (Array&)vi; 28 | 29 | ui[0] |= (ui[0] >> 12); 30 | ui[1] |= (ui[1] >> 12); 31 | 32 | // 7 6 5 4 3 2 1 0 33 | // _7_67564_3_23120 34 | uint32_t uo = __byte_perm(ui[0], ui[1], 0x5140); 35 | 36 | return (Array&)uo; 37 | } 38 | 39 | template 40 | __device__ Array operator()(const Array& x) 41 | { 42 | static_assert(sizeof(U) == 2); 43 | auto& vi = (const Array&)x; 44 | Array tmp; 45 | PRAGMA_UNROLL 46 | for (int i = 0; i < N; ++i) { 47 | tmp[i] = static_cast(vi[i]); 48 | } 49 | Array vo; 50 | PRAGMA_UNROLL 51 | for (int i = 0; i < N; i += 8) { 52 | (Array&)vo[i] = pack((Array&)tmp[i]); 53 | } 54 | return vo; 55 | } 56 | }; 57 | 58 | template<> 59 | struct Converter { 60 | template 61 | __device__ Array operator()(const Array& x) 62 | { 63 | // static_assert(sizeof(U) == 2); 64 | auto& vi = (const Array&)x; 65 | Array vo; 66 | PRAGMA_UNROLL 67 | for (int i = 0; i < N; ++i) { 68 | vo[i] = static_cast(vi[i]); 69 | } 70 | return vo; 71 | } 72 | }; 73 | 74 | } // namespace turbomind::gemm 75 | -------------------------------------------------------------------------------- /.github/workflows/linux-x64-gpu.yml: -------------------------------------------------------------------------------- 1 | name: linux-x64-gpu 2 | on: 3 | push: 4 | paths: 5 | - '.github/workflows/linux-x64-gpu.yml' 6 | - 'src/**' 7 | - 'CMakeLists.txt' 8 | pull_request: 9 | paths: 10 | - '.github/workflows/linux-x64-gpu.yml' 11 | - 'src/**' 12 | - 'CMakeLists.txt' 13 | concurrency: 14 | group: linux-x64-gpu-${{ github.ref }} 15 | cancel-in-progress: true 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | build: 21 | strategy: 22 | matrix: 23 | cudaver: [11.8, 12.1] 24 | name: cuda-${{ matrix.cudaver }} 25 | runs-on: ubuntu-latest 26 | steps: 27 | - name: Free disk space 28 | uses: jlumbroso/free-disk-space@main 29 | with: 30 | # This might remove tools that are actually needed, if set to "true" but frees about 6 GB 31 | tool-cache: false 32 | docker-images: false 33 | # All of these default to true, but feel free to set to "false" if necessary for your workflow 34 | android: true 35 | dotnet: true 36 | haskell: true 37 | large-packages: true 38 | swap-storage: false 39 | - name: Checkout repository 40 | uses: actions/checkout@v3 41 | - name: Build 42 | uses: addnab/docker-run-action@v3 43 | with: 44 | image: openmmlab/lmdeploy-builder:cuda${{ matrix.cudaver }} 45 | options: -v ${{ github.workspace }}:/work 46 | run: | 47 | cd /work 48 | source /opt/conda/bin/activate 49 | conda activate py38 50 | mkdir build && cd build 51 | cmake .. \ 52 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 53 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ 54 | -DCMAKE_INSTALL_PREFIX=./install \ 55 | -DCMAKE_CUDA_FLAGS="-lineinfo" \ 56 | -DUSE_NVTX=ON \ 57 | -DBUILD_TEST=ON 58 | make -j$(nproc) && make install 59 | cd .. 60 | rm -rf build 61 | python setup.py bdist_wheel --plat-name manylinux2014_x86_64 -d /tmp 62 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | #include 7 | #include 8 | #include 9 | 10 | namespace turbomind::gemm { 11 | 12 | class Gemm { 13 | public: 14 | static constexpr size_t kBarriersSize = 1 << 20; 15 | static constexpr size_t kPartialsSize = 32 << 20; 16 | 17 | Gemm(); 18 | 19 | ~Gemm(); 20 | 21 | Gemm(Gemm&& other); 22 | Gemm& operator =(Gemm&& other); 23 | Gemm(const Gemm&) = delete; 24 | Gemm& operator=(const Gemm&) = delete; 25 | 26 | [[nodiscard]] int Run(const Operation& operation, 27 | float alpha, 28 | const void* A, 29 | const MatrixLayout& Adesc, 30 | const void* U, 31 | const MatrixLayout& Udesc, 32 | const void* B, 33 | const MatrixLayout& Bdesc, 34 | const void* V, 35 | const MatrixLayout& Vdesc, 36 | float beta, 37 | const void* C, 38 | const MatrixLayout& Cdesc, 39 | void* D, 40 | const MatrixLayout& Ddesc, 41 | const Workspace& workspace, 42 | cudaStream_t stream); 43 | 44 | [[maybe_unused]] int Export(std::ostream& os); 45 | 46 | [[maybe_unused]] int Import(std::istream& is); 47 | 48 | [[nodiscard]] std::vector GetTuningSeq() const; 49 | 50 | private: 51 | struct Impl; 52 | std::unique_ptr impl_; 53 | }; 54 | 55 | [[nodiscard]] int 56 | Convert(const void* S, const MatrixLayout& Sdesc, void* D, const MatrixLayout& Ddesc, cudaStream_t stream); 57 | 58 | std::tuple get_weight_and_scales_layout(int sm, bool force_simt); 59 | 60 | } // namespace turbomind::gemm 61 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | AccessModifierOffset: -4 3 | AlignAfterOpenBracket: Align 4 | AllowShortEnumsOnASingleLine: false 5 | AlignConsecutiveAssignments: true 6 | AlignConsecutiveDeclarations: true 7 | AlignEscapedNewlines: Right 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowAllArgumentsOnNextLine: true 12 | AllowShortBlocksOnASingleLine: Empty 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: Never 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: false 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: false 21 | BinPackParameters: false 22 | BreakBeforeBinaryOperators: NonAssignment 23 | BreakBeforeBraces: Stroustrup 24 | BreakBeforeTernaryOperators: false 25 | BreakConstructorInitializers: AfterColon 26 | BreakInheritanceList: AfterColon 27 | BreakStringLiterals: false 28 | ColumnLimit: 120 29 | CompactNamespaces: false 30 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 31 | ConstructorInitializerIndentWidth: 4 32 | ContinuationIndentWidth: 4 33 | Cpp11BracedListStyle: true 34 | DerivePointerAlignment: false 35 | FixNamespaceComments: true 36 | IndentCaseLabels: true 37 | IndentPPDirectives: None 38 | IndentWidth: 4 39 | IndentWrappedFunctionNames: false 40 | KeepEmptyLinesAtTheStartOfBlocks: true 41 | MaxEmptyLinesToKeep: 1 42 | NamespaceIndentation: None 43 | PointerAlignment: Left 44 | ReflowComments: true 45 | SortIncludes: true 46 | SortUsingDeclarations: false 47 | SpaceAfterCStyleCast: false 48 | SpaceAfterTemplateKeyword: false 49 | SpaceBeforeAssignmentOperators: true 50 | SpaceBeforeCtorInitializerColon: false 51 | SpaceBeforeInheritanceColon: false 52 | SpaceBeforeParens: ControlStatements 53 | SpaceInEmptyParentheses: false 54 | SpacesBeforeTrailingComments: 2 55 | SpacesInAngles: false 56 | SpacesInCStyleCastParentheses: false 57 | SpacesInContainerLiterals: false 58 | SpacesInParentheses: false 59 | SpacesInSquareBrackets: false 60 | Standard: c++17 61 | TabWidth: 4 62 | UseTab: Never 63 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/operand.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/layout.h" 6 | #include "src/turbomind/kernels/core/meta.h" 7 | #include "src/turbomind/kernels/gemm/iterator.h" 8 | #include "src/turbomind/kernels/gemm/smem_copy.h" 9 | #include "src/turbomind/kernels/gemm/types.h" 10 | #include "src/turbomind/kernels/gemm/utils.h" 11 | 12 | namespace turbomind::gemm { 13 | 14 | struct VoidOperand { 15 | using Dtype = int; 16 | 17 | static constexpr Pack kPack = 0; 18 | static constexpr Order kOrder = Order::kColMajor; 19 | 20 | struct GetSmemLayout { 21 | static constexpr SmemLayoutV2<1, 1> apply(...) 22 | { 23 | return {}; 24 | } 25 | }; 26 | 27 | using SmemCopyAtom = VoidSmemCopyAtom; 28 | 29 | struct GetGmemIter { 30 | static constexpr auto apply(...) 31 | { 32 | return type_c; 33 | } 34 | }; 35 | }; 36 | 37 | /// TODO: fix AlignC, AlignS 38 | /// TODO: fix GroupSize 39 | template 40 | struct MakeOperand { 41 | 42 | using Dtype = typename Operand::Dtype; 43 | 44 | static constexpr Pack kPack = Operand::kPack; 45 | static constexpr Order kOrder = Operand::kOrder; 46 | static constexpr int kGroupSize = GroupSize; 47 | 48 | static constexpr int2 kPackMK = Packing_v2::apply({M, ceil_div(K, kGroupSize)}); 49 | 50 | static constexpr pair kShapeMK{}; 51 | 52 | using SmemLayout = decltype(Operand::GetSmemLayout::apply(kShapeMK)); 53 | using SmemAccessor = SmemAccessorV2; 54 | 55 | using GmemIter = typename decltype(Operand::GetGmemIter::apply( 56 | type_c, type_c, type_c, kShapeMK, constant{}))::type; 57 | 58 | using SmemCopyAtom = typename Operand::SmemCopyAtom; 59 | }; 60 | 61 | // CPO for getting specific operand templates 62 | template 63 | struct GetOperand: std::false_type { 64 | }; 65 | 66 | } // namespace turbomind::gemm 67 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug report 2 | description: Create a report to help us reproduce and fix the bug 3 | title: "[Bug] " 4 | labels: ['Bug'] 5 | 6 | body: 7 | - type: checkboxes 8 | attributes: 9 | label: Checklist 10 | options: 11 | - label: 1. I have searched related issues but cannot get the expected help. 12 | - label: 2. The bug has not been fixed in the latest version. 13 | - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback. 14 | - type: textarea 15 | attributes: 16 | label: Describe the bug 17 | description: A clear and concise description of what the bug is. 18 | validations: 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: Reproduction 23 | description: | 24 | 1. What command or script did you run? 25 | placeholder: | 26 | A placeholder for the command. 27 | validations: 28 | required: true 29 | - type: textarea 30 | attributes: 31 | label: Environment 32 | description: | 33 | 1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here. 34 | 2. You may add addition that may be helpful for locating the problem, such as 35 | - Which **model** are you using? 36 | - How you installed PyTorch \[e.g., pip, conda, source\] 37 | - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) 38 | placeholder: Environment here. 39 | render: Shell 40 | validations: 41 | required: true 42 | - type: textarea 43 | attributes: 44 | label: Error traceback 45 | description: | 46 | If applicable, paste the error trackback here. 47 | placeholder: Logs and traceback here. 48 | render: Shell 49 | - type: markdown 50 | attributes: 51 | value: > 52 | If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated! 53 | 54 | Thanks for your bug report. We appreciate it a lot. 55 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | add_library(gemm2 4 | gemm.cu 5 | kernel.cu 6 | registry.cu 7 | dispatch_cache.cu 8 | gpu_metric.cu 9 | convert_v2.cu 10 | cast.cu 11 | unpack.cu 12 | tuner/cache_utils.cu 13 | tuner/measurer.cu 14 | tuner/sampler.cu 15 | tuner/stopping_criterion.cc 16 | tuner/params.cc 17 | kernel/f16_u4g128_f16_tnt_sm90_s16816.cu 18 | kernel/f16_u4g128_f16_tnt_sm80_s16816.cu 19 | kernel/f16_u4g128_f16_tnt_sm75_s16816.cu 20 | kernel/f16_u4g128_f16_tnt_sm70_s884.cu 21 | kernel/f16_u4g128_f16_tnt_sm75_simt.cu 22 | kernel/u4g128_f16_f16_nnn_sm80_s16816.cu 23 | ) 24 | 25 | target_link_libraries(gemm2 PRIVATE parser) 26 | 27 | 28 | target_compile_options(gemm2 PRIVATE 29 | $<$: 30 | -Xptxas=-v 31 | --generate-line-info 32 | --threads 8> 33 | ) 34 | set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON) 35 | set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 36 | 37 | if (BUILD_TEST) 38 | add_executable(gemm_test 39 | test/gemm_test.cu 40 | test/test_utils.cu 41 | test/quantization.cu 42 | test/reference.cu) 43 | target_link_libraries(gemm_test PRIVATE gemm2 cublas) 44 | 45 | if (NOT MSVC) 46 | FetchContent_Declare( 47 | repo-nvbench 48 | GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git 49 | GIT_TAG d8dced8a64d9ce305add92fa6d274fd49b569b7e 50 | ) 51 | 52 | set(NVBench_ENABLE_EXAMPLES OFF) 53 | set(BUILD_SHARED_LIBS OFF) 54 | 55 | FetchContent_MakeAvailable(repo-nvbench) 56 | 57 | add_executable(gemm_bench 58 | test/gemm_bench.cu 59 | test/test_utils.cu 60 | test/quantization.cu 61 | test/reference.cu) 62 | target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) 63 | endif () 64 | endif () 65 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/mma_sm80.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include "src/turbomind/kernels/core/mma.h" 8 | #include "src/turbomind/kernels/gemm/desc.h" 9 | 10 | namespace turbomind::gemm { 11 | 12 | struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN { 13 | static constexpr int M = 16; 14 | static constexpr int N = 8; 15 | static constexpr int K = 16; 16 | 17 | static constexpr int kThreadCount = 32; 18 | 19 | static constexpr auto kOpClass = OpClass::kMMA_s16816; 20 | 21 | using FragA = Array; 22 | using FragB = Array; 23 | using FragC = Array; 24 | 25 | using OffsetC = Array; // (m, n) 26 | using FragC_ = Array[2]; 27 | 28 | __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) 29 | { 30 | mma_m16n8k16_row_col(d, a, b, (FragC&)c); 31 | } 32 | 33 | __device__ static constexpr OffsetC static_offset_C() 34 | { 35 | return {int2{0, 0}, int2{8, 0}}; 36 | } 37 | 38 | __device__ static int2 thread_offset_C() // -> (m,n) 39 | { 40 | const int lane_id = threadIdx.x % WARP_SIZE; 41 | return {lane_id / 4, lane_id % 4 * 2}; 42 | } 43 | 44 | __device__ static void ReshapeC(const FragC& c, FragC_& c_) 45 | { 46 | PRAGMA_UNROLL 47 | for (int m = 0; m < 2; ++m) { 48 | c_[m] = (Array&)c[m * 2]; 49 | } 50 | } 51 | 52 | __device__ static int get_group_id(int thread_idx) 53 | { 54 | return thread_idx / WARP_SIZE; 55 | } 56 | }; 57 | 58 | // This is not used yet 59 | struct SM75_MMA_16x8x8_F32_F16_F16_F32_TN: SM80_MMA_16x8x16_F32_F16_F16_F32_TN { 60 | static constexpr int M = 16; 61 | static constexpr int N = 8; 62 | static constexpr int K = 8; 63 | 64 | using FragA = Array; 65 | using FragB = Array; 66 | using FragC = Array; 67 | 68 | __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) 69 | { 70 | mma_m16n8k8_row_col(d, a, b, (FragC&)c); 71 | } 72 | }; 73 | 74 | } // namespace turbomind::gemm 75 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/desc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/data_type.h" 6 | #include "src/turbomind/kernels/gemm/types.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | // aggregate that uniquely identifies a GEMM problem 11 | struct GemmDesc { 12 | int arch; 13 | DataType type_a; 14 | DataType type_b; 15 | DataType type_c; 16 | Order order_a; 17 | Order order_b; 18 | Order order_c; 19 | Pack pack_a; 20 | Pack pack_b; 21 | Pack pack_u; 22 | Pack pack_v; 23 | QuantDesc quant_a; 24 | QuantDesc quant_b; 25 | Epilogue epilogue; 26 | int m; 27 | int n; 28 | int k; 29 | int batch_dim; 30 | }; 31 | 32 | enum class OpClass 33 | { 34 | kSIMT, 35 | kMMA_s884, 36 | kMMA_s16816, 37 | }; 38 | 39 | inline const char* to_string(OpClass op) 40 | { 41 | switch (op) { 42 | case OpClass::kSIMT: 43 | return "simt"; 44 | case OpClass::kMMA_s884: 45 | return "s884"; 46 | case OpClass::kMMA_s16816: 47 | return "s16816"; 48 | default: 49 | return "unknown_op_cls"; 50 | } 51 | } 52 | 53 | // aggregate that uniquely identifies a kernel 54 | struct KernelDesc { 55 | int arch; 56 | OpClass op_class; 57 | DataType type_a; 58 | DataType type_b; 59 | DataType type_c; 60 | Order order_a; 61 | Order order_b; 62 | Order order_c; 63 | Pack pack_a; 64 | Pack pack_b; 65 | Pack pack_u; 66 | Pack pack_v; 67 | QuantDesc quant_a; 68 | QuantDesc quant_b; 69 | int policy_a; 70 | int policy_b; 71 | int3 cta_tile; 72 | int3 mma_tile; 73 | int3 align; 74 | int2 c_tile; 75 | int stages; 76 | bool split_k; 77 | 78 | // set by `KernelImpl` 79 | int max_active_ctas; 80 | cudaFuncAttributes attr; 81 | }; 82 | 83 | class Kernel; 84 | struct LaunchSpec { 85 | Kernel* kernel; 86 | int swizzle; 87 | int splits; 88 | float estimated; 89 | float measured; 90 | }; 91 | 92 | } // namespace turbomind::gemm 93 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/gemm_bench.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "nvbench/main.cuh" 4 | #include "src/turbomind/kernels/gemm/operand.h" 5 | #include "src/turbomind/kernels/gemm/test/models.h" 6 | #include "src/turbomind/kernels/gemm/test/testbed.h" 7 | #include 8 | #include 9 | #include 10 | 11 | void gemm_bench(nvbench::state& state) 12 | { 13 | const auto idx = state.get_int64("idx"); 14 | 15 | const auto bs = state.get_int64("bs"); 16 | const auto tp = state.get_int64("tp"); 17 | 18 | auto [output_dims, input_dims] = config[idx]; 19 | 20 | constexpr int group_size = 128; 21 | 22 | if (idx % 4 == 0 || idx % 4 == 2) { 23 | if (output_dims % tp) 24 | return; 25 | output_dims /= tp; 26 | } 27 | else { 28 | if (input_dims % tp) 29 | return; 30 | input_dims /= tp; 31 | } 32 | 33 | if (input_dims % group_size) 34 | return; 35 | 36 | using turbomind::gemm::get_test; 37 | 38 | { 39 | int m = bs; 40 | int n = output_dims; 41 | int k = input_dims; 42 | if (get_test().kBatchDim == 1) { 43 | std::swap(m, n); 44 | } 45 | std::cerr << "m" << m << "n" << n << "k" << k << "\n"; 46 | get_test().Initialize(m, n, k, group_size, state.get_cuda_stream()); 47 | } 48 | 49 | state.add_element_count((size_t)bs * output_dims * input_dims * 2); // mul + add 50 | 51 | // state.collect_dram_throughput(); 52 | // state.collect_l2_hit_rates(); 53 | 54 | if constexpr (1) { 55 | state.add_global_memory_reads(get_test().global_memory_reads()); 56 | get_test().Run(); 57 | state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // 58 | get_test().Run(); 59 | }); 60 | } 61 | else { 62 | state.add_global_memory_reads(sizeof(half) * (bs * input_dims + output_dims * input_dims)); 63 | state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // 64 | get_test().RunCublas(); 65 | }); 66 | } 67 | } 68 | 69 | NVBENCH_BENCH(gemm_bench) 70 | .add_int64_axis("idx", nvbench::range(0, (int)config.size() - 1)) 71 | .add_int64_power_of_two_axis("bs", nvbench::range(0, 10)) 72 | .add_int64_axis("tp", {1, 2, 4}); 73 | 74 | int main(int argc, char* argv[]) 75 | { 76 | NVBENCH_MAIN_BODY(argc, argv); 77 | } 78 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/mma_sm70.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include "src/turbomind/kernels/core/mma.h" 8 | #include "src/turbomind/kernels/gemm/desc.h" 9 | 10 | namespace turbomind::gemm { 11 | 12 | struct SM70_MMA_884 { 13 | // static constexpr int M = 16; 14 | // static constexpr int N = 16; 15 | static constexpr int M = 8; 16 | static constexpr int N = 32; 17 | static constexpr int K = 8; 18 | 19 | static constexpr int kThreadCount = 32; 20 | 21 | static constexpr auto kOpClass = OpClass::kMMA_s884; 22 | 23 | using FragA = Array; 24 | using FragB = Array; 25 | using FragC = Array; 26 | 27 | using OffsetC = Array; 28 | using FragC_ = Array[4]; 29 | 30 | __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c) 31 | { 32 | mma_m8n8k4_row_col(d, (const Array&)a[0], (const Array&)b[0], (FragC&)c); 33 | if constexpr (K == 8) { 34 | mma_m8n8k4_row_col(d, (const Array&)a[4], (const Array&)b[4], (FragC&)d); 35 | } 36 | } 37 | 38 | __device__ static constexpr OffsetC static_offset_C() 39 | { 40 | OffsetC r{}; 41 | PRAGMA_UNROLL 42 | for (int n = 0; n < 2; ++n) { 43 | PRAGMA_UNROLL 44 | for (int m = 0; m < 2; ++m) { 45 | r[n * 2 + m] = int2{m * 2, n * 4}; 46 | } 47 | } 48 | return r; 49 | } 50 | 51 | __device__ static int2 thread_offset_C() // -> (m,n) 52 | { 53 | const int lane_id = threadIdx.x % WARP_SIZE; 54 | // return { 55 | // (lane_id & 8) * 1 + (lane_id & 1) + lane_id / 16 * 4, 56 | // (lane_id & 4) * 2 + (lane_id & 2), 57 | // }; 58 | return {(lane_id & 1) + (lane_id / 16) * 4, // 59 | (lane_id & 2) + (lane_id & 12) * 2}; 60 | } 61 | 62 | __device__ static void ReshapeC(const FragC& c, FragC_& c_) 63 | { 64 | PRAGMA_UNROLL 65 | for (int m = 0; m < 4; ++m) { 66 | c_[m] = (Array&)c[m * 2]; 67 | } 68 | } 69 | 70 | __device__ static int get_group_id(int thread_idx) 71 | { 72 | return thread_idx / WARP_SIZE; 73 | } 74 | }; 75 | 76 | } // namespace turbomind::gemm 77 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/measurer.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/kernel.h" 4 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h" 5 | #include "src/turbomind/kernels/gemm/tuner/measurer.h" 6 | #include 7 | 8 | namespace turbomind::gemm { 9 | 10 | Measurer::Measurer(std::unique_ptr stop_criterion): stop_criterion_{std::move(stop_criterion)} 11 | { 12 | cudaEventCreate(&ev_beg_); 13 | cudaEventCreate(&ev_end_); 14 | } 15 | 16 | Measurer::~Measurer() 17 | { 18 | cudaEventDestroy(ev_beg_); 19 | cudaEventDestroy(ev_end_); 20 | ev_beg_ = ev_end_ = {}; 21 | } 22 | 23 | std::vector 24 | Measurer::Measure(const std::vector& specs, const Launcher& launcher, cudaStream_t stream) 25 | { 26 | std::vector m; 27 | m.reserve(specs.size()); 28 | for (const auto& spec : specs) { 29 | auto measure = MeasureOne(spec, launcher, stream); 30 | if (measure.sample_count) { 31 | m.push_back(measure); 32 | } 33 | /// TODO: report error 34 | } 35 | return m; 36 | } 37 | 38 | Measurement Measurer::MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream) 39 | { 40 | Stats stats{}; 41 | cudaError_t status = cudaSuccess; 42 | while (true) { 43 | float ms{}; 44 | std::tie(ms, status) = ColdRun(spec, launcher, stream); 45 | if (status != cudaSuccess) { 46 | break; 47 | } 48 | stats.add_sample(ms); 49 | // std::cout << spec.kernel->name() << " " << spec.swizzle << " " << stats.count() << " " << stats.mean() << " " 50 | // << stats.get_variance() << "\n"; 51 | if (stop_criterion_->should_stop(stats)) { 52 | break; 53 | } 54 | } 55 | return Measurement{ 56 | status, 57 | stats.count(), 58 | stats.mean(), 59 | stats.get_variance(), 60 | }; 61 | } 62 | 63 | std::pair Measurer::ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream) 64 | { 65 | CacheFlushing::flush(stream); 66 | 67 | cudaEventRecord(ev_beg_, stream); 68 | 69 | launcher(spec, stream); 70 | 71 | cudaEventRecord(ev_end_, stream); 72 | cudaEventSynchronize(ev_end_); 73 | 74 | const auto status = cudaGetLastError(); 75 | float ms{}; 76 | 77 | if (status == cudaSuccess) { 78 | cudaEventElapsedTime(&ms, ev_beg_, ev_end_); 79 | } 80 | 81 | return {ms, status}; 82 | } 83 | 84 | } // namespace turbomind::gemm 85 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/iterator.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | #include "src/turbomind/kernels/core/data_type.h" 7 | #include "src/turbomind/kernels/core/meta.h" 8 | #include "src/turbomind/kernels/gemm/thread_map.h" 9 | #include "src/turbomind/kernels/gemm/utils.h" 10 | 11 | namespace turbomind::gemm { 12 | 13 | struct VoidGmemIter { 14 | static constexpr int ITER_S = 0; 15 | using Fragments = int; 16 | template 17 | __device__ VoidGmemIter(P, int, int2, int2) 18 | { 19 | } 20 | __device__ void ClearSmem() {} 21 | __device__ void Prefetch(int, int, bool) {} 22 | __device__ void Prefetch(bool) {} 23 | __device__ void Fetch(Fragments&, bool) {} 24 | __device__ void Store(const Fragments&) {} 25 | __device__ void Advance() {} 26 | int* smem_data_; 27 | bool g_mask{false}; 28 | }; 29 | 30 | struct GetGmemIter { 31 | template 32 | static constexpr auto 33 | apply(basic_type, basic_type, basic_type, pair, constant) 34 | { 35 | using Dtype = typename Operand::Dtype; 36 | 37 | constexpr int kAccessSize = 38 | std::min(128 / bitsof, std::max(32 / bitsof, M * K / (WARPS * WARP_SIZE))); 39 | 40 | constexpr int2 kAligned = mk2cs(0, 1); 41 | constexpr int2 kCS = mk2cs(M, K); 42 | 43 | constexpr int kMaxThrS = std::min(WARP_SIZE, ceil_div(kCS.y, WARPS)); 44 | constexpr int kMaxThrC = std::min(WARP_SIZE, ceil_div(kCS.x, kAccessSize)); 45 | 46 | constexpr int kTgtThrC = ceil_div(256, sizeof(Array)); 47 | 48 | constexpr int kWarpThrC = std::min(kMaxThrC, std::max(WARP_SIZE / kMaxThrS, kTgtThrC)); 49 | 50 | using GmemIter = typename Iterator::template Type, 52 | SmemLayout, 53 | Operand::kPack, 54 | Operand::kOrder, 55 | kAligned.x, // aligned C 56 | kAligned.y>; // aligned S 57 | return type_c; 58 | } 59 | }; 60 | 61 | } // namespace turbomind::gemm 62 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/sampler.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/desc.h" 4 | #include "src/turbomind/kernels/gemm/kernel.h" 5 | #include "src/turbomind/kernels/gemm/tuner/sampler.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace turbomind::gemm { 12 | 13 | template 14 | static std::vector ArgSort(size_t size, const Cmp& cmp) 15 | { 16 | std::vector idxs(size); 17 | std::iota(idxs.begin(), idxs.end(), 0); 18 | std::stable_sort(idxs.begin(), idxs.end(), cmp); 19 | return idxs; 20 | } 21 | 22 | std::vector Sampler::Run(std::vector specs, const Launcher& launcher, cudaStream_t stream) 23 | { 24 | std::vector> clusters; // ptr into `specs` 25 | if (k_clusters_) { 26 | clusters = Cluster(specs, ClusteringParam{true, true}); 27 | } 28 | else { 29 | for (auto& s : specs) { 30 | clusters.push_back({s}); 31 | } 32 | } 33 | // std::cout << "k_clusters=" << k_clusters_ << ", #specs" << specs.size() << ", #clusters" << clusters.size() << 34 | // "\n"; 35 | 36 | std::vector s_1; 37 | for (const auto& c : clusters) { 38 | s_1.push_back(c.front()); 39 | } 40 | 41 | auto m_1 = measurer_.Measure(s_1, launcher, stream); 42 | 43 | auto idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; }); 44 | 45 | if (k_clusters_) { 46 | const auto top_k = std::min(k_clusters_, (int)idxs.size()); 47 | idxs.resize(top_k); 48 | 49 | std::vector s_2; 50 | for (const auto& idx : idxs) { 51 | auto& cluster = clusters[idx]; 52 | // Skip cluster leader 53 | for (size_t j = 1; j < cluster.size(); ++j) { 54 | s_2.push_back(cluster[j]); 55 | } 56 | } 57 | 58 | // std::cout << "#s_2=" << s_2.size() << "\n"; 59 | 60 | auto m_2 = measurer_.Measure(s_2, launcher, stream); 61 | // Merge measurements of the 2 runs 62 | m_2.insert(m_2.end(), m_1.begin(), m_1.end()); 63 | s_2.insert(s_2.end(), s_1.begin(), s_1.end()); 64 | m_1.swap(m_2); 65 | s_1.swap(s_2); 66 | } 67 | 68 | idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; }); 69 | 70 | std::vector ret; 71 | for (const auto& i : idxs) { 72 | s_1[i].measured = m_1[i].mean; 73 | ret.push_back(s_1[i]); 74 | } 75 | 76 | return ret; 77 | } 78 | 79 | } // namespace turbomind::gemm 80 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/gemm_test.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/attention/quantization.h" 4 | 5 | #include "src/turbomind/kernels/gemm/convert_v2.h" 6 | #include "src/turbomind/kernels/gemm/gemm.h" 7 | #include "src/turbomind/kernels/gemm/gpu_metric.h" 8 | #include "src/turbomind/kernels/gemm/kernel.h" 9 | #include "src/turbomind/kernels/gemm/test/models.h" 10 | #include "src/turbomind/kernels/gemm/test/quantization.h" 11 | #include "src/turbomind/kernels/gemm/test/test_utils.h" 12 | #include "src/turbomind/kernels/gemm/test/testbed.h" 13 | #include "src/turbomind/kernels/gemm/types.h" 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | using namespace turbomind; 23 | using namespace gemm; 24 | using thrust::universal_vector; 25 | 26 | cublasHandle_t cublas_handle{}; 27 | 28 | void ComputeRefCpu(half* C, const half* A, const half* B, int m, int n, int k) 29 | { 30 | for (int mm = 0; mm < m; ++mm) { 31 | for (int nn = 0; nn < n; ++nn) { 32 | float c = 0; 33 | for (int kk = 0; kk < k; ++kk) { 34 | c += (float)A[mm * k + kk] * (float)B[nn * k + kk]; 35 | } 36 | C[mm * n + nn] = c; 37 | } 38 | } 39 | } 40 | 41 | static int g_check = 0; 42 | 43 | void Run(int batch_size, int output_dims, int input_dims, int g = 128) 44 | { 45 | auto& test = get_test(); 46 | int m = batch_size; 47 | int n = output_dims; 48 | int k = input_dims; 49 | if (get_test().kBatchDim == 1) { 50 | std::swap(m, n); 51 | } 52 | std::cerr << "m" << m << "n" << n << "k" << k << "\n"; 53 | test.Initialize(m, n, k, g, 0); 54 | 55 | if (g_check) { 56 | test.Check(); 57 | } 58 | else { 59 | for (int i = 0; i < 10; ++i) { 60 | test.Run(); 61 | } 62 | test.CompareC(); 63 | } 64 | } 65 | 66 | int main(int argc, char* argv[]) 67 | { 68 | g_check = 0; 69 | Run(16384, 16384, 16384); 70 | 71 | // g_check = 1; 72 | // std::vector bsz(1024); 73 | // { 74 | // std::iota(bsz.begin(), bsz.end(), 1); 75 | // std::random_device rd; 76 | // std::mt19937 g(rd()); 77 | // std::shuffle(bsz.begin() + 1, bsz.end(), g); 78 | // } 79 | // for (const auto& b : bsz) { 80 | // for (const auto& [out, in] : config) { 81 | // Run(b, out, in); 82 | // } 83 | // } 84 | 85 | if (auto ec = cudaDeviceSynchronize(); ec != cudaSuccess) { 86 | std::cerr << "un-clean exit: " << cudaGetErrorString(ec) << "\n"; 87 | } 88 | 89 | return 0; 90 | } 91 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/unpack.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/core/array_ops.h" 4 | #include "src/turbomind/kernels/core/common.h" 5 | #include "src/turbomind/kernels/core/data_type.h" 6 | #include 7 | 8 | namespace turbomind { 9 | 10 | namespace { 11 | 12 | __device__ void atomic_assign_u4(uint32_t* address, uint32_t index, uint32_t value) 13 | { 14 | uint32_t old = *address; 15 | uint32_t assumed; 16 | do { 17 | assumed = old; 18 | uint32_t tmp = (assumed & ~(0xfu << (index * 4u))) | (value << (index * 4u)); 19 | old = atomicCAS(address, assumed, tmp); 20 | } while (assumed != old); 21 | } 22 | 23 | __device__ uint32_t read_u4(const uint32_t* address, uint32_t index) 24 | { 25 | return (*address >> (index * 4u)) & 0xfu; 26 | } 27 | 28 | template 29 | __global__ void permute_u4(uint* dst, const uint* src, Array dims) 30 | { 31 | constexpr int N = sizeof...(Ds); 32 | 33 | size_t count = 1; 34 | PRAGMA_UNROLL 35 | for (int i = 0; i < N; ++i) { 36 | count *= dims[i]; 37 | } 38 | 39 | constexpr int order[] = {Ds...}; 40 | 41 | for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) { 42 | 43 | int indices[N]{}; 44 | 45 | PRAGMA_UNROLL 46 | for (int j = N - 1, ii = i; j >= 0; --j) { 47 | indices[j] = ii % dims[j]; 48 | ii /= dims[j]; 49 | } 50 | 51 | auto data = read_u4(src + i / 8, i % 8); 52 | 53 | int index = 0; 54 | 55 | PRAGMA_UNROLL 56 | for (int j = N - 1, stride = 1; j >= 0; --j) { 57 | index += indices[order[j]] * stride; 58 | stride *= dims[order[j]]; 59 | } 60 | 61 | atomic_assign_u4(dst + index / 8, index % 8, data); 62 | } 63 | } 64 | 65 | } // namespace 66 | 67 | // col-major interleaved 68 | void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaStream_t st) 69 | { 70 | Array shape{cols, rows / 8, 2, 4}; 71 | permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); 72 | } 73 | 74 | void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st) 75 | { 76 | if (s % 8 || c % 8) { 77 | std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl; 78 | return; 79 | } 80 | Array shape{s, c}; 81 | permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape); 82 | } 83 | 84 | // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store 85 | // load -> extend_to_u16 -> convert -> run 86 | 87 | } // namespace turbomind 88 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/cta_map.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | #include "src/turbomind/kernels/core/math.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | struct CtaMap { 11 | 12 | TM_HOST_DEVICE static int3 get_tiled_shape(int m, int n, int k, int cta_m, int cta_n, int split_cnt) 13 | { 14 | return {(m + cta_m - 1) / cta_m, (n + cta_n - 1) / cta_n, split_cnt}; 15 | } 16 | 17 | TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int N) 18 | { 19 | auto n = tiled_shape.y; 20 | if (N >= 32 && n >= 24) 21 | return 5; 22 | if (N >= 16 && n >= 12) 23 | return 4; 24 | if (N >= 8 && n >= 6) 25 | return 3; 26 | if (N >= 4 && n >= 3) 27 | return 2; 28 | if (N >= 2 && n >= 2) 29 | return 1; 30 | return 0; 31 | } 32 | 33 | TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) 34 | { 35 | int tile = 1 << log_tile; 36 | return {static_cast(tiled_shape.x * tile), 37 | static_cast((tiled_shape.y + tile - 1) / tile), 38 | static_cast(tiled_shape.z)}; 39 | } 40 | 41 | TM_DEVICE static int3 get_tile_offset(int log_tile) 42 | { 43 | int block_idx_x = blockIdx.x; 44 | int block_idx_y = blockIdx.y; 45 | int block_idx_z = blockIdx.z; 46 | return {(block_idx_x >> log_tile), // 47 | (block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)), 48 | block_idx_z}; 49 | } 50 | }; 51 | 52 | struct CtaMapN: public CtaMap { 53 | TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile) 54 | { 55 | int tile = 1 << log_tile; 56 | return {static_cast(tiled_shape.y * tile), // n * tile 57 | static_cast((tiled_shape.x + tile - 1) / tile), // m / tile 58 | static_cast(tiled_shape.z)}; 59 | } 60 | TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int M) 61 | { 62 | auto m = tiled_shape.x; 63 | if (M >= 32 && m >= 24) 64 | return 5; 65 | if (M >= 16 && m >= 12) 66 | return 4; 67 | if (M >= 8 && m >= 6) 68 | return 3; 69 | if (M >= 4 && m >= 3) 70 | return 2; 71 | if (M >= 2 && m >= 2) 72 | return 1; 73 | return 0; 74 | } 75 | TM_DEVICE static int3 get_tile_offset(int log_tile) 76 | { 77 | int block_idx_x = blockIdx.x; 78 | int block_idx_y = blockIdx.y; 79 | int block_idx_z = blockIdx.z; 80 | return {(block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)), // 81 | (block_idx_x >> log_tile), 82 | block_idx_z}; 83 | } 84 | }; 85 | 86 | } // namespace turbomind::gemm 87 | -------------------------------------------------------------------------------- /src/turbomind/utils/tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | // #include 9 | // #include 10 | #include 11 | #include 12 | 13 | namespace turbomind { 14 | 15 | typedef enum datatype_enum 16 | { 17 | TYPE_INVALID, 18 | TYPE_BOOL, 19 | TYPE_UINT8, 20 | TYPE_UINT16, 21 | TYPE_UINT32, 22 | TYPE_UINT64, 23 | TYPE_INT8, 24 | TYPE_INT16, 25 | TYPE_INT32, 26 | TYPE_INT64, 27 | TYPE_FP16, 28 | TYPE_FP32, 29 | TYPE_FP64, 30 | TYPE_BYTES, 31 | TYPE_BF16 32 | } DataType; 33 | 34 | typedef enum memorytype_enum 35 | { 36 | MEMORY_CPU, 37 | MEMORY_CPU_PINNED, 38 | MEMORY_GPU 39 | } MemoryType; 40 | 41 | struct Tensor { 42 | MemoryType where; 43 | DataType type; 44 | std::vector shape; 45 | const void* data; 46 | 47 | Tensor(): where(MEMORY_CPU), type(TYPE_INVALID), shape({}), data(nullptr) {} 48 | Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): 49 | where(_where), type(_type), shape(_shape), data(_data) 50 | { 51 | } 52 | 53 | size_t size() const 54 | { 55 | if (data == nullptr || shape.size() == 0) { 56 | return 0; 57 | } 58 | return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies()); 59 | } 60 | 61 | size_t sizeBytes() const 62 | { 63 | return size() * typeSize(); 64 | } 65 | 66 | size_t typeSize() const 67 | { 68 | static const std::unordered_map type_map{{TYPE_BOOL, sizeof(bool)}, 69 | {TYPE_BYTES, sizeof(char)}, 70 | {TYPE_UINT8, sizeof(uint8_t)}, 71 | {TYPE_UINT16, sizeof(uint16_t)}, 72 | {TYPE_UINT32, sizeof(uint32_t)}, 73 | {TYPE_UINT64, sizeof(uint64_t)}, 74 | {TYPE_INT8, sizeof(int8_t)}, 75 | {TYPE_INT16, sizeof(int16_t)}, 76 | {TYPE_INT32, sizeof(int32_t)}, 77 | {TYPE_INT64, sizeof(int64_t)}, 78 | #ifdef ENABLE_BF16 79 | {TYPE_BF16, sizeof(__nv_bfloat16)}, 80 | #endif 81 | {TYPE_FP16, sizeof(half)}, 82 | {TYPE_FP32, sizeof(float)}, 83 | {TYPE_FP64, sizeof(double)}}; 84 | return type_map.at(type); 85 | } 86 | }; 87 | } // namespace turbomind 88 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/smem_copy_simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array_ops.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include "src/turbomind/kernels/gemm/simt.h" 8 | #include "src/turbomind/kernels/gemm/smem_copy.h" 9 | #include "src/turbomind/kernels/gemm/types.h" 10 | 11 | namespace turbomind::gemm { 12 | 13 | template 14 | struct SmemCopy_MMA_SIMT_A { 15 | static constexpr int M = simt::OP_M; 16 | static constexpr int K = simt::OP_K; 17 | 18 | static constexpr int OP_N = simt::OP_N; 19 | 20 | static constexpr int kFragNum = 1; 21 | 22 | using Frag = Array; 23 | 24 | __device__ static int2 get_offset(int thread_idx) 25 | { 26 | const int lane_id = thread_idx % WARP_SIZE; 27 | return {lane_id / OP_N, 0}; 28 | } 29 | 30 | template 31 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) // -> (m, k) 32 | { 33 | Lds(*(Frag*)dst_ptr, (S &&) src_ptr); 34 | } 35 | 36 | __device__ static int2 unique(int thread_idx, int pack_idx) // -> (unique id, repeat id) 37 | { 38 | const int lane_id = thread_idx % WARP_SIZE; 39 | return {pack_idx * M + lane_id / OP_N, lane_id % OP_N}; 40 | } 41 | }; 42 | 43 | template 44 | struct SmemCopy_MMA_SIMT_B { 45 | static constexpr int M = simt::OP_N; 46 | static constexpr int K = simt::OP_K; 47 | 48 | static constexpr int OP_N = simt::OP_N; 49 | 50 | static constexpr int kFragNum = 1; 51 | 52 | using Frag = Array; 53 | 54 | __device__ static int2 get_offset(int thread_idx) // -> (m, k) 55 | { 56 | const int lane_id = thread_idx % WARP_SIZE; 57 | return {lane_id % OP_N, 0}; 58 | } 59 | 60 | template 61 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) 62 | { 63 | Lds(*(Frag*)dst_ptr, (S &&) src_ptr); 64 | } 65 | 66 | __device__ static int2 unique(int thread_idx, int pack_idx) // -> (unique id, repeat id) 67 | { 68 | const int lane_id = thread_idx % WARP_SIZE; 69 | return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N}; 70 | } 71 | }; 72 | 73 | template 74 | struct SmemCopy_MMA_SIMT_V { 75 | static constexpr int M = simt::OP_N; 76 | static constexpr int K = K_; 77 | 78 | static constexpr int OP_N = simt::OP_N; 79 | 80 | static constexpr int kFragNum = 1; 81 | 82 | using Frag = Array; 83 | 84 | __device__ static int2 unique(int thread_idx, int pack_idx) 85 | { 86 | const int lane_id = thread_idx % WARP_SIZE; 87 | return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N}; 88 | } 89 | 90 | __device__ static int2 get_offset(int thread_idx) // -> (m, k) 91 | { 92 | return {unique(thread_idx, 0).x, 0}; 93 | } 94 | 95 | template 96 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool mask) 97 | { 98 | Lds(*(Frag*)dst_ptr, src_ptr); 99 | } 100 | }; 101 | 102 | } // namespace turbomind::gemm 103 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/config_sm70_s884.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/arch.h" 6 | #include "src/turbomind/kernels/gemm/arch/mma_sm70.h" 7 | #include "src/turbomind/kernels/gemm/arch/operand_sm70_s884.h" 8 | #include "src/turbomind/kernels/gemm/cta_map.h" 9 | #include "src/turbomind/kernels/gemm/epilogue.h" 10 | #include "src/turbomind/kernels/gemm/gemm_universal.h" 11 | #include "src/turbomind/kernels/gemm/iterator_sm70.h" 12 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h" 13 | #include "src/turbomind/kernels/gemm/thread_group_map.h" 14 | #include "src/turbomind/kernels/gemm/tiled_mma.h" 15 | #include "src/turbomind/kernels/gemm/types.h" 16 | 17 | namespace turbomind::gemm::sm70_s884 { 18 | 19 | template 20 | struct Sm70_s884 { 21 | 22 | static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); 23 | 24 | static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; 25 | static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; 26 | static constexpr int SMEM_K = A::SmemCopyAtom::K; 27 | 28 | template 42 | struct Type { 43 | 44 | // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom) 45 | using MMA_Atom = SM70_MMA_884; 46 | 47 | using Partition = Blocked; 48 | using MMA_Map = MMA_Map; 49 | 50 | using MMA = Tiled_MMA_v2; 51 | 52 | using Mainloop = MainloopSm70, 55 | TransformA, 56 | U, 57 | GroupSizeU, 58 | B, 59 | IteratorSm70, 60 | TransformB, 61 | V, 62 | GroupSizeV, 63 | Stages, 64 | true>; // FusePrefetch_ 65 | 66 | static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; 67 | static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; 68 | 69 | using Epilogue = gemm::Epilogue_, 76 | Operand_C, 77 | SplitK>; 78 | 79 | using Kernel = GemmUniversal; 80 | }; 81 | }; 82 | 83 | } // namespace turbomind::gemm::sm70_s884 84 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/desc.h" 6 | #include "src/turbomind/kernels/gemm/types.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace turbomind::gemm { 14 | 15 | struct KernelMetric { 16 | int64_t mio_cost; 17 | int64_t mma_cost; 18 | }; 19 | 20 | class Kernel { 21 | public: 22 | virtual ~Kernel() = default; 23 | 24 | virtual int Launch(const Operation& operation, 25 | float alpha, 26 | const void* A, 27 | const MatrixLayout& Adesc, 28 | const void* U, 29 | const MatrixLayout& Udesc, 30 | const void* B, 31 | const MatrixLayout& Bdesc, 32 | const void* V, 33 | const MatrixLayout& Vdesc, 34 | float beta, 35 | const void* C, 36 | const MatrixLayout& Cdesc, 37 | void* D, 38 | const MatrixLayout& Ddesc, 39 | int swizzle, 40 | int splits, 41 | Workspace& workspace, 42 | cudaStream_t stream) = 0; 43 | 44 | // virtual because different implementation may have different workspace requeirements 45 | virtual int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) = 0; 46 | 47 | // true if this kernel can be used to compute the gemm 48 | bool is_feasible(const GemmDesc& desc) const noexcept; 49 | 50 | std::vector> 51 | Estimate_v2(std::array size, int max_splits, int max_waves, int sm_count) const; 52 | 53 | virtual int GetSwizzle(int m, int n, int k, int splits, int swizzle) = 0; 54 | 55 | const KernelDesc& desc() const noexcept 56 | { 57 | return desc_; 58 | } 59 | 60 | int3 cta_tile_size() const noexcept 61 | { 62 | return desc_.cta_tile; 63 | } 64 | 65 | int3 warp_tile_size() const noexcept 66 | { 67 | return desc_.mma_tile; 68 | } 69 | 70 | int chunk_size_k() const noexcept 71 | { 72 | return chunk_size_k_; 73 | } 74 | 75 | int stages() const noexcept 76 | { 77 | return desc_.stages; 78 | } 79 | 80 | bool split_k() const noexcept 81 | { 82 | return desc_.split_k; 83 | } 84 | 85 | int arch() const noexcept 86 | { 87 | return desc_.arch; 88 | } 89 | 90 | int smem_size() const noexcept 91 | { 92 | return smem_size_; 93 | } 94 | 95 | std::string name() const 96 | { 97 | return name_; 98 | } 99 | 100 | protected: 101 | std::string GetName() const; 102 | 103 | KernelDesc desc_; 104 | 105 | int chunk_size_k_; 106 | int smem_size_; 107 | 108 | std::string name_; 109 | }; 110 | 111 | struct ClusteringParam { 112 | bool cache_policy; 113 | bool max_active_ctas; 114 | }; 115 | 116 | std::vector> Cluster(const std::vector& specs, const ClusteringParam& param); 117 | 118 | } // namespace turbomind::gemm 119 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/config_sm75_s16816.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch.h" 4 | #include "src/turbomind/kernels/gemm/arch/mma_sm80.h" 5 | #include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h" 6 | #include "src/turbomind/kernels/gemm/cta_map.h" 7 | #include "src/turbomind/kernels/gemm/epilogue.h" 8 | #include "src/turbomind/kernels/gemm/gemm_universal.h" 9 | #include "src/turbomind/kernels/gemm/iterator_sm70.h" 10 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h" 11 | #include "src/turbomind/kernels/gemm/thread_group_map.h" 12 | #include "src/turbomind/kernels/gemm/tiled_mma.h" 13 | #include "src/turbomind/kernels/gemm/types.h" 14 | 15 | namespace turbomind::gemm { 16 | 17 | namespace sm75_s16816 { 18 | 19 | using namespace sm80_s16816; 20 | 21 | template 22 | struct Sm75_s16816 { 23 | 24 | static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); 25 | 26 | static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; 27 | static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; 28 | static constexpr int SMEM_K = A::SmemCopyAtom::K; 29 | 30 | template 44 | struct Type { 45 | // Raked partition dont support `Pack_M > 1` 46 | using Partition = Blocked; 47 | using MMA_Map = MMA_Map; 48 | using MMA = Tiled_MMA_v2; 49 | 50 | using Mainloop = MainloopSm70, 53 | TransformA, 54 | U, 55 | GroupSizeU, 56 | B, 57 | IteratorSm70, 58 | TransformB, 59 | V, 60 | GroupSizeV, 61 | Stages, 62 | true>; // FusePrefetch_ 63 | 64 | static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; 65 | static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; 66 | 67 | using Epilogue = gemm::Epilogue_, 74 | Operand_C, 75 | SplitK>; 76 | 77 | using Kernel = GemmUniversal; 78 | }; 79 | }; 80 | 81 | } // namespace sm75_s16816 82 | 83 | } // namespace turbomind::gemm 84 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/smem.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/array.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include 8 | 9 | namespace turbomind { 10 | 11 | __inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr) 12 | { 13 | return (uint32_t)__cvta_generic_to_shared(ptr); 14 | } 15 | 16 | __inline__ __device__ void ldmatrix_m8n8_x4_b16(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) 17 | { 18 | #if TURBOMIND_ARCH_SM75 19 | asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" 20 | : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) 21 | : "r"(smem_int_ptr)); 22 | #else 23 | assert(TURBOMIND_ARCH_SM75); 24 | #endif 25 | } 26 | 27 | __inline__ __device__ void ldsm_x4_trans(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr) 28 | { 29 | #if TURBOMIND_ARCH_SM75 30 | asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n" 31 | : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3) 32 | : "r"(smem_int_ptr)); 33 | #else 34 | assert(TURBOMIND_ARCH_SM75); 35 | #endif 36 | } 37 | 38 | __inline__ __device__ void ldmatrix_m8n8_x2_b16(uint& d0, uint& d1, uint32_t smem_int_ptr) 39 | { 40 | #if TURBOMIND_ARCH_SM75 41 | asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(d0), "=r"(d1) : "r"(smem_int_ptr)); 42 | #else 43 | assert(TURBOMIND_ARCH_SM75); 44 | #endif 45 | } 46 | 47 | __inline__ __device__ void ldsm_x2_trans(uint& d0, uint& d1, uint32_t smem_int_ptr) 48 | { 49 | #if TURBOMIND_ARCH_SM75 50 | asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0,%1}, [%2];\n" 51 | : "=r"(d0), "=r"(d1) 52 | : "r"(smem_int_ptr)); 53 | #else 54 | assert(TURBOMIND_ARCH_SM75); 55 | #endif 56 | } 57 | 58 | __inline__ __device__ void ldmatrix_m8n8_x1_b16(uint& d0, uint32_t smem_int_ptr) 59 | { 60 | #if TURBOMIND_ARCH_SM75 61 | asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr)); 62 | #else 63 | assert(TURBOMIND_ARCH_SM75); 64 | #endif 65 | } 66 | 67 | __inline__ __device__ void ldsm_x1_trans(uint& d0, uint32_t smem_int_ptr) 68 | { 69 | #if TURBOMIND_ARCH_SM75 70 | asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr)); 71 | #else 72 | assert(TURBOMIND_ARCH_SM75); 73 | #endif 74 | } 75 | 76 | __inline__ __device__ void ldsm_x4(Array& d, uint32_t smem_int_ptr) 77 | { 78 | ldmatrix_m8n8_x4_b16(d[0], d[1], d[2], d[3], smem_int_ptr); 79 | } 80 | 81 | __inline__ __device__ void ldsm_x2(Array& d, uint32_t smem_int_ptr) 82 | { 83 | ldmatrix_m8n8_x2_b16(d[0], d[1], smem_int_ptr); 84 | } 85 | 86 | __inline__ __device__ void ldsm_x1(Array& d, uint32_t smem_int_ptr) 87 | { 88 | ldmatrix_m8n8_x1_b16(d[0], smem_int_ptr); 89 | } 90 | 91 | __inline__ __device__ void ldsm_x4_trans(Array& d, uint32_t smem_int_ptr) 92 | { 93 | ldsm_x4_trans(d[0], d[1], d[2], d[3], smem_int_ptr); 94 | } 95 | 96 | __inline__ __device__ void ldsm_x2_trans(Array& d, uint32_t smem_int_ptr) 97 | { 98 | ldsm_x2_trans(d[0], d[1], smem_int_ptr); 99 | } 100 | 101 | __inline__ __device__ void ldsm_x1_trans(Array& d, uint32_t smem_int_ptr) 102 | { 103 | ldsm_x1_trans(d[0], smem_int_ptr); 104 | } 105 | 106 | } // namespace turbomind 107 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/simt.h" 6 | #include "src/turbomind/kernels/gemm/types.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | __host__ __device__ constexpr Order transpose(Order order) 11 | { 12 | return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; 13 | } 14 | 15 | __host__ __device__ constexpr MatrixLayout transpose(MatrixLayout x) 16 | { 17 | auto tmp = x.cols; // `std::swap` is not constexpr 18 | x.cols = x.rows; 19 | x.rows = tmp; 20 | x.order = transpose(x.order); 21 | return x; 22 | } 23 | 24 | template 25 | __host__ __device__ constexpr int2 mk2cs(int m, int k) 26 | { 27 | if constexpr (order == Order::kRowMajor) { 28 | return {k, m}; 29 | } 30 | else { 31 | return {m, k}; 32 | } 33 | } 34 | 35 | template 36 | __host__ __device__ constexpr int2 mk2cs(int2 mk) 37 | { 38 | return mk2cs(mk.x, mk.y); 39 | } 40 | 41 | template 42 | __host__ __device__ constexpr int2 cs2mk(int c, int s) 43 | { 44 | if constexpr (order == Order::kRowMajor) { 45 | return {s, c}; 46 | } 47 | else { 48 | return {c, s}; 49 | } 50 | } 51 | 52 | template 53 | __host__ __device__ constexpr int2 _kn2cs(int k, int n) 54 | { 55 | if constexpr (order == Order::kColMajor) { 56 | return {k, n}; 57 | } 58 | else { 59 | return {n, k}; 60 | } 61 | } 62 | 63 | template 64 | __host__ __device__ constexpr Index cs2idx(int2 cs, Index ld) 65 | { 66 | return ld * cs.y + cs.x; 67 | } 68 | 69 | template 70 | struct PackingImpl { 71 | __host__ __device__ static constexpr int2 apply(int2 mk) 72 | { 73 | return mk; 74 | } 75 | }; 76 | 77 | template 78 | struct Packing_v2: PackingImpl { 79 | }; 80 | 81 | /// TODO: move packing utility to arch/smem_copy_xxx 82 | 83 | template 84 | struct PackingImpl { 85 | __host__ __device__ static constexpr int2 apply(int2 mk) 86 | { 87 | return {mk.x / 16 / num, mk.y * 16 * num}; 88 | } 89 | }; 90 | 91 | template 92 | struct PackingImpl { 93 | __host__ __device__ static constexpr int2 apply(int2 mk) 94 | { 95 | return {mk.x * 16, mk.y / 16}; 96 | } 97 | }; 98 | 99 | template 100 | struct PackingImpl: PackingImpl { 101 | }; 102 | 103 | template 104 | struct PackingImpl { 105 | __host__ __device__ static constexpr int2 apply(int2 mk) 106 | { 107 | return {mk.x / (simt::OP_M * num), mk.y * simt::OP_M * num}; 108 | } 109 | }; 110 | 111 | template 112 | struct PackingImpl { 113 | __host__ __device__ static constexpr int2 apply(int2 mk) 114 | { 115 | return {mk.x / (simt::OP_N * num), mk.y * simt::OP_N * num}; 116 | } 117 | }; 118 | 119 | template 120 | struct PackingImpl { 121 | __host__ __device__ static constexpr int2 apply(int2 mk) 122 | { 123 | // return {mk.x / (16 * num), mk.y * 16 * num}; 124 | return {mk.x / (32 * num), mk.y * 32 * num}; 125 | } 126 | }; 127 | 128 | } // namespace turbomind::gemm 129 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" 4 | #include "src/turbomind/kernels/gemm/registry.h" 5 | #include "src/turbomind/kernels/gemm/transform.h" 6 | #include "src/turbomind/kernels/gemm/types.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | void Registry::f16_u4g128_f16_tnt_sm90_s16816() 11 | { 12 | using namespace sm80_s16816; 13 | using namespace cache_policy; 14 | ////////////////////////////////////////////////////////////////////////////// 15 | // ! sm_90 + cp.async + evict policy = warp illegal instruction 16 | ////////////////////////////////////////////////////////////////////////////// 17 | using D = cache_policy::Default; 18 | 19 | using C = Sm80_s16816, // A 21 | Transform_Default, // tarnsform A 22 | VoidOperand, // U 23 | Operand_B_Pack, // B 24 | Transform_HMMA_16816<1, 0>, // transform B 25 | Operand_UV_Pack, // V 26 | kRowMajor, // order_C 27 | half>; // Tc 28 | 29 | // clang-format off 30 | Add>(); 31 | Add>(); 32 | Add>(); 33 | Add>(); 34 | Add>(); 35 | Add>(); 36 | 37 | Add>(); 38 | Add>(); 39 | Add>(); 40 | Add>(); 41 | 42 | Add>(); 43 | Add>(); 44 | Add>(); 45 | Add>(); 46 | Add>(); 47 | Add>(); 48 | 49 | Add>(); 50 | Add>(); 51 | Add>(); 52 | Add>(); 53 | 54 | Add>(); 55 | Add>(); 56 | Add>(); 57 | Add>(); 58 | Add>(); 59 | 60 | Add>(); 61 | Add>(); 62 | Add>(); 63 | Add>(); 64 | Add>(); 65 | // clang-format on 66 | } 67 | 68 | } // namespace turbomind::gemm 69 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/config_sm80_s16816.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/arch.h" 6 | #include "src/turbomind/kernels/gemm/arch/mma_sm80.h" 7 | #include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h" 8 | #include "src/turbomind/kernels/gemm/cta_map.h" 9 | #include "src/turbomind/kernels/gemm/epilogue.h" 10 | #include "src/turbomind/kernels/gemm/gemm_universal.h" 11 | #include "src/turbomind/kernels/gemm/iterator_sm80.h" 12 | #include "src/turbomind/kernels/gemm/mainloop_sm80_v2.h" 13 | #include "src/turbomind/kernels/gemm/thread_group_map.h" 14 | #include "src/turbomind/kernels/gemm/tiled_mma.h" 15 | #include "src/turbomind/kernels/gemm/types.h" 16 | 17 | namespace turbomind::gemm::sm80_s16816 { 18 | 19 | template 29 | struct Sm80_s16816 { 30 | 31 | static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); 32 | 33 | static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; 34 | static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; 35 | static constexpr int SMEM_K = A::SmemCopyAtom::K; 36 | 37 | template 52 | 53 | struct Type { 54 | 55 | // Raked partition dont support `Pack_M > 1` 56 | using Partition = Blocked; 57 | using MMA_Map = MMA_Map; 58 | using MMA = Tiled_MMA_v2; 59 | 60 | using Mainloop = MainloopSm80_v2, 63 | TransformA, 64 | U, 65 | GroupSizeU, 66 | B, 67 | IteratorSm80, 68 | TransformB, 69 | V, 70 | GroupSizeV, 71 | Stages, 72 | FusePrefecth>; 73 | 74 | static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; 75 | static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; 76 | 77 | using Epilogue = gemm::Epilogue_, 84 | Operand_C, 85 | SplitK>; 86 | 87 | using Kernel = GemmUniversal; 88 | }; 89 | }; 90 | 91 | } // namespace turbomind::gemm::sm80_s16816 92 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/config_simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/arch.h" 6 | #include "src/turbomind/kernels/gemm/arch/mma_simt.h" 7 | #include "src/turbomind/kernels/gemm/arch/operand_simt.h" 8 | #include "src/turbomind/kernels/gemm/cta_map.h" 9 | #include "src/turbomind/kernels/gemm/gemm_universal.h" 10 | #include "src/turbomind/kernels/gemm/iterator_sm70.h" 11 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h" 12 | #include "src/turbomind/kernels/gemm/thread_group_map.h" 13 | #include "src/turbomind/kernels/gemm/tiled_mma.h" 14 | #include "src/turbomind/kernels/gemm/types.h" 15 | 16 | namespace turbomind::gemm { 17 | 18 | namespace simt { 19 | 20 | template 21 | struct Sm75_Simt { 22 | 23 | static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K); 24 | 25 | static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum; 26 | static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum; 27 | static constexpr int SMEM_K = A::SmemCopyAtom::K; 28 | 29 | template 43 | struct Type { 44 | 45 | // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom) 46 | using MMA_Atom = MMA_SIMT; 47 | 48 | static constexpr int TM = MMA_Atom::M; 49 | static constexpr int TN = MMA_Atom::N; 50 | static constexpr int TK = MMA_Atom::K; 51 | 52 | using Partition = Blocked; 53 | 54 | using MMA_Map = MMA_Map; 55 | using MMA = Tiled_MMA_v2; 56 | 57 | // using MMA_Map = RakedThreadGroupMap; 58 | 59 | using Mainloop = MainloopSm70, 62 | TransformA, 63 | U, 64 | GroupSizeU, 65 | B, 66 | IteratorSm70, 67 | TransformB, 68 | V, 69 | GroupSizeV, 70 | Stages, 71 | true>; 72 | 73 | static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_; 74 | static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_; 75 | 76 | using Epilogue = gemm::Epilogue_, 83 | Operand_C, 84 | SplitK>; 85 | 86 | using Kernel = GemmUniversal; 87 | }; 88 | }; 89 | 90 | } // namespace simt 91 | 92 | } // namespace turbomind::gemm 93 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/params.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/params.h" 4 | #include "src/turbomind/utils/parser.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | void ParseTuningParams(TuningParams& params, const std::string& str) 12 | { 13 | const auto list = ParseArgsList(str); 14 | 15 | auto try_parse = [&](auto& value, auto name) { 16 | auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == name; }); 17 | if (it != list.end()) { 18 | std::cout << name << " " << it->second << "\n"; 19 | Parse(value, it->second); 20 | } 21 | }; 22 | 23 | try_parse(params.max_splits, "max_splits"); 24 | try_parse(params.max_waves, "max_waves"); 25 | try_parse(params.swizzle, "swizzle"); 26 | try_parse(params.top_k, "top_k"); 27 | try_parse(params.clusters, "clusters"); 28 | try_parse(params.min_iter, "min_iter"); 29 | try_parse(params.max_iter, "max_iter"); 30 | try_parse(params.max_time, "max_time"); 31 | 32 | if (auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == "seq"; }); it != list.end()) { 33 | params.seq = ParseTuningSequence(it->second); 34 | } 35 | } 36 | 37 | std::vector ParseTuningSequence(const std::string& str) 38 | { 39 | const std::regex triplet(R"((\d+)-(\d+)-(\d+))"); 40 | 41 | std::vector> generators; 42 | 43 | const auto tokens = ParseListOrTuple(str); 44 | 45 | for (const auto& token : tokens) { 46 | std::smatch match; 47 | if (std::regex_match(token, match, triplet)) { 48 | generators.push_back({std::stoi(match[1].str()), // 49 | std::stoi(match[2].str()), 50 | std::stoi(match[3].str())}); 51 | } 52 | else { // must be an integer string 53 | generators.push_back({std::stoi(token), 0, 0}); 54 | } 55 | } 56 | 57 | if (generators.size() == 1) { // Replace sentinel of the default generators 58 | auto fallback = GetDefaultTuningGenerators(); 59 | fallback.back() = {generators.front().front(), 0, 0}; 60 | generators = std::move(fallback); 61 | } 62 | 63 | return GenerateTuningSequence(generators); 64 | } 65 | 66 | std::vector GenerateTuningSequence(const std::vector>& generators) 67 | { 68 | std::vector ret; 69 | if (generators.empty()) { 70 | return ret; 71 | } 72 | const int last = generators.back().front(); 73 | // The last generator is a sentinel `(max_bs, 0, 0)` 74 | for (int i = 0; i < (int)generators.size() - 1; ++i) { 75 | auto [curr, next, step] = generators[i]; 76 | if (curr >= last) { 77 | break; 78 | } 79 | if (next == 0 && step == 0) { // single value 80 | ret.push_back(curr); 81 | } 82 | else { // generator 83 | const int end = std::min(generators[i + 1][0], last); 84 | while (curr < end) { 85 | ret.push_back(curr); 86 | if (curr == next) { 87 | step *= 2; 88 | next *= 2; 89 | } 90 | curr += step; 91 | } 92 | } 93 | } 94 | ret.push_back(last); 95 | return ret; 96 | } 97 | 98 | std::vector> GetDefaultTuningGenerators() 99 | { 100 | /// TODO: set generators based on device 101 | return {{8, 16, 8}, {16, 64, 16}, {8192}}; 102 | } 103 | 104 | } // namespace turbomind::gemm 105 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/smem_copy_sm70.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | #include "src/turbomind/kernels/gemm/smem_copy.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | template 11 | struct SmemCopy_MMA_884_A { 12 | // static constexpr int M = 16; 13 | // static constexpr int K = 8; 14 | static constexpr int M = 8; 15 | static constexpr int K = 8; 16 | 17 | static constexpr int kFragNum = 1; 18 | 19 | using Frag = Array; 20 | 21 | __device__ static int2 unique(int thread_idx, int pack_idx) 22 | { 23 | const int lane_id = thread_idx % WARP_SIZE; 24 | // 4 3 01 25 | // const int m = lane_id / 16 * 4 + (lane_id & 8) + lane_id % 4; 26 | // return {pack_idx * M + m, (lane_id & 4) >> 2}; 27 | 28 | // 4 01 29 | const int m = lane_id / 16 * 4 + lane_id % 4; 30 | return {pack_idx * M + m, (lane_id & 12) >> 2}; 31 | } 32 | 33 | __device__ static int2 get_offset(int thread_idx) 34 | { 35 | return int2{unique(thread_idx, 0).x, 0}; 36 | } 37 | 38 | template 39 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) 40 | { 41 | Lds(*(Frag*)dst_ptr, src_ptr); 42 | } 43 | }; 44 | 45 | template 46 | struct SmemCopy_MMA_884_B { 47 | // static constexpr int M = 16; 48 | // static constexpr int K = 8; 49 | static constexpr int M = 32; 50 | static constexpr int K = 8; 51 | 52 | static constexpr int kFragNum = 1; 53 | 54 | using Frag = Array; 55 | 56 | __device__ static int2 unique(int thread_idx, int pack_idx) 57 | { 58 | const int lane_id = thread_idx % WARP_SIZE; 59 | // 4 2 01 60 | // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4; 61 | // return {pack_idx * M + m, (lane_id & 8) >> 3}; 62 | 63 | // 4 23 01 64 | const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4; 65 | return {pack_idx * M + m, 0}; 66 | } 67 | 68 | __device__ static int2 get_offset(int thread_idx) 69 | { 70 | return int2{unique(thread_idx, 0).x, 0}; 71 | } 72 | 73 | template 74 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) 75 | { 76 | Lds(*(Frag*)dst_ptr, src_ptr); 77 | } 78 | }; 79 | 80 | template 81 | struct SmemCopy_MMA_884_V { 82 | // static constexpr int M = 16; 83 | static constexpr int M = 32; 84 | static constexpr int K = K_; 85 | 86 | static constexpr int kFragNum = 1; 87 | 88 | using Frag = Array; 89 | 90 | __device__ static int2 unique(int thread_idx, int pack_idx) 91 | { 92 | const int lane_id = thread_idx % WARP_SIZE; 93 | // 4 2 01 94 | // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4; 95 | // return {pack_idx * 16 + m, (lane_id & 8) >> 3}; 96 | 97 | const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4; 98 | return {pack_idx * M + m, 0}; 99 | } 100 | 101 | __device__ static int2 get_offset(int thread_idx) 102 | { 103 | return int2{unique(thread_idx, 0).x, 0}; 104 | } 105 | 106 | template 107 | __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool) 108 | { 109 | Lds(*(Frag*)dst_ptr, src_ptr); 110 | } 111 | }; 112 | 113 | } // namespace turbomind::gemm 114 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/reference.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/test/reference.h" 4 | #include 5 | 6 | namespace turbomind::gemm { 7 | 8 | #define CHECK(cond) \ 9 | do { \ 10 | if (!(cond)) { \ 11 | fprintf(stderr, "*** Check failed: (%s) @ %s:%d\n", #cond, __FILE__, __LINE__); \ 12 | std::abort(); \ 13 | } \ 14 | } while (0) 15 | 16 | namespace { 17 | 18 | MatrixLayout transpose(MatrixLayout x) 19 | { 20 | std::swap(x.rows, x.cols); 21 | x.order = x.order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; 22 | return x; 23 | } 24 | 25 | cudaDataType to_cuda_dtype(DataType dtype) 26 | { 27 | switch (dtype) { 28 | case DataType::F16: 29 | return CUDA_R_16F; 30 | case DataType::BF16: 31 | return CUDA_R_16BF; 32 | default: 33 | CHECK("unsupported data type" && 0); 34 | } 35 | return {}; 36 | } 37 | 38 | } // namespace 39 | 40 | Reference::Reference() 41 | { 42 | cublasCreate(&handle_); 43 | } 44 | 45 | Reference::~Reference() 46 | { 47 | if (handle_) { 48 | cublasDestroy(handle_); 49 | handle_ = {}; 50 | } 51 | } 52 | 53 | void Reference::set_stream(cudaStream_t stream) 54 | { 55 | cublasSetStream(handle_, stream); 56 | } 57 | 58 | void Reference::gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc) 59 | { 60 | 61 | // Transpose the problem for C to be column major 62 | if (Cdesc.order == Order::kRowMajor) { 63 | std::swap(A, B); 64 | std::swap(Adesc, Bdesc); 65 | Adesc = transpose(Adesc); 66 | Bdesc = transpose(Bdesc); 67 | Cdesc = transpose(Cdesc); 68 | // (n, k) (k, m) 69 | } 70 | 71 | CHECK(Adesc.cols == Bdesc.rows); 72 | 73 | // (m, k) (k, n) 74 | int m = Cdesc.rows; 75 | int n = Cdesc.cols; 76 | int k = Adesc.cols; 77 | CHECK(Adesc.rows == m); 78 | CHECK(Bdesc.cols == n); 79 | CHECK(Bdesc.rows == k); 80 | 81 | float alpha = 1.f; 82 | float beta = 0.f; 83 | 84 | auto to_cublas_op = [](Order o) { return o == Order::kColMajor ? CUBLAS_OP_N : CUBLAS_OP_T; }; 85 | 86 | auto status = cublasGemmEx(handle_, 87 | to_cublas_op(Adesc.order), 88 | to_cublas_op(Bdesc.order), 89 | m, 90 | n, 91 | k, 92 | &alpha, 93 | A, 94 | to_cuda_dtype(Adesc.type), 95 | Adesc.ld, 96 | B, 97 | to_cuda_dtype(Bdesc.type), 98 | Bdesc.ld, 99 | &beta, 100 | C, 101 | to_cuda_dtype(Cdesc.type), 102 | Cdesc.ld, 103 | CUBLAS_COMPUTE_32F, 104 | CUBLAS_GEMM_DEFAULT_TENSOR_OP); 105 | 106 | CHECK(status == CUBLAS_STATUS_SUCCESS); 107 | } 108 | 109 | } // namespace turbomind::gemm 110 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/array.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | #include "src/turbomind/kernels/core/data_type.h" 7 | #include "src/turbomind/kernels/core/sub_byte_ptr.h" 8 | 9 | namespace turbomind { 10 | 11 | template 12 | struct Array { 13 | using value_type = T; 14 | using size_type = int; 15 | using difference_type = int; 16 | using reference = value_type&; 17 | using const_reference = const value_type&; 18 | using pointer = value_type*; 19 | using const_pointer = const value_type*; 20 | using iterator = pointer; 21 | using const_iterator = const_pointer; 22 | 23 | static_assert(N > 0); 24 | 25 | T __a[N]; 26 | 27 | TM_HOST_DEVICE constexpr reference operator[](size_type i) noexcept 28 | { 29 | return __a[i]; 30 | } 31 | 32 | TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept 33 | { 34 | return __a[i]; 35 | } 36 | 37 | TM_HOST_DEVICE constexpr reference front() noexcept 38 | { 39 | return *begin(); 40 | } 41 | 42 | TM_HOST_DEVICE constexpr const_reference front() const noexcept 43 | { 44 | return *begin(); 45 | } 46 | 47 | TM_HOST_DEVICE constexpr reference back() noexcept 48 | { 49 | return *(end() - 1); 50 | } 51 | 52 | TM_HOST_DEVICE constexpr const_reference back() const noexcept 53 | { 54 | return *(end() - 1); 55 | } 56 | 57 | TM_HOST_DEVICE constexpr pointer data() noexcept 58 | { 59 | return &__a[0]; 60 | } 61 | 62 | TM_HOST_DEVICE constexpr const_pointer data() const noexcept 63 | { 64 | return &__a[0]; 65 | } 66 | 67 | TM_HOST_DEVICE constexpr iterator begin() noexcept 68 | { 69 | return data(); 70 | } 71 | 72 | TM_HOST_DEVICE constexpr const_iterator begin() const noexcept 73 | { 74 | return data(); 75 | } 76 | 77 | TM_HOST_DEVICE constexpr iterator end() noexcept 78 | { 79 | return data() + N; 80 | } 81 | 82 | TM_HOST_DEVICE constexpr const_iterator end() const noexcept 83 | { 84 | return data() + N; 85 | } 86 | 87 | TM_HOST_DEVICE static constexpr std::integral_constant size() noexcept 88 | { 89 | return {}; 90 | } 91 | 92 | TM_HOST_DEVICE static constexpr std::false_type empty() noexcept 93 | { 94 | return {}; 95 | } 96 | }; 97 | 98 | template 99 | struct Array { 100 | using value_type = detail::__uint4_t; 101 | using size_type = int; 102 | using difference_type = int; 103 | using reference = value_type&; 104 | using const_reference = const value_type&; 105 | using pointer = SubBytePtr; 106 | using const_pointer = SubBytePtr; 107 | 108 | // static_assert(N % 8 == 0); 109 | 110 | detail::__uint4_t __a[N / 8]; 111 | 112 | TM_HOST_DEVICE constexpr reference operator[](size_type i) noexcept 113 | { 114 | return __a[i / 8]; 115 | } 116 | 117 | TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept 118 | { 119 | return __a[i / 8]; 120 | } 121 | 122 | TM_HOST_DEVICE static constexpr std::integral_constant size() noexcept 123 | { 124 | return {}; 125 | } 126 | 127 | TM_HOST_DEVICE static constexpr std::false_type empty() noexcept 128 | { 129 | return {}; 130 | } 131 | 132 | TM_HOST_DEVICE constexpr pointer data() noexcept 133 | { 134 | return {(char*)&__a[0]}; 135 | } 136 | }; 137 | 138 | static_assert(sizeof(Array) == 4); 139 | static_assert(sizeof(Array) == 8); 140 | static_assert(sizeof(Array) == 12); 141 | static_assert(sizeof(Array) == 16); 142 | 143 | } // namespace turbomind 144 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: publish to pypi 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "turbomind/version.py" 9 | workflow_dispatch: 10 | 11 | 12 | jobs: 13 | linux-build: 14 | strategy: 15 | matrix: 16 | pyver: [py38, py39, py310, py311, py312] 17 | runs-on: ubuntu-latest 18 | env: 19 | PYTHON_VERSION: ${{ matrix.pyver }} 20 | PLAT_NAME: manylinux2014_x86_64 21 | DOCKER_TAG: cuda12.1 22 | OUTPUT_FOLDER: cuda12.1_dist 23 | steps: 24 | - name: Free disk space 25 | uses: jlumbroso/free-disk-space@main 26 | with: 27 | # This might remove tools that are actually needed, if set to "true" but frees about 6 GB 28 | tool-cache: false 29 | docker-images: false 30 | # All of these default to true, but feel free to set to "false" if necessary for your workflow 31 | android: true 32 | dotnet: true 33 | haskell: true 34 | large-packages: true 35 | swap-storage: false 36 | - name: Checkout repository 37 | uses: actions/checkout@v3 38 | - name: Build 39 | run: | 40 | echo ${PYTHON_VERSION} 41 | echo ${PLAT_NAME} 42 | echo ${DOCKER_TAG} 43 | echo ${OUTPUT_FOLDER} 44 | # remove -it 45 | sed -i 's/docker run --rm -it/docker run --rm/g' builder/manylinux/build_wheel.sh 46 | bash builder/manylinux/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} 47 | - name: Upload Artifacts 48 | uses: actions/upload-artifact@v4 49 | with: 50 | if-no-files-found: error 51 | path: builder/manylinux/${{ env.OUTPUT_FOLDER }}/* 52 | retention-days: 1 53 | name: linux-${{ matrix.pyver }} 54 | 55 | windows-build: 56 | strategy: 57 | matrix: 58 | pyver: ['3.8', '3.9', '3.10', '3.11', '3.12'] 59 | runs-on: windows-latest 60 | steps: 61 | - name: Checkout repository 62 | uses: actions/checkout@v3 63 | - name: Set up python 64 | uses: actions/setup-python@v4 65 | with: 66 | python-version: ${{ matrix.pyver }} 67 | - name: Install python packages 68 | run: | 69 | pip install -r requirements/build.txt 70 | pip install wheel 71 | - name: Setup CUDA Toolkit 72 | id: cuda-toolkit 73 | shell: pwsh 74 | run: ./builder/windows/setup_cuda.ps1 75 | env: 76 | INPUT_CUDA_VERSION: '12.1.0' 77 | - name: Build wheel 78 | run: | 79 | mkdir build 80 | cd build 81 | # https://github.com/pypa/setuptools/issues/1631 82 | pip install -U setuptools 83 | ..\builder\windows\generate.ps1 84 | cmake --build . --config Release -- /m /v:q 85 | if (-Not $?) { 86 | echo "build failed" 87 | exit 1 88 | } 89 | cmake --install . --config Release 90 | cd .. 91 | rm build -Force -Recurse 92 | python setup.py bdist_wheel -d build/wheel 93 | - name: Upload Artifacts 94 | uses: actions/upload-artifact@v4 95 | with: 96 | if-no-files-found: error 97 | path: build/wheel/* 98 | retention-days: 1 99 | name: windows-${{ matrix.pyver }} 100 | 101 | publish: 102 | runs-on: ubuntu-latest 103 | environment: 'prod' 104 | needs: 105 | - linux-build 106 | - windows-build 107 | steps: 108 | - name: Download artifacts 109 | uses: actions/download-artifact@v4 110 | with: 111 | path: artifact 112 | merge-multiple: true 113 | - name: Display artifacts 114 | run: ls artifact/ -lh 115 | - name: Set up python3.8 116 | uses: actions/setup-python@v4 117 | with: 118 | python-version: '3.8' 119 | - name: Upload to pypi 120 | run: | 121 | pip install twine 122 | twine upload artifact/* -u __token__ -p ${{ secrets.pypi_password }} 123 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/layout.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/data_type.h" 6 | namespace turbomind { 7 | 8 | template 9 | struct Swizzle { 10 | 11 | using bit_mask = std::integral_constant; 12 | using yyy_mask = std::integral_constant; 13 | using shift = std::integral_constant; 14 | 15 | template 16 | __host__ __device__ constexpr static auto apply(Offset offset) 17 | { 18 | return offset ^ ((offset & yyy_mask{}) >> shift{}); 19 | } 20 | 21 | template 22 | __host__ __device__ constexpr auto operator()(Offset offset) 23 | { 24 | return apply(offset); 25 | } 26 | }; 27 | 28 | struct Identity { 29 | 30 | template 31 | __device__ constexpr static auto apply(Offset offset) 32 | { 33 | return offset; 34 | } 35 | 36 | template 37 | __device__ Offset operator()(Offset offset) 38 | { 39 | return apply(offset); 40 | } 41 | 42 | template 43 | __device__ int AdvanceS(int offset, int s0, int s1) 44 | { 45 | return offset; 46 | } 47 | }; 48 | 49 | template 50 | struct SmemLayoutV2 { 51 | 52 | // (C0,S0),( C1, S1) 53 | // ( 1,C0),(C0*S0, C0*S0*C1) 54 | 55 | static constexpr int S = S_; 56 | static constexpr int C = C_; 57 | 58 | static constexpr int S0 = S0_ < 0 ? S : S0_; 59 | static constexpr int C0 = C0_ < 0 ? C : C0_; 60 | 61 | static_assert(S % S0 == 0); 62 | static_assert(C % C0 == 0); 63 | 64 | static constexpr int S1 = S / S0; 65 | static constexpr int C1 = C / C0; 66 | 67 | static constexpr int kSize = S * C; 68 | 69 | static constexpr int kSize0 = S0 * C0; 70 | static constexpr int kSize1 = S1 * C1; 71 | 72 | using Swizzle = Swizzle_; 73 | 74 | static constexpr int kIsTrivial = S == S0 && C == C0 && std::is_same_v; 75 | 76 | __forceinline__ __device__ static int apply(int s, int c, int offset = 0) 77 | { 78 | int s1 = s / S0; 79 | int s0 = s % S0; 80 | int c1 = c / C0; 81 | int c0 = c % C0; 82 | // variable | uniform | constant 83 | // return Swizzle::apply(s0 * C0 + c0) + offset + (s1 * C1 + c1) * kSize0; 84 | 85 | // return offset + Swizzle::apply(s0 * C0 + c0) + (s1 * C1 + c1) * kSize0; 86 | 87 | return Swizzle::apply(s0 * C0 + c0) + (s1 * C1 + c1) * kSize0 + offset; 88 | } 89 | 90 | __forceinline__ __device__ int operator()(int s, int c, int offset = 0) 91 | { 92 | return apply(s, c, offset); 93 | } 94 | }; 95 | 96 | struct Offset { 97 | __device__ explicit Offset(int value): value_{value} {}; 98 | __device__ int& operator()() 99 | { 100 | return value_; 101 | } 102 | __device__ const int& operator()() const 103 | { 104 | return value_; 105 | } 106 | int value_; 107 | }; 108 | 109 | template 110 | struct SmemAccessor { 111 | using Pointer = get_pointer_type; 112 | Pointer ptr_; 113 | Layout layout_; 114 | 115 | __device__ SmemAccessor(Pointer ptr): ptr_{ptr} {} 116 | 117 | __device__ T& operator()(int s, int c) 118 | { 119 | return ptr_[layout_(s, c)]; 120 | } 121 | 122 | __device__ T& operator()(int s, int c, int offset) 123 | { 124 | return ptr_[layout_(s, c, offset)]; 125 | } 126 | 127 | __device__ T& operator()(int idx) 128 | { 129 | return ptr_[idx]; 130 | } 131 | }; 132 | 133 | template 134 | struct Stride { 135 | T0 v0; 136 | T1 v1; 137 | 138 | // CTAD 139 | __host__ __device__ Stride(T0 v0, T1 v1): v0{v0}, v1{v1} {} 140 | 141 | template 142 | __host__ __device__ constexpr auto operator()(I0 i0, I1 i1) const 143 | { 144 | return v0 * i0 + v1 * i1; 145 | } 146 | }; 147 | 148 | } // namespace turbomind 149 | -------------------------------------------------------------------------------- /.github/workflows/cuda11.8-whl-release.yml: -------------------------------------------------------------------------------- 1 | name: cuda11.8-whl-release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | linux-build: 14 | strategy: 15 | matrix: 16 | pyver: [py38, py39, py310, py311, py312] 17 | runs-on: ubuntu-latest 18 | env: 19 | PYTHON_VERSION: ${{ matrix.pyver }} 20 | PLAT_NAME: manylinux2014_x86_64 21 | DOCKER_TAG: cuda11.8 22 | OUTPUT_FOLDER: cuda11.8_dist 23 | CUDA_VER: 11.8 24 | steps: 25 | - name: Free disk space 26 | uses: jlumbroso/free-disk-space@main 27 | with: 28 | # This might remove tools that are actually needed, if set to "true" but frees about 6 GB 29 | tool-cache: false 30 | docker-images: false 31 | # All of these default to true, but feel free to set to "false" if necessary for your workflow 32 | android: true 33 | dotnet: true 34 | haskell: true 35 | large-packages: true 36 | swap-storage: false 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | - name: Build 40 | run: | 41 | echo ${PYTHON_VERSION} 42 | echo ${PLAT_NAME} 43 | echo ${DOCKER_TAG} 44 | echo ${OUTPUT_FOLDER} 45 | # remove -it 46 | sed -i 's/docker run --rm -it/docker run --rm/g' builder/manylinux/build_wheel.sh 47 | bash builder/manylinux/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} 48 | - name: Upload Artifacts 49 | uses: actions/upload-artifact@v4 50 | with: 51 | if-no-files-found: error 52 | path: builder/manylinux/${{ env.OUTPUT_FOLDER }}/* 53 | retention-days: 1 54 | name: linux-${{ matrix.pyver }} 55 | 56 | windows-build: 57 | strategy: 58 | matrix: 59 | pyver: ['3.8', '3.9', '3.10', '3.11', '3.12'] 60 | runs-on: windows-latest 61 | steps: 62 | - name: Checkout repository 63 | uses: actions/checkout@v3 64 | - name: Set up python 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: ${{ matrix.pyver }} 68 | - name: Install python packages 69 | run: | 70 | pip install pybind11 wheel 71 | - name: Setup CUDA Toolkit 72 | id: cuda-toolkit 73 | shell: pwsh 74 | run: ./builder/windows/setup_cuda.ps1 75 | env: 76 | INPUT_CUDA_VERSION: '11.8.0' 77 | - name: Build wheel 78 | run: | 79 | mkdir build 80 | cd build 81 | pip install -U setuptools 82 | ..\builder\windows\generate.ps1 83 | cmake --build . --config Release -- /m /v:q 84 | if (-Not $?) { 85 | echo "build failed" 86 | exit 1 87 | } 88 | cmake --install . --config Release 89 | cd .. 90 | rm build -Force -Recurse 91 | python setup.py bdist_wheel -d build/wheel 92 | - name: Upload Artifacts 93 | uses: actions/upload-artifact@v4 94 | with: 95 | if-no-files-found: error 96 | path: build/wheel/* 97 | retention-days: 1 98 | name: windows-${{ matrix.pyver }} 99 | 100 | publish: 101 | runs-on: ubuntu-latest 102 | environment: 'prod' 103 | needs: 104 | - linux-build 105 | - windows-build 106 | steps: 107 | - name: Checkout repository 108 | uses: actions/checkout@v3 109 | - name: Download artifacts 110 | uses: actions/download-artifact@v4 111 | with: 112 | path: artifact 113 | merge-multiple: true 114 | - name: Add cuda version to package name 115 | run: | 116 | ver=$(cat turbomind/version.py | grep '__version__ =' | cut -d\' -f2) 117 | cuver=$ver+cu118 118 | ls -lh 119 | cd artifact 120 | for file in *; do 121 | mv "$file" "`echo $file | sed "s/$ver/$cuver/g"`"; 122 | done 123 | - name: Display artifacts 124 | run: ls artifact/ -lh 125 | - name: Publish 126 | uses: softprops/action-gh-release@v1 127 | if: startsWith(github.ref, 'refs/tags/') 128 | with: 129 | files: artifact/* 130 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch/config_sm70_s884.h" 4 | #include "src/turbomind/kernels/gemm/operand.h" 5 | #include "src/turbomind/kernels/gemm/registry.h" 6 | #include "src/turbomind/kernels/gemm/transform.h" 7 | #include "src/turbomind/kernels/gemm/types.h" 8 | 9 | namespace turbomind::gemm { 10 | 11 | void Registry::f16_u4g128_f16_tnt_sm70_s884() 12 | { 13 | using namespace sm70_s884; 14 | { // quant B 15 | using Config = Sm70_s884::Operand, 16 | Transform_Default, 17 | VoidOperand, 18 | typename GetOperand::Operand, 19 | Transform_HMMA_SIMT_B, 20 | typename GetOperand::Operand, 21 | kRowMajor, 22 | half>; 23 | 24 | using namespace cache_policy; 25 | 26 | // m8n32k8: pack_bv=1 27 | // (8,226.234),(16,192.248),(32,120.564),(64,103.483),(96,98.209),(128,54.537),(192,13.739) 28 | // (256,-6.61),(4096,-16.622),(8192,-16.021) 29 | Add>(); // 50.631 30 | Add>(); 31 | Add>(); // 50.698 32 | Add>(); // 93.395 33 | Add>(); 34 | Add>(); // 93.482 35 | Add>(); // 82.113 36 | Add>(); // 80.686 37 | Add>(); // 92.014 38 | Add>(); // 110.979 39 | Add>(); // 147.616 40 | Add>(); // 186.569 41 | Add>(); // 218.194 42 | Add>(); // 209.224 43 | Add>(); // 219.651 44 | 45 | // m16n16k8: pack_bv=2 46 | // (8,179.471),(16,174.246),(32,114.659),(64,100.813),(96,96.822),(128,53.423),(192,12.433),(256,-7.601),(4096,-17.335) 47 | // Add>(); // 50.934 48 | // Add>(); // 47.874 49 | // Add>(); // 47.874 50 | // Add>(); // 95.303 51 | // Add>(); 52 | // Add>(); // 97.095 53 | // Add>(); // 86.559 54 | // Add>(); // 73.869 55 | // Add>(); // 115.205 56 | // Add>(); // 96.151 57 | // Add>(); // 175.285 58 | // Add>(); 59 | } 60 | } 61 | 62 | } // namespace turbomind::gemm 63 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/thread_group_map.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | #include "src/turbomind/kernels/core/math.h" 7 | #include "src/turbomind/kernels/core/meta.h" 8 | #include "src/turbomind/kernels/gemm/thread_map.h" 9 | 10 | #include 11 | 12 | namespace turbomind::gemm { 13 | 14 | template 15 | struct RakedThreadGroupMap { 16 | static constexpr int M = M_; 17 | static constexpr int N = N_; 18 | static constexpr int K = K_; 19 | 20 | static constexpr int TileM = TM; 21 | static constexpr int TileN = TN; 22 | static constexpr int TileK = TK; 23 | 24 | static constexpr int kGroupM = GM; 25 | static constexpr int kGroupN = GN; 26 | static constexpr int kGroupK = GK; 27 | 28 | static constexpr int kGroupCount = GM * GN * GK; 29 | 30 | static constexpr int M1 = GM * TM; 31 | static constexpr int N1 = GN * TN; 32 | static constexpr int K1 = GK * TK; 33 | 34 | static constexpr int kIterM = M / M1; 35 | static constexpr int kIterN = N / N1; 36 | static constexpr int kIterK = K / K1; 37 | 38 | static constexpr int kFootprintM = kIterM * TM; 39 | static constexpr int kFootprintN = kIterN * TN; 40 | static constexpr int kFootprintK = kIterK * TK; 41 | 42 | static constexpr int kDeltaM = TM; 43 | static constexpr int kDeltaN = TN; 44 | static constexpr int kDeltaK = TK; 45 | 46 | __device__ static int3 get_offset(int group_id) 47 | { 48 | const int m = group_id % GM; 49 | const int n = group_id / GM % GN; 50 | const int k = group_id / GM / GN; 51 | return {m * kFootprintM, n * kFootprintN, k * kFootprintK}; 52 | } 53 | }; 54 | 55 | template 56 | struct MMA_Map { 57 | static constexpr int M = M_; 58 | static constexpr int N = N_; 59 | static constexpr int K = K_; 60 | 61 | static constexpr int TileM = tM_; 62 | static constexpr int TileN = tN_; 63 | static constexpr int TileK = tK_; 64 | 65 | static constexpr int kGroupM = ArrangementMN::gM; 66 | static constexpr int kGroupN = ArrangementMN::gN; 67 | static constexpr int kGroupK = gK; 68 | 69 | static constexpr int kGroupCount = kGroupM * kGroupN * kGroupK; 70 | 71 | static constexpr int kIterM = M / tM_ / kGroupM; 72 | static constexpr int kIterN = N / tN_ / kGroupN; 73 | static constexpr int kIterK = K / tK_ / kGroupK; 74 | 75 | static constexpr int kFootprintM = kIterM * tM_; 76 | static constexpr int kFootprintN = kIterN * tN_; 77 | static constexpr int kFootprintK = kIterK * tK_; 78 | 79 | static constexpr int kDeltaM = tM_ * ArrangementMN::dM; 80 | static constexpr int kDeltaN = tN_ * ArrangementMN::dN; 81 | static constexpr int kDeltaK = tK_ * (rK ? gK : 1); 82 | 83 | static constexpr auto kPartitionM = ArrangementMN::pM; 84 | static constexpr auto kPartitionN = ArrangementMN::pN; 85 | static constexpr auto kPartitionK = rK ? Partition::kRaked : Partition::kBlocked; 86 | 87 | __device__ static int3 get_offset(int group_id) 88 | { 89 | constexpr int kGroupMN = kGroupM * kGroupN; 90 | 91 | const auto mn = ArrangementMN::get_offset(group_id % kGroupMN, pair{}); 92 | const int k = group_id / kGroupMN; 93 | 94 | return {mn.x * tM_, mn.y * tN_, k * tK_ * (rK ? 1 : kIterK)}; 95 | } 96 | }; 97 | 98 | namespace { 99 | 100 | template 101 | void Print_(TMap) 102 | { 103 | std::cout << "M, N, K = " << TMap::M << " " << TMap::N << " " << TMap::K << "\n"; 104 | std::cout << "TM, TN, TK = " << TMap::TileM << " " << TMap::TileN << " " << TMap::TileK << "\n"; 105 | std::cout << "group count = " << TMap::kGroupCount << "\n"; 106 | // std::cout << "M1, N1, K1 = " << TMap::M1 << " " << TMap::N1 << " " << TMap::K1 << "\n"; 107 | std::cout << "itM, itN, itK = " << TMap::kIterM << " " << TMap::kIterN << " " << TMap::kIterK << "\n"; 108 | std::cout << "fpM, fpN, fpK = " << TMap::kFootprintM << " " << TMap::kFootprintN << " " << TMap::kFootprintK 109 | << "\n"; 110 | std::cout << "dM, dN, dK = " << TMap::kDeltaM << " " << TMap::kDeltaN << " " << TMap::kDeltaK << "\n"; 111 | } 112 | 113 | } // namespace 114 | 115 | /// TODO: Striped partition? 116 | 117 | } // namespace turbomind::gemm 118 | -------------------------------------------------------------------------------- /example/test_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import turbomind as tm 5 | from turbomind.utils import unpack_awq_gemm 6 | 7 | torch.manual_seed(0) 8 | 9 | 10 | def i32x8_to_i4x8(w): 11 | """merge 8 integers (range from 0 to 15) into one 32-bit integer.""" 12 | assert w.shape[-1] % 8 == 0 13 | shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8) 14 | shape = shape[:-1] + (1, ) 15 | result = torch.zeros(shape, dtype=w.dtype, device=w.device) 16 | mask = torch.tensor([15], dtype=w.dtype, device=w.device) 17 | for i in range(8): 18 | shift = 4 * (7 - i) 19 | result[..., 0] |= (w[..., i] & mask) << shift 20 | result = result.view(w.shape[0], -1) 21 | return result 22 | 23 | 24 | def makeup_weights(in_features: int, out_features: int, group_size: int = 128): 25 | # make up qweight 26 | assert out_features % 8 == 0 27 | qweight = torch.randint(0, 28 | 16, (in_features, out_features // 8, 8), 29 | dtype=torch.int32, 30 | device='cuda') 31 | print(f'-- makeup qweight: shape {qweight.shape}') 32 | print(qweight.view(in_features, -1)) 33 | qweight = i32x8_to_i4x8(qweight) 34 | print(f'-- merge qweight: shape {qweight.shape}') 35 | print(qweight) 36 | 37 | # make up qzeros 38 | assert in_features % group_size == 0 and in_features // group_size >= 1 39 | qzeros = torch.randint(0, 40 | 16, 41 | (in_features // group_size, out_features // 8, 8), 42 | dtype=torch.int32, 43 | device='cuda') 44 | print(f'-- makeup qzero: shape {qzeros.shape}') 45 | print(qzeros.view(in_features // group_size, -1)) 46 | qzeros = i32x8_to_i4x8(qzeros) 47 | print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}') 48 | 49 | # make up scales 50 | scales = torch.rand((in_features // group_size, out_features), 51 | dtype=torch.float16, 52 | device='cuda') 53 | print(f'-- makeup scales: shape {scales.shape}\n{scales}') 54 | return qweight, qzeros, scales 55 | 56 | 57 | def dequantize(qweight, qzeros, scales, group_size: int = 128): 58 | _qweight = unpack_awq_gemm(qweight) 59 | _qzeros = unpack_awq_gemm(qzeros) 60 | _qzeros = _qzeros.float() 61 | _qweight = _qweight.float() 62 | _scales = scales.float() 63 | for i in range(qzeros.shape[0]): 64 | start = i * group_size 65 | end = start + group_size 66 | _qweight[start:end] = (_qweight[start:end, :] - 67 | _qzeros[i:i + 1, :]) * _scales[i:i + 1, :] 68 | return _qweight.half() 69 | 70 | 71 | group_size = 128 72 | batch_size = 16384 73 | in_features = 16384 74 | out_features = 16384 75 | qweight, qzeros, scales = makeup_weights(in_features, out_features, group_size) 76 | 77 | x = torch.randn((batch_size, in_features), 78 | device=qweight.device, 79 | dtype=torch.float16) 80 | 81 | weight = dequantize(qweight, qzeros, scales, group_size) 82 | print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}') 83 | ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda') 84 | with torch.no_grad(): 85 | ref_linear.weight = nn.Parameter(weight.T) 86 | ref_res = ref_linear(x) 87 | print(f'nn.linear.res: {ref_res}') 88 | 89 | model = tm.Linear(in_features=in_features, 90 | out_features=out_features, 91 | bias=False, 92 | quant_method='awq', 93 | w_bit=4, 94 | group_size=group_size) 95 | 96 | model.qweight = qweight 97 | model.qzeros = qzeros 98 | model.scales = scales 99 | 100 | model.post_init() 101 | 102 | stream = torch.cuda.Stream() 103 | with torch.cuda.stream(stream): 104 | res = model(x) 105 | stream.synchronize() 106 | 107 | print(f'tm.linear.res: {res}') 108 | abs_diff = torch.abs(res - ref_res).float() 109 | rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res)) 110 | rtol = 0.01 111 | atol = 0.0001 112 | outliers = abs_diff > atol + rtol * torch.abs(ref_res) 113 | abs_diff = torch.sum(abs_diff) / abs_diff.numel() 114 | rel_diff = torch.sum(rel_diff) / rel_diff.numel() 115 | outliers = torch.sum(outliers) / outliers.shape[0] 116 | print(f'abs_diff {abs_diff:4f}, ' 117 | f'rel_diff {rel_diff:4f}, ' 118 | f'outliers {outliers:4f}') 119 | 120 | tm.Linear.clear_workspaces() 121 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/transform.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/attention/quantization.h" 6 | #include "src/turbomind/kernels/core/common.h" 7 | #include "src/turbomind/kernels/core/meta.h" 8 | #include "src/turbomind/kernels/gemm/smem_copy.h" 9 | #include "src/turbomind/kernels/gemm/tiled_mma.h" 10 | #include 11 | 12 | namespace turbomind::gemm { 13 | 14 | struct Transform_Default { 15 | template 16 | __device__ static void apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], S&, int div) 17 | { 18 | static_assert(Nf * Mf == Nd * Md); 19 | static_assert(Nd % Nf == 0 && Mf % Md == 0); 20 | static_assert(sizeof(frag) == sizeof(data)); 21 | 22 | // Alignment must be manually enforced for `reinterpret_cast` 23 | auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); 24 | auto& data_k = data[k]; 25 | 26 | PRAGMA_UNROLL 27 | for (int i = 0; i < std::size(frag_k); ++i) { 28 | frag_k[i] = data_k[i]; 29 | } 30 | } 31 | }; 32 | 33 | template 34 | struct Transform_HMMA_16816 { 35 | template 36 | __device__ static void 37 | apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], Array (&stat)[Ks][Ms], int div) 38 | { 39 | static_assert(Nf * Mf == Nd * Md); 40 | static_assert(Nd % Nf == 0 && Mf % Md == 0); 41 | static_assert(Nf * Mf == Ns * Ms * 4); 42 | 43 | // static_assert(Nf != Nf); 44 | 45 | auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); 46 | auto& stat_k = reinterpret_cast(&)[Ns * Ms]>(stat[k / div]); 47 | auto& data_k = data[k]; 48 | 49 | PRAGMA_UNROLL 50 | for (int m = 0; m < Md; ++m) { 51 | // if (threadIdx.x == 0) { 52 | // printf("m = %d\n", m); 53 | // } 54 | auto tmp = ConvertKvCache::convert(data_k[m]); 55 | PRAGMA_UNROLL 56 | for (int i = 0; i < Nd; i += 8) { 57 | PRAGMA_UNROLL 58 | for (int s = 0; s < 2; ++s) { 59 | PRAGMA_UNROLL 60 | for (int c = 0; c < 2; ++c) { 61 | const int idx = (m * Nd + i) / 8 * 2 + s * StatStepS + c * StatStepC; 62 | // if (threadIdx.x == 0) { 63 | // printf("idx=%d\n", idx); 64 | // } 65 | dequant((Array&)tmp[i + s * 4 + c * 2], stat_k[idx]); 66 | } 67 | } 68 | } 69 | 70 | frag_k[m] = tmp; 71 | } 72 | } 73 | 74 | template 75 | __device__ static void dequant(Array& x, Array s) 76 | { 77 | Array& _s = (Array&)s; 78 | // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)_s[0], (float)_s[1]); 79 | // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)x[0], (float)x[1]); 80 | x[0] = __hfma(x[0], _s[0], _s[1]); 81 | x[1] = __hfma(x[1], _s[0], _s[1]); 82 | } 83 | }; 84 | 85 | struct Transform_HMMA_SIMT_B { 86 | template 87 | __device__ static void 88 | apply(Array (&frag)[K][Mf], int k, Array (&data)[K][Md], Array (&stat)[Ks][Ms], int div) 89 | { 90 | static_assert(Nf * Mf == Nd * Md); 91 | static_assert(Nd % Nf == 0 && Mf % Md == 0); 92 | 93 | auto& frag_k = reinterpret_cast(&)[Md]>(frag[k]); 94 | auto& stat_k = reinterpret_cast(&)[Ns * Ms]>(stat[k / div]); 95 | auto& data_k = data[k]; 96 | 97 | // static_assert(Nf != Nf); 98 | 99 | PRAGMA_UNROLL 100 | for (int m = 0; m < Md; ++m) { 101 | auto tmp = ConvertKvCache::convert(data_k[m]); 102 | PRAGMA_UNROLL 103 | for (int i = 0; i < Nd; i += 2) { 104 | dequant((Array&)tmp[i], stat_k[(m * Nd + i) / Nf]); 105 | } 106 | frag_k[m] = tmp; 107 | } 108 | } 109 | 110 | template 111 | __device__ static void dequant(Array& x, Array s) 112 | { 113 | Array& _s = (Array&)s; 114 | 115 | x[0] = __hfma(x[0], _s[0], _s[1]); 116 | x[1] = __hfma(x[1], _s[0], _s[1]); 117 | } 118 | }; 119 | 120 | } // namespace turbomind::gemm 121 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/operand_sm70_s884.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/layout.h" 6 | #include "src/turbomind/kernels/core/meta.h" 7 | #include "src/turbomind/kernels/gemm/arch/smem_copy_sm70.h" 8 | #include "src/turbomind/kernels/gemm/iterator.h" 9 | #include "src/turbomind/kernels/gemm/operand.h" 10 | #include "src/turbomind/kernels/gemm/smem_copy.h" 11 | #include "src/turbomind/kernels/gemm/types.h" 12 | 13 | namespace turbomind::gemm { 14 | 15 | namespace sm70_s884 { 16 | 17 | template 18 | struct GetSmemLayout { 19 | template 20 | static constexpr auto apply(pair) 21 | { 22 | constexpr int2 cs = mk2cs(M, K); 23 | return SmemLayoutV2{}; 24 | } 25 | }; 26 | 27 | template 28 | struct Operand_A { 29 | using Dtype = T; 30 | 31 | static constexpr Pack kPack = 0; 32 | static constexpr Order kOrder = kRowMajor; 33 | 34 | using SmemCopyAtom = SmemCopy_MMA_884_A; 35 | 36 | using GetSmemLayout = GetSmemLayout; 37 | using GetGmemIter = GetGmemIter; 38 | }; 39 | 40 | template 41 | struct Operand_B { 42 | using Dtype = T; 43 | 44 | static constexpr Pack kPack = 0; 45 | static constexpr Order kOrder = kRowMajor; // (n,k) 46 | 47 | using SmemCopyAtom = SmemCopy_MMA_884_B; 48 | 49 | using GetSmemLayout = GetSmemLayout; 50 | using GetGmemIter = GetGmemIter; 51 | }; 52 | 53 | template 54 | struct Operand_V { 55 | using Dtype = T; 56 | 57 | static constexpr Pack kPack = 0; 58 | static constexpr Order kOrder = kColMajor; // (n,k) 59 | 60 | using SmemCopyAtom = SmemCopy_MMA_884_V; 61 | 62 | struct GetSmemLayout { // m-major 63 | template 64 | static constexpr auto apply(pair) 65 | { 66 | return SmemLayoutV2{}; 67 | } 68 | }; 69 | 70 | using GetGmemIter = GetGmemIter; 71 | }; 72 | 73 | template 74 | struct _GetSmemLayoutC { 75 | template 76 | static constexpr auto apply(pair) 77 | { 78 | constexpr auto cs = mk2cs(M, N); 79 | return SmemLayoutV2{}; 80 | } 81 | }; 82 | 83 | template 84 | struct _GetThreadMapC { 85 | template 86 | static constexpr auto apply(pair, constant) 87 | { 88 | constexpr auto cs = mk2cs(M, N); 89 | constexpr int WARPS = THREADS / WARP_SIZE; 90 | 91 | return ThreadMap_V2{}; 92 | } 93 | }; 94 | 95 | template 96 | struct Operand_C { 97 | using Dtype = T; 98 | 99 | static constexpr Order kOrder = order; 100 | 101 | using GetSmemLayout = _GetSmemLayoutC; 102 | using GetThreadMap = _GetThreadMapC; 103 | }; 104 | 105 | template 106 | struct Operand_B_Pack { 107 | using Dtype = T; 108 | 109 | static constexpr int Pack_M = 1; 110 | 111 | static constexpr Pack kPack = HMMA_884 | OPERAND_B | Pack_M; 112 | static constexpr Order kOrder = kRowMajor; 113 | 114 | using SmemCopyAtom = SmemCopyAtom_Pack_v3, kOrder, Pack_M>; 115 | 116 | using GetSmemLayout = GetSmemLayout; 117 | using GetGmemIter = GetGmemIter; 118 | }; 119 | 120 | template 121 | struct Operand_V_Pack { 122 | using Dtype = T; 123 | 124 | static constexpr int Pack_M = 1; 125 | 126 | static constexpr Pack kPack = HMMA_884 | OPERAND_V | Pack_M; 127 | static constexpr Order kOrder = kColMajor; 128 | 129 | using SmemCopyAtom = SmemCopyAtom_Pack_v3, kColMajor, Pack_M>; 130 | 131 | struct GetSmemLayout { // m-major 132 | template 133 | static constexpr auto apply(pair) 134 | { 135 | return SmemLayoutV2{}; 136 | } 137 | }; 138 | 139 | using GetGmemIter = GetGmemIter; 140 | }; 141 | 142 | } // namespace sm70_s884 143 | 144 | template 145 | struct GetOperand: std::true_type { 146 | using Operand = sm70_s884::Operand_A; 147 | }; 148 | 149 | template 150 | struct GetOperand: std::true_type { 151 | using Operand = sm70_s884::Operand_B; 152 | }; 153 | 154 | template 155 | struct GetOperand: std::true_type { 156 | using Operand = sm70_s884::Operand_V; 157 | }; 158 | 159 | template 160 | struct GetOperand: std::true_type { 161 | using Operand = sm70_s884::Operand_B_Pack; 162 | }; 163 | 164 | template 165 | struct GetOperand: std::true_type { 166 | using Operand = sm70_s884::Operand_V_Pack; 167 | }; 168 | 169 | } // namespace turbomind::gemm 170 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/gpu_metric.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/core/array.h" 4 | #include "src/turbomind/kernels/core/common.h" 5 | #include "src/turbomind/kernels/core/math.h" 6 | #include "src/turbomind/kernels/gemm/gpu_metric.h" 7 | #include 8 | 9 | #include 10 | 11 | namespace turbomind::gemm { 12 | 13 | using thrust::device_vector; 14 | 15 | namespace { 16 | 17 | template 18 | __global__ void l2_bw(float* dsink, const float* array, int count) 19 | { 20 | int tid = threadIdx.x + (blockIdx.x >> LOG_TILE) * blockDim.x; 21 | float4 sink{}; 22 | 23 | constexpr int NUM_THREADS = BLOCK_NUM * BLOCK_DIM; 24 | 25 | for (int i = 0; i < count; i += NUM_THREADS * 4) { 26 | const float* ptr = array + i; 27 | const int offset = tid * 4; 28 | float4 data = __ldcg(reinterpret_cast(ptr + offset)); 29 | sink.x += data.x; 30 | sink.y += data.y; 31 | sink.z += data.z; 32 | sink.w += data.w; 33 | } 34 | 35 | dsink[threadIdx.x] = sink.x + sink.y + sink.z + sink.w; 36 | } 37 | 38 | } // namespace 39 | 40 | float MeasureL2CacheThroughput() 41 | { 42 | cudaDeviceProp prop{}; 43 | int device{}; 44 | cudaGetDevice(&device); 45 | cudaGetDeviceProperties(&prop, device); 46 | 47 | size_t size = static_cast(prop.l2CacheSize) * 64; 48 | 49 | std::cout << size << std::endl; 50 | 51 | constexpr int BLOCK_X = 128; // blocks participating single sweep 52 | constexpr int BLOCK_Y = 128; // full sweep iters 53 | constexpr int LOG_TILE = 5; // swizzling factor to bring up L2 hit rate, set to 0 will minimize hit rate 54 | 55 | constexpr int BLOCK_DIM = 256; 56 | 57 | constexpr int CHUNK_SIZE = BLOCK_X * BLOCK_DIM * 4; // x4 for float4 load pattern 58 | 59 | device_vector data(ceil_div(size, sizeof(float)) / CHUNK_SIZE * CHUNK_SIZE); 60 | device_vector dsink(BLOCK_DIM); 61 | 62 | cudaStream_t stream; 63 | cudaStreamCreate(&stream); 64 | 65 | cudaMemsetAsync(data.data().get(), 0, sizeof(float) * data.size(), stream); 66 | 67 | cudaEvent_t ev_start, ev_end; 68 | 69 | cudaEventCreate(&ev_start); 70 | cudaEventCreate(&ev_end); 71 | 72 | cudaEventRecord(ev_start, stream); 73 | 74 | l2_bw<<> LOG_TILE), BLOCK_DIM, 0, stream>>>( 75 | dsink.data().get(), data.data().get(), data.size()); 76 | 77 | cudaEventRecord(ev_end, stream); 78 | 79 | cudaEventSynchronize(ev_end); 80 | 81 | float ms{}; 82 | cudaEventElapsedTime(&ms, ev_start, ev_end); 83 | 84 | size_t bytes = BLOCK_Y * sizeof(float) * data.size(); 85 | 86 | const float bytes_per_second = bytes / ms * 1e3; 87 | std::cout << bytes_per_second / 1e9 << " GB/s" << std::endl; 88 | 89 | cudaEventDestroy(ev_start); 90 | cudaEventDestroy(ev_end); 91 | 92 | cudaStreamDestroy(stream); 93 | 94 | return bytes_per_second; 95 | } 96 | 97 | float MeasureMmaThroughput(int problem_size) 98 | { 99 | device_vector a(problem_size * problem_size); 100 | device_vector b(a.size()); 101 | device_vector c(a.size()); 102 | 103 | cublasHandle_t cublas{}; 104 | cublasCreate(&cublas); 105 | 106 | cudaStream_t stream; 107 | cudaStreamCreate(&stream); 108 | 109 | cublasSetStream(cublas, stream); 110 | 111 | cudaEvent_t ev_start, ev_end; 112 | 113 | cudaEventCreate(&ev_start); 114 | cudaEventCreate(&ev_end); 115 | 116 | cudaEventRecord(ev_start, stream); 117 | 118 | float alpha = 1.f; 119 | float beta = 0.f; 120 | cublasGemmEx(cublas, 121 | CUBLAS_OP_N, 122 | CUBLAS_OP_N, 123 | problem_size, 124 | problem_size, 125 | problem_size, 126 | &alpha, 127 | a.data().get(), 128 | CUDA_R_16F, 129 | problem_size, 130 | b.data().get(), 131 | CUDA_R_16F, 132 | problem_size, 133 | &beta, 134 | c.data().get(), 135 | CUDA_R_16F, 136 | problem_size, 137 | CUBLAS_COMPUTE_32F, 138 | CUBLAS_GEMM_DEFAULT); 139 | 140 | cudaEventRecord(ev_end, stream); 141 | 142 | cudaEventSynchronize(ev_end); 143 | 144 | float ms{}; 145 | cudaEventElapsedTime(&ms, ev_start, ev_end); 146 | 147 | cudaEventDestroy(ev_start); 148 | cudaEventDestroy(ev_end); 149 | 150 | cudaStreamDestroy(stream); 151 | 152 | cublasDestroy(cublas); 153 | 154 | const size_t ops = (size_t)problem_size * problem_size * problem_size; 155 | 156 | float fma_per_second = ops / ms * 1e3; 157 | 158 | std::cout << 2 * fma_per_second / 1e9 << " FLOPS/s" << std::endl; 159 | 160 | return fma_per_second; 161 | } 162 | 163 | } // namespace turbomind::gemm 164 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch/operand_simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/layout.h" 6 | #include "src/turbomind/kernels/core/meta.h" 7 | #include "src/turbomind/kernels/gemm/arch/smem_copy_simt.h" 8 | #include "src/turbomind/kernels/gemm/iterator.h" 9 | #include "src/turbomind/kernels/gemm/operand.h" 10 | #include "src/turbomind/kernels/gemm/simt.h" 11 | #include "src/turbomind/kernels/gemm/smem_copy.h" 12 | #include "src/turbomind/kernels/gemm/types.h" 13 | 14 | namespace turbomind::gemm { 15 | 16 | namespace simt { 17 | 18 | struct GetSmemLayout { 19 | template 20 | static constexpr auto apply(pair) 21 | { 22 | return SmemLayoutV2{}; 23 | } 24 | }; 25 | 26 | template 27 | struct Operand_A { 28 | using Dtype = T; 29 | 30 | static constexpr Pack kPack = 0; 31 | static constexpr Order kOrder = kRowMajor; 32 | 33 | using SmemCopyAtom = SmemCopy_MMA_SIMT_A; 34 | 35 | using GetSmemLayout = GetSmemLayout; 36 | using GetGmemIter = GetGmemIter; 37 | }; 38 | 39 | template 40 | struct Operand_B { 41 | using Dtype = T; 42 | 43 | static constexpr Pack kPack = 0; 44 | static constexpr Order kOrder = kRowMajor; 45 | 46 | using SmemCopyAtom = SmemCopy_MMA_SIMT_B; 47 | 48 | using GetSmemLayout = GetSmemLayout; 49 | using GetGmemIter = GetGmemIter; 50 | }; 51 | 52 | template 53 | struct _GetSmemLayoutC { 54 | template 55 | static constexpr auto apply(pair) 56 | { 57 | constexpr auto cs = mk2cs(M, N); 58 | return SmemLayoutV2{}; 59 | } 60 | }; 61 | 62 | template 63 | struct _GetThreadMapC { 64 | template 65 | static constexpr auto apply(pair, constant) 66 | { 67 | constexpr auto cs = mk2cs(M, N); 68 | constexpr int WARPS = THREADS / WARP_SIZE; 69 | 70 | return ThreadMap_V2{}; 71 | } 72 | }; 73 | 74 | template 75 | struct Operand_C { 76 | using Dtype = T; 77 | 78 | static constexpr Order kOrder = order; 79 | 80 | using GetSmemLayout = _GetSmemLayoutC; 81 | using GetThreadMap = _GetThreadMapC; 82 | }; 83 | 84 | template 85 | struct Operand_V { 86 | using Dtype = T; 87 | 88 | static constexpr Pack kPack = 0; 89 | static constexpr Order kOrder = kColMajor; 90 | 91 | using SmemCopyAtom = SmemCopy_MMA_SIMT_V; 92 | 93 | struct GetSmemLayout { // m-major 94 | template 95 | static constexpr auto apply(pair) 96 | { 97 | return SmemLayoutV2{}; 98 | } 99 | }; 100 | 101 | using GetGmemIter = GetGmemIter; 102 | }; 103 | 104 | struct GetSmemLayout_Pack { 105 | template 106 | static constexpr auto apply(pair) 107 | { 108 | return SmemLayoutV2{}; 109 | } 110 | }; 111 | 112 | template 113 | struct Operand_B_Pack { 114 | using Dtype = T; 115 | 116 | static constexpr int Pack_M = 1; 117 | 118 | static constexpr Pack kPack = HMMA_SIMT | OPERAND_B | Pack_M; 119 | static constexpr Order kOrder = kRowMajor; 120 | 121 | using SmemCopyAtom = SmemCopyAtom_Pack_v3::SmemCopyAtom, kRowMajor, Pack_M>; 122 | using GetSmemLayout = GetSmemLayout_Pack; 123 | using GetGmemIter = GetGmemIter; 124 | }; 125 | 126 | template 127 | struct Operand_V_Pack { 128 | using Dtype = T; 129 | 130 | static constexpr int Pack_M = 1; 131 | 132 | static constexpr Pack kPack = HMMA_SIMT | OPERAND_V | Pack_M; 133 | static constexpr Order kOrder = kColMajor; 134 | 135 | using SmemCopyAtom = SmemCopyAtom_Pack_v3, kColMajor, Pack_M>; 136 | 137 | struct GetSmemLayout { // m-major 138 | template 139 | static constexpr auto apply(pair) 140 | { 141 | return SmemLayoutV2{}; 142 | } 143 | }; 144 | 145 | using GetGmemIter = GetGmemIter; 146 | }; 147 | 148 | } // namespace simt 149 | 150 | template 151 | struct GetOperand: std::true_type { 152 | using Operand = simt::Operand_A; 153 | }; 154 | 155 | template 156 | struct GetOperand: std::true_type { 157 | using Operand = simt::Operand_B; 158 | }; 159 | 160 | template 161 | struct GetOperand: std::true_type { 162 | using Operand = simt::Operand_V; 163 | }; 164 | 165 | template 166 | struct GetOperand: std::true_type { 167 | using Operand = simt::Operand_B_Pack; 168 | }; 169 | 170 | template 171 | struct GetOperand: std::true_type { 172 | using Operand = simt::Operand_V_Pack; 173 | }; 174 | 175 | } // namespace turbomind::gemm 176 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/types.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/data_type.h" 6 | #include 7 | #if ENABLE_BF16 8 | #include 9 | #endif 10 | 11 | namespace turbomind::gemm { 12 | 13 | enum class Order : int 14 | { 15 | kColMajor = 0, 16 | kRowMajor = 1, 17 | }; 18 | 19 | inline constexpr Order kColMajor = Order::kColMajor; 20 | inline constexpr Order kRowMajor = Order::kRowMajor; 21 | 22 | constexpr Order operator~(Order a) 23 | { 24 | return a == kColMajor ? kRowMajor : kColMajor; 25 | } 26 | 27 | using Pack = uint32_t; 28 | 29 | typedef enum MMA_Tag 30 | { 31 | HMMA_16816 = 0x100, // sm80+ 32 | HMMA_1688 = 0x200, // sm75 33 | HMMA_884 = 0x300, // sm70 34 | HMMA_SIMT = 0x400, // sm75- 35 | } MMA_Tag; 36 | 37 | typedef enum Op_Tag 38 | { 39 | OPERAND_A = 0x010, 40 | OPERAND_B = 0x020, 41 | OPERAND_U = 0x030, 42 | OPERAND_V = 0x040, 43 | } Op_Tag; 44 | 45 | constexpr MMA_Tag get_mma_tag(Pack pack) 46 | { 47 | return static_cast(pack & 0xf00); 48 | } 49 | 50 | constexpr Op_Tag get_operand_tag(Pack pack) 51 | { 52 | return static_cast(pack & 0x0f0); 53 | } 54 | 55 | constexpr int get_pack_num(Pack pack) 56 | { 57 | return pack & 0x00f; 58 | } 59 | 60 | enum class QuantType : int 61 | { 62 | kNone, 63 | kDefault, 64 | }; 65 | 66 | enum class Epilogue : int 67 | { 68 | kNone = 0, 69 | kChannelCombination = 0x1, 70 | kGatedSilu = 0x2, 71 | }; 72 | 73 | enum class DataType : int 74 | { 75 | U4, 76 | U8, 77 | U16, 78 | U32, 79 | U64, 80 | F8_E4M3, 81 | F8_E5M2, 82 | F16, 83 | F32, 84 | BF16, 85 | TF32, 86 | }; 87 | 88 | inline const char* to_string(DataType data_type) 89 | { 90 | switch (data_type) { 91 | case DataType::U4: 92 | return "u4"; 93 | case DataType::U8: 94 | return "u8"; 95 | case DataType::F16: 96 | return "f16"; 97 | case DataType::F32: 98 | return "f32"; 99 | case DataType::BF16: 100 | return "bf16"; 101 | case DataType::TF32: 102 | return "tf32"; 103 | default: 104 | return "unknown"; 105 | } 106 | } 107 | 108 | inline int64_t get_size(DataType type, int64_t size) 109 | { 110 | if (!size) { 111 | return 0; 112 | } 113 | switch (type) { 114 | case DataType::U64: 115 | return size * 8; 116 | case DataType::F32: 117 | case DataType::U32: 118 | return size * 4; 119 | case DataType::BF16: 120 | case DataType::F16: 121 | case DataType::U16: 122 | return size * 2; 123 | case DataType::U8: 124 | case DataType::F8_E4M3: 125 | case DataType::F8_E5M2: 126 | return size; 127 | case DataType::U4: 128 | return size / 2; 129 | default: 130 | // std::cerr << to_string(type) << "\n"; 131 | return -1; 132 | } 133 | } 134 | 135 | template 136 | struct get_data_type { 137 | }; 138 | 139 | template<> 140 | struct get_data_type { 141 | static constexpr auto value = DataType::F16; 142 | }; 143 | 144 | #if ENABLE_BF16 145 | template<> 146 | struct get_data_type { 147 | static constexpr auto value = DataType::BF16; 148 | }; 149 | #endif 150 | 151 | template<> 152 | struct get_data_type { 153 | static constexpr auto value = DataType::U4; 154 | }; 155 | 156 | template<> 157 | struct get_data_type { 158 | static constexpr auto value = DataType::U8; 159 | }; 160 | 161 | template 162 | inline constexpr auto get_data_type_v = get_data_type::value; 163 | 164 | template 165 | struct get_dtype { 166 | }; 167 | 168 | template<> 169 | struct get_dtype { 170 | using type = half; 171 | }; 172 | 173 | template<> 174 | struct get_dtype { 175 | using type = uint4_t; 176 | }; 177 | 178 | template<> 179 | struct get_dtype { 180 | using type = uint8_t; 181 | }; 182 | 183 | template<> 184 | struct get_dtype { 185 | using type = uint16_t; 186 | }; 187 | 188 | template<> 189 | struct get_dtype { 190 | using type = uint32_t; 191 | }; 192 | 193 | struct QuantDesc { 194 | QuantType type; 195 | int group_size; 196 | }; 197 | 198 | enum class DispatchPolicy : int 199 | { 200 | kDefault = 0, 201 | kMeasure = 1, 202 | kReuse = 2, 203 | kAppend = 3, 204 | }; 205 | 206 | constexpr bool operator&(const DispatchPolicy& a, const DispatchPolicy& b) 207 | { 208 | return ((int)a & (int)b); 209 | } 210 | 211 | struct Operation { 212 | DispatchPolicy dispatch; 213 | Epilogue epilogue; 214 | QuantDesc quant_a; 215 | QuantDesc quant_b; 216 | int batch_dim; 217 | void* reserved; 218 | }; 219 | 220 | struct MatrixLayout { 221 | DataType type; 222 | Order order; 223 | int rows; 224 | int cols; 225 | int ld; 226 | Pack pack; 227 | }; 228 | 229 | inline int64_t get_size(const MatrixLayout& m) 230 | { 231 | return get_size(m.type, (int64_t)m.rows * m.cols); 232 | } 233 | 234 | struct Workspace { 235 | void* barriers; 236 | size_t barriers_size; 237 | void* partials; 238 | size_t partials_size; 239 | }; 240 | 241 | } // namespace turbomind::gemm 242 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/thread_map.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | 7 | #include 8 | 9 | namespace turbomind { 10 | 11 | template 12 | struct ThreadMapQ { 13 | static constexpr int kWarpCount = WarpCount; 14 | static constexpr int kAccessC = AccessC; 15 | 16 | static constexpr int kWarpThreadC = C / kAccessC; 17 | static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC; 18 | 19 | static_assert(kWarpThreadC <= WARP_SIZE); 20 | 21 | static constexpr int kWarpAccessC = kWarpThreadC * kAccessC; // C 22 | static constexpr int kWarpAccessS = kWarpThreadS; 23 | 24 | static constexpr int kWarpIterC = C / kWarpAccessC; // 1 25 | static constexpr int kWarpIterS = S / kWarpAccessS; 26 | 27 | static constexpr int kWarpC = 1; 28 | static constexpr int kWarpS = kWarpCount; 29 | 30 | static constexpr int kIterC = kWarpIterC / kWarpC; // 1 31 | static constexpr int kIterS = std::max(kWarpIterS / kWarpS, 1); 32 | 33 | static constexpr int kFootprintC = kWarpAccessC * kIterC; // C 34 | static constexpr int kFootprintS = kWarpAccessS * kIterS; 35 | 36 | static constexpr int kDeltaC = kWarpAccessC; 37 | static constexpr int kDeltaS = kWarpAccessS; 38 | 39 | __device__ static int2 get_offset(int warp_id, int lane_id) 40 | { 41 | int warp_offset_c = warp_id % kWarpC; 42 | int warp_offset_s = warp_id / kWarpC; 43 | 44 | int warp_thread_offset_c = lane_id % kWarpThreadC; 45 | int warp_thread_offset_s = lane_id / kWarpThreadC; 46 | 47 | int cta_thread_offset_c = kFootprintC * warp_offset_c + warp_thread_offset_c * kAccessC; 48 | int cta_thread_offset_s = kFootprintS * warp_offset_s + warp_thread_offset_s; 49 | 50 | return {cta_thread_offset_c, cta_thread_offset_s}; 51 | } 52 | }; 53 | 54 | template 55 | struct RakedThreadMap { 56 | static constexpr int kDimC = DimC; 57 | static constexpr int kDimS = DimS; 58 | 59 | static constexpr int kWarpCount = WarpCount; 60 | static constexpr int kAccessC = AccessC; 61 | 62 | static constexpr int kWarpThreadC = WarpThreadC; 63 | static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC; 64 | 65 | static_assert(kWarpThreadC <= WARP_SIZE); 66 | 67 | static constexpr int kWarpAccessC = kWarpThreadC * kAccessC; 68 | static constexpr int kWarpAccessS = kWarpThreadS; 69 | 70 | static constexpr int kWarpIterC = (kDimC + kWarpAccessC - 1) / kWarpAccessC; 71 | static constexpr int kWarpIterS = kDimS / kWarpAccessS; 72 | 73 | static constexpr int kWarpC = 1; 74 | static constexpr int kWarpS = kWarpCount; 75 | 76 | static constexpr int kIterC = kWarpIterC / kWarpC; 77 | static constexpr int kIterS = std::max(kWarpIterS / kWarpS, 1); 78 | 79 | // Allow partial tile when there is ONLY 1 iteration 80 | static_assert(kDimC % kWarpAccessC == 0 || kIterC == 1); 81 | 82 | static_assert(kIterC > 0); 83 | static_assert(kIterS > 0); 84 | 85 | static constexpr bool kPartialC = kDimC % kWarpAccessC != 0; 86 | 87 | static constexpr int kFootprintC = kWarpAccessC * kIterC; 88 | static constexpr int kFootprintS = kWarpAccessS * kIterS; 89 | 90 | static constexpr int kDeltaC = kWarpAccessC; 91 | static constexpr int kDeltaS = kWarpAccessS; 92 | 93 | // static constexpr int kDeltaC = kWarpAccessC * kWarpC; 94 | // static constexpr int kDeltaS = kWarpAccessS * kWarpS; 95 | 96 | __device__ static int2 get_offset(int warp_id, int lane_id) 97 | { 98 | int warp_offset_c = warp_id % kWarpC; 99 | int warp_offset_s = warp_id / kWarpC; 100 | 101 | int warp_thread_offset_c = lane_id % kWarpThreadC; 102 | int warp_thread_offset_s = lane_id / kWarpThreadC; 103 | 104 | int cta_thread_offset_c = kFootprintC * warp_offset_c + warp_thread_offset_c * kAccessC; 105 | int cta_thread_offset_s = kFootprintS * warp_offset_s + warp_thread_offset_s; 106 | 107 | // int cta_thread_offset_c = kWarpAccessC * warp_offset_c + warp_thread_offset_c * kAccessC; 108 | // int cta_thread_offset_s = kWarpAccessS * warp_offset_s + warp_thread_offset_s; 109 | 110 | return {cta_thread_offset_c, cta_thread_offset_s}; 111 | } 112 | }; 113 | 114 | namespace { 115 | 116 | template 117 | void Print(TMap) 118 | { 119 | std::cout << " warps: " << TMap::kWarpCount << "\n"; 120 | std::cout << " shape: (" << TMap::kDimC << ", " << TMap::kDimS << ")\n"; 121 | std::cout << " access: (" << TMap::kAccessC << ", " << 1 << ")\n"; 122 | std::cout << "warpThread: (" << TMap::kWarpThreadC << ", " << TMap::kWarpThreadS << ")\n"; 123 | std::cout << "warpAccess: (" << TMap::kWarpAccessC << ", " << TMap::kWarpAccessS << ")\n"; 124 | std::cout << " warpIter: (" << TMap::kWarpIterC << ", " << TMap::kWarpIterS << ")\n"; 125 | std::cout << " warp: (" << TMap::kWarpC << ", " << TMap::kWarpS << ")\n"; 126 | std::cout << " iter: (" << TMap::kIterC << ", " << TMap::kIterS << ")\n"; 127 | std::cout << " footprint: (" << TMap::kFootprintC << ", " << TMap::kFootprintS << ")\n"; 128 | std::cout << " delta: (" << TMap::kDeltaC << ", " << TMap::kDeltaS << ")\n"; 129 | std::cout << " partialC: " << TMap::kPartialC << "\n"; 130 | } 131 | 132 | } // namespace 133 | 134 | } // namespace turbomind 135 | -------------------------------------------------------------------------------- /example/modeling_turbomind.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Dict, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | import transformers 7 | from accelerate.big_modeling import (init_empty_weights, 8 | load_checkpoint_and_dispatch) 9 | from module import get_named_linears, set_op_by_name 10 | from tqdm import tqdm 11 | from transformers import AutoConfig, PretrainedConfig, PreTrainedModel 12 | from typing_extensions import Annotated, Doc 13 | 14 | # from turbomind import Linear 15 | import turbomind 16 | 17 | 18 | class TurbomindForCausalLM(nn.Module): 19 | 20 | def __init__( 21 | self, 22 | model, 23 | is_quantized, 24 | config, 25 | quant_config, 26 | ): 27 | """The base model for all AutoAWQ models. 28 | 29 | Args: 30 | model: The pretrained or quantized model. 31 | is_quantized: Indicates if the current model is quantized 32 | config: The config of the model. 33 | quant_config: The quantization config of the model. 34 | """ 35 | super().__init__() 36 | self.model: PreTrainedModel = model 37 | self.is_quantized: bool = is_quantized 38 | self.search_result = None 39 | self.config: PretrainedConfig = config 40 | self.quant_config = quant_config 41 | 42 | def to(self, device: Annotated[str, 43 | Doc('The device to move your model to.')]): 44 | """A utility function for moving the model to a device.""" 45 | return self.model.to(device) 46 | 47 | def forward(self, *args, **kwargs): 48 | """A forward function that mimics the torch forward.""" 49 | return self.model(*args, **kwargs) 50 | 51 | def generate(self, *args, **kwargs): 52 | """A generate function that mimics the HF generate function.""" 53 | with torch.inference_mode(): 54 | return self.model.generate(*args, **kwargs) 55 | 56 | @classmethod 57 | def from_quantized(self, 58 | model_path: str, 59 | torch_dtype: torch.dtype = torch.float16, 60 | device_map: Union[str, Dict] = 'balanced', 61 | **config_kwargs: Dict): 62 | """A method for initialization of a quantized model, usually in INT4. 63 | 64 | Args: 65 | model_path (str): The model path 66 | max_seq_len (int): The maximum sequence cached sequence length of 67 | the model. Larger values may increase loading time and 68 | memory usage. 69 | torch_dtype: The dtype to load the model as. May not work with 70 | other values than float16. 71 | device_map: A device map that will be passed onto the model 72 | loading method from transformers. 73 | **config_kwargs: Additional kwargs that are passed to the config 74 | during initialization 75 | """ 76 | config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) 77 | quant_config = config.quantization_config 78 | 79 | target_cls = getattr(transformers, config.architectures[0]) 80 | 81 | # Load model 82 | with init_empty_weights(): 83 | model = target_cls._from_config(config=config, 84 | torch_dtype=torch_dtype) 85 | # Prepare quantized linear layers, replace nn.Linear 86 | self._load_quantized_modules( 87 | self, 88 | model, 89 | quant_config, 90 | ) 91 | 92 | model.tie_weights() 93 | 94 | # loads the weights into modules and distributes 95 | # across available devices automatically 96 | load_checkpoint_and_dispatch( 97 | model, 98 | checkpoint=model_path, 99 | device_map=device_map, 100 | no_split_module_classes=[model.model.layers[0].__class__.__name__], 101 | dtype=torch_dtype, 102 | ) 103 | 104 | # model = turbomind_post_init(model) 105 | for _, submodule in model.named_modules(): 106 | if isinstance(submodule, turbomind.Linear): 107 | submodule.post_init() 108 | 109 | model.eval() 110 | 111 | return self( 112 | model, 113 | is_quantized=True, 114 | config=config, 115 | quant_config=quant_config, 116 | ) 117 | 118 | def _load_quantized_modules(self, model, quant_config): 119 | assert quant_config['quant_method'] in ['awq', 'gptq'] 120 | if quant_config['quant_method'] == 'awq': 121 | assert quant_config['version'] == 'gemm' 122 | 123 | # Get blocks of model 124 | layers = model.model.layers 125 | 126 | for i in tqdm(range(len(layers)), desc='Replacing layers...'): 127 | layer = layers[i] 128 | 129 | # Get every linear layer in a block 130 | named_linears = get_named_linears(layer) 131 | 132 | # # Filter out the linear layers we don't want to include 133 | # named_linears = exclude_layers_to_not_quantize( 134 | # named_linears, quant_config.modules_to_not_convert) 135 | 136 | # Replace nn.Linear with turbomind Linear 137 | for name, module in named_linears.items(): 138 | q_linear_module = turbomind.Linear 139 | q_linear = q_linear_module.from_linear( 140 | module, quant_config['bits'], quant_config['group_size'], 141 | quant_config['quant_method'], True) 142 | q_linear.to(next(layer.parameters()).device) 143 | set_op_by_name(layer, name, q_linear) 144 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h" 4 | #include "src/turbomind/kernels/gemm/registry.h" 5 | #include "src/turbomind/kernels/gemm/transform.h" 6 | #include "src/turbomind/kernels/gemm/types.h" 7 | 8 | namespace turbomind::gemm { 9 | 10 | void Registry::f16_u4g128_f16_tnt_sm80_s16816() 11 | { 12 | using namespace sm80_s16816; 13 | using namespace cache_policy; 14 | using S = cache_policy::Stream; 15 | using D = cache_policy::Default; 16 | 17 | using C = Sm80_s16816, // A 19 | Transform_Default, // tarnsform A 20 | VoidOperand, // U 21 | Operand_B_Pack, // B 22 | Transform_HMMA_16816<1, 0>, // transform B 23 | Operand_UV_Pack, // V 24 | kRowMajor, // order_C 25 | half>; // Tc 26 | 27 | // clang-format off 28 | // Add>(); // 0/0 29 | Add>(); // 30/3 30 | Add>(); // --/20 31 | Add>(); // --/13 32 | Add>(); // 21/13 33 | Add>(); // 6/6 34 | 35 | Add>(); // --/3 36 | Add>(); // 13/13 37 | Add>(); // 14/10 38 | Add>(); // 2/2 39 | 40 | Add>(); // --/21 41 | Add>(); // 27/13 42 | Add>(); // 8/5 43 | Add>(); // 7/5 44 | Add>(); // 6/7 45 | Add>(); 46 | 47 | Add>(); // 1/1 48 | Add>(); // 1/1 49 | Add>(); // 4/4 50 | Add>(); 51 | 52 | Add>(); 53 | Add>(); 54 | Add>(); 55 | Add>(); 56 | Add>(); 57 | 58 | Add>(); 59 | Add>(); 60 | Add>(); 61 | Add>(); 62 | Add>(); 63 | // clang-format on 64 | } 65 | 66 | // sm80_f16_u4g128_f16_ttt_128x256x32_4_s16816_1x8x1_c128x128_a1x32x32_00: 46 67 | // sm80_f16_u4g128_f16_ttt_128x128x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 27 68 | // sm80_f16_u4g128_f16_ttt_64x256x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 21 69 | // sm80_f16_u4g128_f16_ttt_64x256x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 19 70 | // sm80_f16_u4g128_f16_ttt_16x128x128_4_s16816_1x4x2_c16x128_a1x32x128_01: 17 71 | // sm80_f16_u4g128_f16_ttt_32x128x128_3_s16816_1x4x2_c32x128_a1x32x128_01: 16 72 | // sm80_f16_u4g128_f16_ttt_64x128x128_3_s16816_1x4x2_c64x128_a1x32x128_01: 16 73 | // sm80_f16_u4g128_f16_ttt_96x128x32_4_s16816_1x4x1_c96x128_a1x32x32_01: 16 74 | // sm80_f16_u4g128_f16_ttt_96x256x32_4_s16816_1x8x1_c96x256_a1x32x32_00: 15 75 | // sm80_f16_u4g128_f16_ttt_16x64x128_3_s16816_1x2x2_c16x64_a1x32x128_01: 13 76 | // sm80_f16_u4g128_f16_ttt_16x128x64_4_s16816_1x4x1_c16x128_a1x32x64_01: 13 77 | // sm80_f16_u4g128_f16_ttt_48x128x128_3_s16816_1x4x2_c48x128_a1x32x128_01: 13 78 | // sm80_f16_u4g128_f16_ttt_48x256x64_3_s16816_1x4x1_c48x128_a1x32x64_01: 13 79 | // sm80_f16_u4g128_f16_ttt_16x64x128_4_s16816_1x2x2_c16x64_a1x32x128_01: 11 80 | // sm80_f16_u4g128_f16_ttt_64x128x64_3_s16816_1x4x1_c64x128_a1x32x64_01: 9 81 | // sm80_f16_u4g128_f16_ttt_128x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 9 82 | // sm80_f16_u4g128_f16_ttt_96x128x128_3_s16816_1x4x2_c96x128_a1x32x128_01: 7 83 | // sm80_f16_u4g128_f16_ttt_96x256x32_3_s16816_1x8x1_c96x256_a1x32x32_01: 7 84 | // sm80_f16_u4g128_f16_ttt_48x128x64_4_s16816_1x4x1_c48x128_a1x32x64_01: 6 85 | // sm80_f16_u4g128_f16_ttt_32x64x128_4_s16816_1x2x2_c32x64_a1x32x128_01: 5 86 | // sm80_f16_u4g128_f16_ttt_32x256x64_3_s16816_1x4x1_c32x256_a1x32x64_01: 5 87 | // sm80_f16_u4g128_f16_ttt_64x64x64_6_s16816_1x2x2_c64x64_a1x32x64_01: 5 88 | // sm80_f16_u4g128_f16_ttt_16x128x128_3_s16816_1x4x2_c16x128_a1x32x128_01: 4 89 | // sm80_f16_u4g128_f16_ttt_32x128x64_4_s16816_1x4x1_c32x128_a1x32x64_01: 4 90 | // sm80_f16_u4g128_f16_ttt_48x64x128_4_s16816_1x2x2_c48x64_a1x32x128_01: 4 91 | // sm80_f16_u4g128_f16_ttt_64x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 4 92 | // sm80_f16_u4g128_f16_ttt_128x128x64_3_s16816_1x4x2_c64x128_a1x32x64_01: 4 93 | // sm80_f16_u4g128_f16_ttt_128x256x32_3_s16816_1x8x1_c128x128_a1x32x32_00: 4 94 | // sm80_f16_u4g128_f16_ttt_32x64x128_3_s16816_1x2x2_c32x64_a1x32x128_01: 3 95 | // sm80_f16_u4g128_f16_ttt_128x256x64_3_s16816_1x8x1_c128x256_a1x32x64_01: 0 96 | 97 | } // namespace turbomind::gemm 98 | --------------------------------------------------------------------------------