├── README_zh-CN.md
├── README.md
├── requirements
    ├── runtime.txt
    ├── build.txt
    └── test.txt
├── requirements.txt
├── src
    └── turbomind
    │   ├── CMakeLists.txt
    │   ├── utils
    │       ├── CMakeLists.txt
    │       ├── macro.h
    │       ├── parser.h
    │       ├── parser.cc
    │       └── tensor.h
    │   ├── kernels
    │       ├── gemm
    │       │   ├── gpu_metric.h
    │       │   ├── simt.h
    │       │   ├── tuner
    │       │   │   ├── cache_utils.h
    │       │   │   ├── stopping_criterion.h
    │       │   │   ├── sampler.h
    │       │   │   ├── cache_utils.cu
    │       │   │   ├── stats.h
    │       │   │   ├── params.h
    │       │   │   ├── stopping_criterion.cc
    │       │   │   ├── measurer.h
    │       │   │   ├── measurer.cu
    │       │   │   ├── sampler.cu
    │       │   │   └── params.cc
    │       │   ├── test
    │       │   │   ├── reference.h
    │       │   │   ├── quantization.h
    │       │   │   ├── models.h
    │       │   │   ├── test_utils.h
    │       │   │   ├── quantization.cu
    │       │   │   ├── gemm_bench.cu
    │       │   │   ├── gemm_test.cu
    │       │   │   └── reference.cu
    │       │   ├── dispatch_cache.h
    │       │   ├── arch.h
    │       │   ├── predicate.h
    │       │   ├── registry.h
    │       │   ├── cast.h
    │       │   ├── registry.cu
    │       │   ├── kernel
    │       │   │   ├── f16_u4g128_f16_tnt_sm75_simt.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm75_s16816.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm90_s16816.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm70_s884.cu
    │       │   │   └── f16_u4g128_f16_tnt_sm80_s16816.cu
    │       │   ├── arch
    │       │   │   ├── mma_simt.h
    │       │   │   ├── mma_sm80.h
    │       │   │   ├── mma_sm70.h
    │       │   │   ├── smem_copy_simt.h
    │       │   │   ├── config_sm70_s884.h
    │       │   │   ├── config_sm75_s16816.h
    │       │   │   ├── config_sm80_s16816.h
    │       │   │   ├── config_simt.h
    │       │   │   ├── smem_copy_sm70.h
    │       │   │   ├── operand_sm70_s884.h
    │       │   │   └── operand_simt.h
    │       │   ├── format.h
    │       │   ├── gemm.h
    │       │   ├── operand.h
    │       │   ├── CMakeLists.txt
    │       │   ├── desc.h
    │       │   ├── iterator.h
    │       │   ├── unpack.cu
    │       │   ├── cta_map.h
    │       │   ├── kernel.h
    │       │   ├── utils.h
    │       │   ├── thread_group_map.h
    │       │   ├── transform.h
    │       │   ├── gpu_metric.cu
    │       │   └── types.h
    │       └── core
    │       │   ├── pipe_iter.h
    │       │   ├── math.h
    │       │   ├── meta.h
    │       │   ├── sub_byte_ptr.h
    │       │   ├── sync.h
    │       │   ├── common.h
    │       │   ├── data_type.h
    │       │   ├── smem.h
    │       │   ├── array.h
    │       │   ├── layout.h
    │       │   └── thread_map.h
    │   └── api
    │       └── python
    │           ├── linear.h
    │           └── CMakeLists.txt
├── MANIFEST.in
├── turbomind
    ├── __init__.py
    ├── version.py
    └── utils.py
├── generate.sh
├── .gitignore
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── 3-documentation.yml
    │   ├── 1-feature-request.yml
    │   └── 2-bug-report.yml
    ├── workflows
    │   ├── lint.yml
    │   ├── windows-x64-gpu.yml
    │   ├── linux-x64-gpu.yml
    │   ├── pypi.yml
    │   └── cuda11.8-whl-release.yml
    └── md-link-config.json
├── example
    ├── generate.py
    ├── module.py
    ├── test_linear.py
    └── modeling_turbomind.py
├── .pre-commit-config.yaml
└── .clang-format


/README_zh-CN.md:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # turbomind
2 | 


--------------------------------------------------------------------------------
/requirements/runtime.txt:
--------------------------------------------------------------------------------
1 | torch
2 | 


--------------------------------------------------------------------------------
/requirements/build.txt:
--------------------------------------------------------------------------------
1 | pybind11<=2.13.1
2 | setuptools
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime.txt
3 | 


--------------------------------------------------------------------------------
/src/turbomind/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(utils)
2 | add_subdirectory(kernels/gemm)
3 | add_subdirectory(api/python)
4 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | 
2 | include turbomind/lib/*.so
3 | include turbomind/lib/*.so*
4 | include turbomind/lib/*.dll
5 | include turbomind/lib/*.pyd
6 | 


--------------------------------------------------------------------------------
/turbomind/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 
3 | from .linear import Linear
4 | 
5 | __all__ = ['Linear']
6 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | add_library(parser STATIC parser.cc)
4 | set_property(TARGET parser PROPERTY POSITION_INDEPENDENT_CODE  ON)
5 | 


--------------------------------------------------------------------------------
/requirements/test.txt:
--------------------------------------------------------------------------------
 1 | allure-pytest
 2 | coverage
 3 | pynvml
 4 | pytest
 5 | pytest-assume
 6 | pytest-order
 7 | pytest-rerunfailures
 8 | pytest-sugar
 9 | pytest-xdist
10 | pyyaml
11 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/macro.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
 4 | 
 5 | #define __PRETTY_FUNCTION__ __FUNCSIG__
 6 | 
 7 | #endif
 8 | 
 9 | typedef unsigned int uint;
10 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/gpu_metric.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | 
 7 | namespace turbomind::gemm {
 8 | 
 9 | // bytes / second
10 | float MeasureL2CacheThroughput();
11 | 
12 | // fused multiply-add / second
13 | float MeasureMmaThroughput(int proble_size = 16384);
14 | 
15 | }  // namespace turbomind::gemm
16 | 


--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | WORKSPACE_PATH=$(dirname "$(readlink -f "$0")")
 3 | 
 4 | builder="-G Ninja"
 5 | 
 6 | if [ "$1" == "make" ]; then
 7 |     builder=""
 8 | fi
 9 | 
10 | cmake ${builder} .. \
11 |     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
12 |     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
13 |     -DCMAKE_INSTALL_PREFIX=${WORKSPACE_PATH}/install \
14 |     -DCMAKE_CUDA_FLAGS="-lineinfo" \
15 |     -DUSE_NVTX=ON \
16 |     -DBUILD_TEST=OFF
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/simt.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::gemm::simt {
 6 | 
 7 | // constexpr int OP_M = 2;
 8 | // constexpr int OP_N = 16;
 9 | // constexpr int OP_K = 4;
10 | 
11 | // constexpr int OP_M = 4;
12 | // constexpr int OP_N = 8;
13 | // constexpr int OP_K = 8;
14 | 
15 | constexpr int OP_M = 1;
16 | constexpr int OP_N = 32;
17 | constexpr int OP_K = 8;
18 | 
19 | }  // namespace turbomind::gemm::simt
20 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/cache_utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | 
 7 | namespace turbomind::gemm {
 8 | 
 9 | class CacheFlushing {
10 | public:
11 |     static void flush(cudaStream_t stream = {});
12 | 
13 | private:
14 |     CacheFlushing();
15 |     void operator()(cudaStream_t stream) const;
16 | 
17 |     uint32_t* buffer_;
18 |     size_t    size_;
19 | };
20 | 
21 | }  // namespace turbomind::gemm
22 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stopping_criterion.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/stats.h"
 4 | #include <memory>
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | class StoppingCriterion {
 9 | public:
10 |     virtual ~StoppingCriterion()                 = default;
11 |     virtual bool should_stop(const Stats& stats) = 0;
12 | };
13 | 
14 | std::unique_ptr<StoppingCriterion> CreateStoppingCriterion(int min_iter, int max_iter, float max_ms);
15 | 
16 | }  // namespace turbomind::gemm
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/pipe_iter.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | template<int Stages, int Step = 1>
 8 | struct PipeIter {
 9 |     static constexpr int kMaxStep = Stages * Step;
10 | 
11 |     int r = 0;
12 |     int w = kMaxStep - Step;
13 | 
14 |     __inline__ __device__ PipeIter& operator++()
15 |     {
16 |         w = r;
17 |         r += Step;
18 |         if (r == kMaxStep) {
19 |             r -= kMaxStep;
20 |         }
21 |         return *this;
22 |     }
23 | };
24 | 
25 | }  // namespace turbomind
26 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/reference.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | 
 7 | #include <cublas_v2.h>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | class Reference {
12 | public:
13 |     Reference();
14 |     ~Reference();
15 | 
16 |     void set_stream(cudaStream_t stream);
17 | 
18 |     void gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc);
19 | 
20 | private:
21 |     cublasHandle_t handle_;
22 | };
23 | 
24 | }  // namespace turbomind::gemm
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | 
35 | .cache
36 | /build
37 | 
38 | # Byte-compiled / optimized / DLL files
39 | __pycache__/
40 | .vscode/
41 | 
42 | # Distribution / packaging
43 | .eggs/
44 | wheels/
45 | *.egg-info/
46 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to the documentation.
 3 | labels: "kind/doc,status/unconfirmed"
 4 | title: "[Docs] "
 5 | 
 6 | body:
 7 | - type: textarea
 8 |   attributes:
 9 |     label: 📚 The doc issue
10 |     description: >
11 |       A clear and concise description the issue.
12 |   validations:
13 |     required: true
14 | 
15 | - type: textarea
16 |   attributes:
17 |     label: Suggest a potential alternative/fix
18 |     description: >
19 |       Tell us how we could improve the documentation in this regard.
20 | - type: markdown
21 |   attributes:
22 |     value: >
23 |       Thanks for contributing 🎉!
24 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/sampler.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/desc.h"
 6 | #include "src/turbomind/kernels/gemm/tuner/measurer.h"
 7 | 
 8 | #include <vector>
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | class Sampler {
13 | public:
14 |     explicit Sampler(Measurer& measurer, int k_clusters): measurer_{measurer}, k_clusters_{k_clusters} {}
15 | 
16 |     std::vector<LaunchSpec> Run(std::vector<LaunchSpec> specs, const Launcher& launcher, cudaStream_t stream);
17 | 
18 | private:
19 |     Measurer& measurer_;
20 |     int       k_clusters_;
21 | };
22 | 
23 | }  // namespace turbomind::gemm
24 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/cache_utils.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h"
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | CacheFlushing::CacheFlushing()
 8 | {
 9 |     cudaDeviceProp props{};
10 |     cudaGetDeviceProperties(&props, 0);
11 | 
12 |     size_ = props.l2CacheSize;
13 | 
14 |     cudaMalloc(&buffer_, size_);
15 | }
16 | 
17 | void CacheFlushing::flush(cudaStream_t stream)
18 | {
19 |     thread_local CacheFlushing inst{};
20 |     inst(stream);
21 | }
22 | 
23 | void CacheFlushing::operator()(cudaStream_t stream) const
24 | {
25 |     cudaMemsetAsync(buffer_, 0, size_, stream);
26 | }
27 | 
28 | }  // namespace turbomind::gemm
29 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/math.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/common.h"
 6 | #include <cassert>
 7 | 
 8 | namespace turbomind {
 9 | 
10 | template<class T>
11 | TM_HOST_DEVICE constexpr T ceil_div(T a, T b)
12 | {
13 |     return (a + b - 1) / b;
14 | }
15 | 
16 | template<class T>
17 | TM_HOST_DEVICE constexpr T round_up(T a, T b)
18 | {
19 |     return (a + b - 1) / b * b;
20 | }
21 | 
22 | template<class T>
23 | TM_HOST_DEVICE constexpr T log2(T x)
24 | {
25 |     T n = 0;
26 |     while (x != 1) {
27 |         x /= 2;
28 |         ++n;
29 |     }
30 |     return n;
31 | }
32 | 
33 | // static_assert(log2(65536) == 16);
34 | // static_assert(log2(32) == 5);
35 | // static_assert(log2(1) == 0);
36 | 
37 | }  // namespace turbomind
38 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/dispatch_cache.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/desc.h"
 4 | 
 5 | #include <memory>
 6 | #include <optional>
 7 | #include <vector>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | class DispatchCache {
12 | public:
13 |     DispatchCache(std::vector<Kernel*> kernels);
14 | 
15 |     ~DispatchCache();
16 | 
17 |     std::optional<LaunchSpec> LowerBound(const GemmDesc& desc) const;
18 | 
19 |     std::optional<LaunchSpec> Find(const GemmDesc& desc) const;
20 | 
21 |     bool Insert(const GemmDesc& desc, const LaunchSpec& spec);
22 | 
23 |     int Export(std::ostream& os) const;
24 | 
25 |     int Import(std::istream& is);
26 | 
27 | private:
28 |     struct Impl;
29 |     std::unique_ptr<Impl> impl_;
30 | };
31 | 
32 | }  // namespace turbomind::gemm
33 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/parser.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str);
 9 | 
10 | std::vector<std::string> ParseListOrTuple(const std::string& str);
11 | 
12 | inline void Parse(int& value, const std::string& str)
13 | {
14 |     value = std::stoi(str);
15 | }
16 | 
17 | inline void Parse(float& value, const std::string& str)
18 | {
19 |     value = std::stof(str);
20 | }
21 | 
22 | template<class T>
23 | void Parse(std::vector<T>& xs, const std::string& str)
24 | {
25 |     const auto ss = ParseListOrTuple(str);
26 |     for (const auto& s : ss) {
27 |         xs.emplace_back();
28 |         Parse(xs.back(), s);
29 |     }
30 | }
31 | 
32 | }  // namespace turbomind
33 | 


--------------------------------------------------------------------------------
/turbomind/version.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Tuple
 3 | 
 4 | __version__ = '0.0.2'
 5 | short_version = __version__
 6 | 
 7 | 
 8 | def parse_version_info(version_str: str) -> Tuple:
 9 |     """Parse version from a string.
10 | 
11 |     Args:
12 |         version_str (str): A string represents a version info.
13 | 
14 |     Returns:
15 |         tuple: A sequence of integer and string represents version.
16 |     """
17 |     _version_info = []
18 |     for x in version_str.split('.'):
19 |         if x.isdigit():
20 |             _version_info.append(int(x))
21 |         elif x.find('rc') != -1:
22 |             patch_version = x.split('rc')
23 |             _version_info.append(int(patch_version[0]))
24 |             _version_info.append(f'rc{patch_version[1]}')
25 |     return tuple(_version_info)
26 | 
27 | 
28 | version_info = parse_version_info(__version__)
29 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/quantization.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/types.h"
 4 | #include <thrust/device_vector.h>
 5 | #include <thrust/universal_vector.h>
 6 | 
 7 | #pragma once
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | template<class D, class S>
12 | void Quantize(const thrust::universal_vector<S>&  x,
13 |               int                                 m,
14 |               int                                 k,
15 |               Order                               order,
16 |               int                                 group_size,
17 |               thrust::universal_vector<S>&        x_p,  // pseudo-quantized
18 |               thrust::universal_vector<uint16_t>& x_q,  // quantized ushort
19 |               thrust::universal_vector<S>&        x_u,  // scales & zeros (always m-major)
20 |               cudaStream_t                        stream);
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/src/turbomind/api/python/linear.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | #include "src/turbomind/utils/tensor.h"
 7 | #include <cuda_runtime.h>
 8 | #include <istream>
 9 | #include <memory>
10 | #include <ostream>
11 | 
12 | namespace turbomind {
13 | 
14 | enum class WeightType : int
15 | {
16 |     kFP32,
17 |     kFP16,
18 |     kFP8,  // not supported yet
19 |     kBF16,
20 |     kINT8,
21 |     kINT4
22 | };
23 | 
24 | class Linear {
25 | public:
26 |     Linear(size_t input_dims, size_t output_dims, int w_bit, int group_size);
27 |     void post_init(std::shared_ptr<Tensor> qweight, const Tensor& scales, const Tensor& qzeros, bool simt);
28 |     void forward(const Tensor& in, Tensor& out, cudaStream_t stream = nullptr);
29 |     ~Linear() {}
30 | 
31 |     static void clearWorkspaces();
32 | 
33 | private:
34 |     struct Impl;
35 |     std::shared_ptr<Impl> impl_;
36 | };
37 | };  // namespace turbomind
38 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stats.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <limits>
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | class Stats {
 8 | public:
 9 |     Stats(): count_{}, mean_{}, m2_{} {}
10 | 
11 |     float mean() const noexcept
12 |     {
13 |         return mean_;
14 |     }
15 | 
16 |     float sum() const noexcept
17 |     {
18 |         return mean_ * count_;
19 |     }
20 | 
21 |     int count() const noexcept
22 |     {
23 |         return count_;
24 |     }
25 | 
26 |     float get_variance() const noexcept
27 |     {
28 |         return count_ < 2 ? std::numeric_limits<float>::quiet_NaN() : m2_ / count_;
29 |     }
30 | 
31 |     void add_sample(float x) noexcept
32 |     {
33 |         ++count_;
34 |         float delta = x - mean_;
35 |         mean_ += delta / count_;
36 |         float delta2 = x - mean_;
37 |         m2_ += delta * delta2;
38 |     }
39 | 
40 | private:
41 |     int   count_;
42 |     float mean_;
43 |     float m2_;
44 | };
45 | 
46 | }  // namespace turbomind::gemm
47 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/meta.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | template<class T>
 8 | struct basic_type {
 9 |     using type = T;
10 | };
11 | 
12 | template<class T>
13 | constexpr basic_type<T> type_c{};
14 | 
15 | template<auto v>
16 | struct constant {
17 |     using type       = constant;
18 |     using value_type = decltype(v);
19 | 
20 |     static constexpr value_type value = v;
21 | 
22 |     constexpr value_type operator()() const noexcept
23 |     {
24 |         return v;
25 |     }
26 |     constexpr operator value_type() const noexcept
27 |     {
28 |         return v;
29 |     }
30 | };
31 | 
32 | template<auto u, auto v>
33 | struct pair {
34 | };
35 | 
36 | template<auto u, auto v>
37 | constexpr auto first(pair<u, v>)
38 | {
39 |     return u;
40 | }
41 | 
42 | template<auto u, auto v>
43 | constexpr auto second(pair<u, v>)
44 | {
45 |     return v;
46 | }
47 | 
48 | template<auto u, auto v, auto w>
49 | struct triplet {
50 | };
51 | 
52 | }  // namespace turbomind
53 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/models.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | #include <utility>
 7 | #include <vector>
 8 | 
 9 | static const std::vector<std::pair<int64_t, int64_t>> config{
10 |     {11008 * 2, 4096}, {4096, 11008}, {12288, 4096}, {4096, 4096},  // llama2-7b
11 |     {14336 * 2, 4096}, {4096, 14336}, {6144, 4096},  {4096, 4096},  // llama3-8b / internlm2.5-7b
12 |     {16384 * 2, 6144}, {6144, 16384}, {8192, 6144},  {6144, 6144},  // internlm2-20b
13 |     {13696 * 2, 4096}, {4096, 13696}, {4608, 4096},  {4096, 4096},  // glm4-9b
14 |     {18944 * 2, 3584}, {3584, 18944}, {4608, 3584},  {3584, 3584},  // qwen2-7b
15 |     {20480 * 2, 7168}, {7168, 20480}, {9216, 7168},  {7168, 7168},  // yi-34b
16 |     {28672 * 2, 8192}, {8192, 28672}, {10240, 8192}, {8192, 8192},  // llama2-70b / llama3-70b
17 |     {29696 * 2, 8192}, {8192, 29696}, {10240, 8192}, {8192, 8192}   // qwen2-72b-instruct-awq
18 | };
19 | // {29568 * 2, 8192}, {8192, 29568}, {10240, 8192}, {8192, 8192},  // qwen2-72b
20 | 


--------------------------------------------------------------------------------
/src/turbomind/api/python/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | project(_turbomind_ext)
 5 | 
 6 | find_package(pybind11 CONFIG)
 7 | if(NOT pybind11_FOUND)
 8 |     execute_process(COMMAND "pybind11-config" "--cmakedir"
 9 |                     RESULT_VARIABLE _COMMAND_SUCCESS
10 |                     OUTPUT_VARIABLE pybind11_DIR
11 |                     OUTPUT_STRIP_TRAILING_WHITESPACE)
12 |     find_package(pybind11 CONFIG)
13 | endif()
14 | 
15 | pybind11_add_module(${PROJECT_NAME} bind.cpp linear.cc)
16 | target_link_libraries(${PROJECT_NAME} PRIVATE gemm2)
17 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
18 | 
19 | set(_INSTALL_CUDA_RPATH
20 |     "\$ORIGIN"
21 |     "\$ORIGIN/../../nvidia/nccl/lib/"
22 |     "\$ORIGIN/../../nvidia/cuda_runtime/lib/"
23 |     "\$ORIGIN/../../nvidia/cublas/lib/"
24 |     "\$ORIGIN/../../nvidia/curand/lib/"
25 | )
26 | set_target_properties(${PROJECT_NAME} PROPERTIES
27 |     BUILD_RPATH "\$ORIGIN"
28 |     INSTALL_RPATH "${_INSTALL_CUDA_RPATH}"
29 | )
30 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: lint
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   lint:
 7 |     runs-on: ubuntu-20.04
 8 |     steps:
 9 |       - uses: actions/checkout@v2
10 |       - name: Set up Python 3.8
11 |         uses: actions/setup-python@v2
12 |         with:
13 |           python-version: 3.8
14 |       - name: Install pre-commit hook
15 |         run: |
16 |           python -m pip install pre-commit
17 |           pre-commit install
18 |       - name: Linting
19 |         run: pre-commit run --all-files
20 |       - name: Format c/cuda codes with clang-format
21 |         uses: DoozyX/clang-format-lint-action@v0.13
22 |         with:
23 |           source: src
24 |           extensions: h,c,cpp,hpp,cu,cuh,cc
25 |           clangFormatVersion: 11
26 |           style: file
27 |       - name: Check markdown link
28 |         uses: gaurav-nelson/github-action-markdown-link-check@v1
29 |         with:
30 |           use-quiet-mode: 'yes'
31 |           use-verbose-mode: 'yes'
32 |           config-file: '.github/md-link-config.json'
33 |           file-path: './README.md, ./LICENSE, ./README_zh-CN.md'
34 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/params.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <array>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | struct TuningParams {
12 |     // Split-k params
13 |     int max_splits = 8;
14 |     int max_waves  = 10;
15 | 
16 |     // Swizzling params
17 |     std::vector<int> swizzle{3};
18 | 
19 |     // Sampling params
20 |     float top_k    = 0;
21 |     int   clusters = 5;
22 |     int   min_iter = 1;
23 |     int   max_iter = 10;
24 |     float max_time = 1.f;
25 | 
26 |     std::vector<int> seq;
27 | };
28 | 
29 | // example
30 | //   max_splits=8,top_splits=5,max_waves=16,top_k=10,swizzle=[2,3,4],clusters=5,max_iter=10,min_iter=1,max_time=10.0
31 | void ParseTuningParams(TuningParams& params, const std::string& str);
32 | 
33 | // example
34 | //   16-16-128,256-128-1024,8192
35 | std::vector<int> ParseTuningSequence(const std::string& str);
36 | 
37 | std::vector<int> GenerateTuningSequence(const std::vector<std::array<int, 3>>& generators);
38 | 
39 | std::vector<std::array<int, 3>> GetDefaultTuningGenerators();
40 | 
41 | }  // namespace turbomind::gemm
42 | 


--------------------------------------------------------------------------------
/.github/md-link-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "ignorePatterns": [
 3 |       {
 4 |         "pattern": "^https://www.reddit.com/"
 5 |       },
 6 |       {
 7 |         "pattern": "^https://developer.nvidia.com/"
 8 |       },
 9 |       {
10 |         "pattern": "^https://docs.openvino.ai/"
11 |       },
12 |       {
13 |         "pattern": "^https://developer.android.com/"
14 |       },
15 |       {
16 |         "pattern": "^https://developer.qualcomm.com/"
17 |       },
18 |       {
19 |         "pattern": "^http://localhost"
20 |       },
21 |       {
22 |         "pattern": "^https://twitter.com"
23 |       },
24 |       {
25 |         "pattern": "^https://platform.openai.com"
26 |       },
27 |       {
28 |         "pattern": "^http://0.0.0.0"
29 |       }
30 |     ],
31 |     "httpHeaders": [
32 |       {
33 |         "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"],
34 |         "headers": {
35 |           "Accept-Encoding": "zstd, br, gzip, deflate"
36 |         }
37 |       }
38 |     ],
39 |     "timeout": "20s",
40 |     "retryOn429": true,
41 |     "retryCount": 5,
42 |     "fallbackRetryDelay": "30s",
43 |     "aliveStatusCodes": [200, 206, 429]
44 |   }
45 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stopping_criterion.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h"
 4 | #include <memory>
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | namespace stopping_criterions {
 9 | 
10 | class Optimistic: public StoppingCriterion {
11 | public:
12 |     Optimistic(int min_iter, int max_iter, float max_ms)
13 |     {
14 |         min_iter_ = std::max(min_iter, 1);
15 |         max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits<int>::max();
16 |         max_ms_   = max_ms > 0 ? max_ms : std::numeric_limits<float>::infinity();
17 |     }
18 |     bool should_stop(const Stats& stats) override
19 |     {
20 |         return stats.count() >= min_iter_ && (stats.count() >= max_iter_ || stats.sum() >= max_ms_);
21 |     }
22 | 
23 | private:
24 |     int   min_iter_;
25 |     int   max_iter_;
26 |     float max_ms_;
27 | };
28 | 
29 | }  // namespace stopping_criterions
30 | 
31 | std::unique_ptr<StoppingCriterion> CreateStoppingCriterion(int min_iter, int max_iter, float max_ms)
32 | {
33 |     return std::make_unique<stopping_criterions::Optimistic>(min_iter, max_iter, max_ms);
34 | }
35 | 
36 | }  // namespace turbomind::gemm
37 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/parser.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <iostream>
 4 | #include <regex>
 5 | #include <string>
 6 | #include <vector>
 7 | 
 8 | namespace turbomind {
 9 | 
10 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str)
11 | {
12 |     const std::regex regex(R"((\w+)=([^,\[\(]+|\[.*\]|\(.*\)))");
13 | 
14 |     std::sregex_iterator beg(str.begin(), str.end(), regex);
15 |     std::sregex_iterator end{};
16 | 
17 |     std::vector<std::pair<std::string, std::string>> ret;
18 |     for (auto it = beg; it != end; ++it) {
19 |         std::smatch match = *it;
20 |         ret.emplace_back(match[1], match[2]);
21 |     }
22 | 
23 |     return ret;
24 | }
25 | 
26 | std::vector<std::string> ParseListOrTuple(const std::string& str)
27 | {
28 |     const std::regex regex(R"([,\[\]\(\)]+)");
29 | 
30 |     std::vector<std::string> ret;
31 |     std::copy_if(std::sregex_token_iterator(str.begin(), str.end(), regex, -1),
32 |                  std::sregex_token_iterator{},
33 |                  std::back_inserter(ret),
34 |                  [](const std::string& s) { return !s.empty(); });
35 | 
36 |     return ret;
37 | }
38 | 
39 | }  // namespace turbomind
40 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | // tags for dispatching & conditional codegen
 8 | 
 9 | template<int Begin, int End = -1>
10 | struct Arch {
11 |     static constexpr bool is_compatible(int arch)
12 |     {
13 |         return Begin <= arch && (End == -1 || arch < End);
14 |     }
15 | };
16 | 
17 | struct Sm70: Arch<700, 750> {
18 |     static constexpr int value = 700;
19 | };
20 | 
21 | struct Sm75: Arch<750, 800> {
22 |     static constexpr int value = 750;
23 | };
24 | 
25 | struct Sm80: Arch<800, 900> {
26 |     static constexpr int value = 800;
27 | };
28 | 
29 | struct Sm90: Arch<900> {
30 |     static constexpr int value = 900;
31 | };
32 | 
33 | inline bool is_arch_compatible(int karch, int darch)
34 | {
35 |     switch (karch) {
36 |         case 700:
37 |             return Sm70::is_compatible(darch);
38 |         case 750:
39 |             return Sm75::is_compatible(darch);
40 |         case 800:
41 |             return Sm80::is_compatible(darch);
42 |         case 900:
43 |             return Sm90::is_compatible(darch);
44 |         default:
45 |             return false;
46 |     }
47 | }
48 | 
49 | }  // namespace turbomind::gemm
50 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/predicate.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | #include <type_traits>
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | template<int S, int C, bool AlignedS, bool AlignedC>
11 | struct Predicate {
12 | 
13 |     static constexpr int kSizeC = AlignedC ? 1 : C;
14 | 
15 |     static_assert(S * kSizeC <= 32);
16 | 
17 |     static constexpr bool is_active = true;
18 | 
19 |     uint32_t pred_{};
20 | 
21 |     __device__ int operator()(int s, int c) const
22 |     {
23 |         return (pred_ & (1 << (s * kSizeC + c))) != 0;
24 |     }
25 | 
26 |     __device__ void set(int s, int c)
27 |     {
28 |         pred_ |= (1 << (s * kSizeC + c));
29 |     }
30 | 
31 |     __device__ void clear()
32 |     {
33 |         pred_ = 0;
34 |     }
35 | };
36 | 
37 | template<int S, int C>
38 | struct Predicate<S, C, true, true> {
39 | 
40 |     static constexpr bool is_active = false;
41 | 
42 |     __device__ constexpr std::integral_constant<int, 1> operator()(int, int) const
43 |     {
44 |         return {};
45 |     }
46 | 
47 |     __device__ void set(int, int) {}
48 | 
49 |     __device__ void clear()
50 |     {
51 |         // pred_ = 0;
52 |     }
53 | };
54 | 
55 | }  // namespace turbomind::gemm
56 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/registry.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/kernel_impl.h"
 6 | #include <memory>
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | class Registry {
11 | public:
12 |     explicit Registry(std::shared_ptr<cudaDeviceProp> device_prop);
13 | 
14 |     template<class Config>
15 |     [[maybe_unused]] bool Add()
16 |     {
17 |         return Add(std::make_unique<KernelImpl<typename Config::Kernel>>());
18 |     }
19 | 
20 |     [[nodiscard]] const std::vector<Kernel*>& kernels() const
21 |     {
22 |         return ptrs_;
23 |     }
24 | 
25 | private:
26 |     bool Add(std::unique_ptr<Kernel> kernel);
27 | 
28 |     void f16_u4g128_f16_tnt_sm70_s884();
29 |     void f16_u4g128_f16_tnt_sm75_simt();
30 |     void f16_u4g128_f16_tnt_sm75_s16816();
31 |     void f16_u4g128_f16_tnt_sm80_s16816();
32 |     void f16_u4g128_f16_tnt_sm90_s16816();
33 | 
34 |     void u4g128_f16_f16_nnn_sm80_s16816();
35 | 
36 | private:
37 |     std::shared_ptr<cudaDeviceProp>      device_prop_;
38 |     int                                  arch_;
39 |     std::vector<std::unique_ptr<Kernel>> kernels_;
40 |     std::vector<Kernel*>                 ptrs_;
41 | };
42 | 
43 | }  // namespace turbomind::gemm
44 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/measurer.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/desc.h"
 4 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h"
 5 | #include <climits>
 6 | #include <functional>
 7 | #include <memory>
 8 | #include <vector>
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | struct Measurement {
13 |     cudaError_t status;
14 |     int         sample_count;
15 |     float       mean;
16 |     float       variance;
17 | };
18 | 
19 | using Launcher = std::function<int(LaunchSpec, cudaStream_t)>;
20 | 
21 | class Measurer {
22 | public:
23 |     Measurer(std::unique_ptr<StoppingCriterion> stop_criterion);
24 | 
25 |     ~Measurer();
26 | 
27 |     std::vector<Measurement>
28 |     Measure(const std::vector<LaunchSpec>& specs, const Launcher& launcher, cudaStream_t stream);
29 | 
30 | private:
31 |     Measurement MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream);
32 | 
33 |     std::pair<float, cudaError_t> ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream);
34 | 
35 | private:
36 |     cudaEvent_t                        ev_beg_;
37 |     cudaEvent_t                        ev_end_;
38 |     std::unique_ptr<StoppingCriterion> stop_criterion_;
39 | };
40 | 
41 | }  // namespace turbomind::gemm
42 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1-feature-request.yml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Suggest an idea for this project
 3 | title: "[Feature] "
 4 | 
 5 | body:
 6 | - type: markdown
 7 |   attributes:
 8 |     value: |
 9 |       We strongly appreciate you creating a PR to implement this feature [here](https://github.com/InternLM/lmdeploy/pulls)!
10 |       If you need our help, please fill in as much of the following form as you're able to.
11 | 
12 |       **The less clear the description, the longer it will take to solve it.**
13 | - type: textarea
14 |   attributes:
15 |     label: Motivation
16 |     description: |
17 |       A clear and concise description of the motivation of the feature.
18 |       Ex1. It is inconvenient when \[....\].
19 |   validations:
20 |     required: true
21 | - type: textarea
22 |   attributes:
23 |     label: Related resources
24 |     description: |
25 |       If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
26 | - type: textarea
27 |   attributes:
28 |     label: Additional context
29 |     description: |
30 |       Add any other context or screenshots about the feature request here.
31 |       If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.
32 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/sub_byte_ptr.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/data_type.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | template<class T>
10 | struct SubBytePtr {
11 | 
12 |     constexpr SubBytePtr() = default;
13 | 
14 |     constexpr __host__ __device__ explicit SubBytePtr(T* ptr): ptr_((char*)ptr) {}
15 | 
16 |     constexpr __host__ __device__ SubBytePtr(char* ptr): ptr_(ptr) {}
17 | 
18 |     __device__ T& operator[](int i)
19 |     {
20 |         return *reinterpret_cast<T*>(ptr_ + i * bitsof<T> / bitsof<char>);
21 |     }
22 | 
23 |     friend __device__ SubBytePtr operator+(const SubBytePtr a, int n)
24 |     {
25 |         return SubBytePtr{a.ptr_ + n * bitsof<T> / bitsof<char>};
26 |     }
27 | 
28 |     friend __device__ SubBytePtr operator+(int n, const SubBytePtr a)
29 |     {
30 |         return a + n;
31 |     }
32 | 
33 |     friend __device__ bool operator==(const SubBytePtr& a, const SubBytePtr& b)
34 |     {
35 |         return a.ptr_ == b.ptr_;
36 |     }
37 | 
38 |     __device__ explicit operator T*() const
39 |     {
40 |         return (T*)ptr_;
41 |     }
42 | 
43 |     char* ptr_;
44 | };
45 | 
46 | template<class T>
47 | struct get_pointer_type_t<T, std::enable_if_t<bitsof<T> % 8 != 0>> {
48 |     using type = SubBytePtr<T>;
49 | };
50 | 
51 | }  // namespace turbomind
52 | 


--------------------------------------------------------------------------------
/example/generate.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from modeling_turbomind import TurbomindForCausalLM
 3 | from transformers import AutoTokenizer, TextStreamer
 4 | 
 5 | quant_path = '/models/140/llama3/Meta-Llama-3-8B-Instruct-hf-AWQ'
 6 | 
 7 | # Load model
 8 | model = TurbomindForCausalLM.from_quantized(quant_path)
 9 | tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
10 | streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
11 | 
12 | prompt = "You're standing on the surface of the Earth. "\
13 |         'You walk one mile south, one mile west and one mile north. '\
14 |         'You end up exactly where you started. Where are you?'
15 | 
16 | chat = [
17 |     {
18 |         'role': 'system',
19 |         'content': 'You are a concise assistant that helps answer questions.'
20 |     },
21 |     {
22 |         'role': 'user',
23 |         'content': prompt
24 |     },
25 | ]
26 | 
27 | terminators = [
28 |     tokenizer.eos_token_id,
29 |     tokenizer.convert_tokens_to_ids('<|eot_id|>')
30 | ]
31 | 
32 | tokens = tokenizer.apply_chat_template(chat, return_tensors='pt')
33 | tokens = tokens.to(torch.device('cuda'))
34 | 
35 | # Generate output
36 | generation_output = model.generate(tokens,
37 |                                    streamer=streamer,
38 |                                    max_new_tokens=64,
39 |                                    eos_token_id=terminators)
40 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/test_utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/utils/macro.h"
 6 | #include <cuda_fp16.h>
 7 | #include <memory>
 8 | #include <vector>
 9 | 
10 | namespace turbomind {
11 | 
12 | template<typename T>
13 | void Compare(const T* src,
14 |              const T* ref,
15 |              size_t   stride,
16 |              int      dims,
17 |              int      bsz,
18 |              bool     show = false,
19 |              float    rtol = 1e-2,
20 |              float    atol = 1e-4);
21 | 
22 | template<class T>
23 | std::vector<float>
24 | FastCompare(const T* src, const T* ref, int dims, int bsz, cudaStream_t stream, float rtol = 1e-2, float atol = 1e-4);
25 | 
26 | void LoadBinary(const std::string& path, size_t size, void* dst);
27 | 
28 | class RNG {
29 | public:
30 |     RNG();
31 |     ~RNG();
32 |     void GenerateUInt(uint* out, size_t count);
33 | 
34 |     template<typename T>
35 |     void GenerateUniform(T* out, size_t count, float scale = 1.f, float shift = 0.f);
36 | 
37 |     template<typename T>
38 |     void GenerateNormal(T* out, size_t count, float scale = 1.f, float shift = 0.f);
39 | 
40 |     cudaStream_t stream() const;
41 | 
42 |     void set_stream(cudaStream_t stream);
43 | 
44 | private:
45 |     struct Impl;
46 |     std::unique_ptr<Impl> impl_;
47 | };
48 | 
49 | }  // namespace turbomind
50 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/cast.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/core/data_type.h"
 4 | #include <cuda_runtime.h>
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | void extend_to_u8(uint8_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {});
 9 | 
10 | void extend_to_u16(uint16_t* dst, const uint4_t* src, size_t n, cudaStream_t st = {});
11 | 
12 | void compact_to_u4(uint4_t* dst, const uint8_t* src, size_t n, cudaStream_t st = {});
13 | 
14 | void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st = {});
15 | 
16 | void fuse_scales_and_zeros(half* fused, const half* scales, half* zeros, size_t n, cudaStream_t st = {});
17 | 
18 | template<class T>
19 | void interleave_output_dims_impl(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st);
20 | 
21 | template<class T>
22 | inline void interleave_output_dims(T* fused, const T* a, const T* b, int m, int k, cudaStream_t st)
23 | {
24 |     auto dispatch = [&](auto u) {
25 |         using U = decltype(u);
26 |         return interleave_output_dims_impl((U*)fused, (const U*)a, (const U*)b, m, k, st);
27 |     };
28 |     if constexpr (bitsof<T> == 8) {
29 |         return dispatch(uint8_t{});
30 |     }
31 |     else if constexpr (bitsof<T> == 16) {
32 |         return dispatch(uint16_t{});
33 |     }
34 |     else if constexpr (bitsof<T> == 32) {
35 |         return dispatch(uint32_t{});
36 |     }
37 | }
38 | 
39 | }  // namespace turbomind
40 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/sync.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | __inline__ __device__ int sem_fetch(int* lock, bool pred)
 8 | {
 9 |     int state{};
10 |     if (pred) {
11 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
12 |         asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
13 | #else
14 |         asm volatile("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
15 | #endif
16 |     }
17 |     return state;
18 | }
19 | 
20 | __inline__ __device__ void sem_wait(int* lock, int status, bool pred)
21 | {
22 |     int state = 0;
23 |     while (__syncthreads_and(state != status)) {
24 |         state = sem_fetch(lock, pred);
25 |     }
26 | 
27 |     __syncthreads();  // memory fence
28 | }
29 | 
30 | __inline__ __device__ void sem_wait_many(int* lock, int count, bool pred)
31 | {
32 |     int state = 0;
33 |     while (__syncthreads_count(state) != count) {
34 |         state = sem_fetch(lock, pred);
35 |     }
36 | 
37 |     __syncthreads();  // memory fence
38 | }
39 | 
40 | __inline__ __device__ void sem_post(int* lock, int status, bool pred)
41 | {
42 |     __syncthreads();  // memory fence
43 | 
44 |     if (pred) {
45 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
46 |         asm volatile("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
47 | #else
48 |         asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
49 | #endif
50 |     }
51 | }
52 | 
53 | }  // namespace turbomind
54 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/registry.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch.h"
 4 | #include "src/turbomind/kernels/gemm/registry.h"
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | Registry::Registry(std::shared_ptr<cudaDeviceProp> device_prop):
 9 |     device_prop_{std::move(device_prop)}, arch_{device_prop_->major * 100 + device_prop_->minor * 10}
10 | {
11 |     f16_u4g128_f16_tnt_sm70_s884();
12 |     f16_u4g128_f16_tnt_sm75_simt();
13 |     f16_u4g128_f16_tnt_sm75_s16816();
14 |     f16_u4g128_f16_tnt_sm80_s16816();
15 |     f16_u4g128_f16_tnt_sm90_s16816();
16 | 
17 |     u4g128_f16_f16_nnn_sm80_s16816();
18 | }
19 | 
20 | bool Registry::Add(std::unique_ptr<Kernel> kernel)
21 | {
22 |     if (!is_arch_compatible(kernel->arch(), arch_)) {
23 |         return false;
24 |     }
25 |     if ((int)device_prop_->sharedMemPerBlockOptin < kernel->smem_size()) {
26 |         return false;
27 |     }
28 |     // std::cout << "register: " << kernel->name()                                        //
29 |     //           << ", shared: " << (kernel->smem_size() >> 10) << " KB"                  //
30 |     //           << ", regs: " << kernel->desc().attr.numRegs                             //
31 |     //           << ", local: " << (float)kernel->desc().attr.localSizeBytes << " bytes"  //
32 |     //           << ", max_active_ctas: " << kernel->desc().max_active_ctas << " \n";
33 | 
34 |     kernels_.push_back(std::move(kernel));
35 |     ptrs_.push_back(kernels_.back().get());
36 |     return true;
37 | }
38 | 
39 | }  // namespace turbomind::gemm
40 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/common.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700))
 6 | #define TURBOMIND_ARCH_SM70 1
 7 | #else
 8 | #define TURBOMIND_ARCH_SM70 0
 9 | #endif
10 | 
11 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
12 | #define TURBOMIND_ARCH_SM75 1
13 | #else
14 | #define TURBOMIND_ARCH_SM75 0
15 | #endif
16 | 
17 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
18 | #define TURBOMIND_ARCH_SM80 1
19 | #else
20 | #define TURBOMIND_ARCH_SM80 0
21 | #endif
22 | 
23 | #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
24 | #define TURBOMIND_ARCH_SM90 1
25 | #else
26 | #define TURBOMIND_ARCH_SM90 0
27 | #endif
28 | 
29 | #if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__)
30 | #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__))
31 | #define PRAGMA_UNROLL _Pragma("unroll")
32 | #define PRAGMA_NO_UNROLL _Pragma("unroll 1")
33 | #else
34 | #define PRAGMA_UNROLL #pragma unroll
35 | #define PRAGMA_NO_UNROLL #pragma unroll 1
36 | #endif
37 | #else
38 | #define PRAGMA_UNROLL
39 | #define PRAGMA_NO_UNROLL
40 | #endif
41 | 
42 | #if defined(__CUDACC__)
43 | #define TM_HOST_DEVICE __forceinline__ __host__ __device__
44 | #define TM_DEVICE __forceinline__ __device__
45 | #define TM_HOST __forceinline__ __host__
46 | #else
47 | #define TM_HOST_DEVICE inline
48 | #define TM_DEVICE inline
49 | #define TM_HOST inline
50 | #endif
51 | 
52 | constexpr int WARP_SIZE = 32;
53 | 
54 | #ifndef uint
55 | using uint = unsigned int;
56 | #endif
57 | 
58 | #ifndef ushort
59 | using ushort = unsigned short int;
60 | #endif
61 | 


--------------------------------------------------------------------------------
/turbomind/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from typing import List
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]:
 9 |     assert x.dtype == torch.int32
10 |     xs = []
11 |     for _ in range(8):
12 |         xs.append((x & 15).to(dtype))
13 |         x = x >> 4
14 |     return xs
15 | 
16 | 
17 | def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor:
18 |     xs = get_u4_slices(x, torch.uint8)
19 |     order = [0, 4, 1, 5, 2, 6, 3, 7]
20 |     ys = [xs[i] for i in order]
21 |     return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1)
22 | 
23 | 
24 | def process_awq_gemm(x: torch.Tensor, kind: str):
25 |     if x.dtype == torch.int32:
26 |         x = unpack_awq_gemm(x)
27 |     if kind in ['qweight', 'qzeros', 'scales']:
28 |         x = x.t()
29 |     return x
30 | 
31 | 
32 | def process_gptq(x: torch.Tensor, kind: str):
33 |     if x.dtype == torch.int32:
34 |         xs = get_u4_slices(x, torch.uint8)
35 |         if kind == 'qweight':  # (k/8,n)
36 |             x = torch.stack(xs, dim=1).view(-1, x.size(-1))
37 |         else:  # 'qzeros' (k/g,n/8)
38 |             x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1
39 |     if kind in ['qweight', 'qzeros', 'scales']:
40 |         x = x.t()
41 |     return x
42 | 
43 | 
44 | def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
45 |     assert x.dtype == torch.uint8
46 |     xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
47 |     a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
48 |     for t in reversed(xs):
49 |         a = (a << 4) | t
50 |     return a.squeeze(dim=-1)
51 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/data_type.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | #include <type_traits>
 7 | 
 8 | #include <cuda_fp16.h>
 9 | #if ENABLE_BF16
10 | #include <cuda_bf16.h>
11 | #endif
12 | 
13 | namespace turbomind {
14 | 
15 | struct uint1_t {
16 | };
17 | struct uint2_t {
18 | };
19 | struct uint3_t {
20 | };
21 | struct uint4_t {
22 | };
23 | struct uint5_t {
24 | };
25 | struct uint6_t {
26 | };
27 | 
28 | template<class T>
29 | struct bitsof_t: std::integral_constant<int, sizeof(T) * 8> {
30 | };
31 | 
32 | template<>
33 | struct bitsof_t<uint1_t>: std::integral_constant<int, 1> {
34 | };
35 | 
36 | template<>
37 | struct bitsof_t<uint2_t>: std::integral_constant<int, 2> {
38 | };
39 | 
40 | template<>
41 | struct bitsof_t<uint3_t>: std::integral_constant<int, 3> {
42 | };  // 2 + 1
43 | 
44 | template<>
45 | struct bitsof_t<uint4_t>: std::integral_constant<int, 4> {
46 | };
47 | 
48 | template<>
49 | struct bitsof_t<uint5_t>: std::integral_constant<int, 5> {
50 | };  // 4 + 1
51 | 
52 | template<>
53 | struct bitsof_t<uint6_t>: std::integral_constant<int, 6> {
54 | };  // 4 + 2
55 | 
56 | template<class T>
57 | inline constexpr bitsof_t<T> bitsof{};
58 | 
59 | struct fp8 {
60 |     char v;
61 | };
62 | struct fp8_e4m3: fp8 {
63 | };
64 | struct fp8_e5m2: fp8 {
65 | };
66 | 
67 | namespace detail {
68 | 
69 | struct __uint4_t {
70 |     uint32_t x;
71 | };
72 | 
73 | }  // namespace detail
74 | 
75 | template<class T, class SFINAE = void>
76 | struct get_pointer_type_t {
77 |     using type = T*;
78 | };
79 | 
80 | template<class T>
81 | using get_pointer_type = typename get_pointer_type_t<T>::type;
82 | 
83 | }  // namespace turbomind
84 | 


--------------------------------------------------------------------------------
/.github/workflows/windows-x64-gpu.yml:
--------------------------------------------------------------------------------
 1 | name: windows-x64-gpu
 2 | on:
 3 |   push:
 4 |     paths:
 5 |       - '.github/workflows/windows-x64-gpu.yml'
 6 |       - 'src/**'
 7 |       - 'CMakeLists.txt'
 8 |   pull_request:
 9 |     paths:
10 |       - '.github/workflows/windows-x64-gpu.yml'
11 |       - 'src/**'
12 |       - 'CMakeLists.txt'
13 | concurrency:
14 |   group: windows-x64-gpu-${{ github.ref }}
15 |   cancel-in-progress: true
16 | permissions:
17 |   contents: read
18 | 
19 | jobs:
20 |   build:
21 |     strategy:
22 |       matrix:
23 |         cudaver: [11.8.0, 12.1.0]
24 |     name: cuda-${{ matrix.cudaver }}
25 |     runs-on: windows-latest
26 |     steps:
27 |       - name: Checkout repository
28 |         uses: actions/checkout@v3
29 |       - name: Set up python
30 |         uses: actions/setup-python@v4
31 |         with:
32 |           python-version: '3.8'
33 |       - name: Install python packages
34 |         run: |
35 |           pip install -r requirements/build.txt
36 |           pip install wheel
37 |       - name: Setup CUDA Toolkit
38 |         id: cuda-toolkit
39 |         shell: pwsh
40 |         run: ./builder/windows/setup_cuda.ps1
41 |         env:
42 |             INPUT_CUDA_VERSION: ${{ matrix.cudaver }}
43 |       - name: Build wheel
44 |         run: |
45 |           $env:BUILD_TEST="ON"
46 |           mkdir build
47 |           cd build
48 |           ..\builder\windows\generate.ps1
49 |           cmake --build . --config Release -- /m /v:q
50 |           if (-Not $?) {
51 |             echo "build failed"
52 |             exit 1
53 |           }
54 |           cmake --install . --config Release
55 |           cd ..
56 |           rm build -Force -Recurse
57 |           python setup.py bdist_wheel -d build/wheel
58 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/quantization.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/test/quantization_impl.h"
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | template void Quantize<uint4_t>(const thrust::universal_vector<half>& x,
 8 |                                 int                                   m,
 9 |                                 int                                   k,
10 |                                 Order                                 order,
11 |                                 int                                   group_size,
12 |                                 thrust::universal_vector<half>&       x_p,  // pseudo-quantized
13 |                                 thrust::universal_vector<uint16_t>&   x_q,  // quantized ushort
14 |                                 thrust::universal_vector<half>&       x_u,  // scales & zeros (always m-major)
15 |                                 cudaStream_t                          stream);
16 | 
17 | template void Quantize<uint8_t>(const thrust::universal_vector<half>& x,
18 |                                 int                                   m,
19 |                                 int                                   k,
20 |                                 Order                                 order,
21 |                                 int                                   group_size,
22 |                                 thrust::universal_vector<half>&       x_p,  // pseudo-quantized
23 |                                 thrust::universal_vector<uint16_t>&   x_q,  // quantized ushort
24 |                                 thrust::universal_vector<half>&       x_u,  // scales & zeros (always m-major)
25 |                                 cudaStream_t                          stream);
26 | 
27 | }  // namespace turbomind::gemm
28 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/PyCQA/flake8
 3 |     rev: 4.0.1
 4 |     hooks:
 5 |       - id: flake8
 6 |   - repo: https://github.com/PyCQA/isort
 7 |     rev: 5.11.5
 8 |     hooks:
 9 |       - id: isort
10 |   - repo: https://github.com/pre-commit/mirrors-yapf
11 |     rev: v0.32.0
12 |     hooks:
13 |       - id: yapf
14 |         name: yapf
15 |         description: 'Formatter for Python code'
16 |         entry: yapf
17 |         language: python
18 |         args: ['-i', '--style={based_on_style: pep8, column_limit: 79}']
19 | 
20 |   - repo: https://github.com/pre-commit/pre-commit-hooks
21 |     rev: v4.2.0
22 |     hooks:
23 |       - id: trailing-whitespace
24 |       - id: check-yaml
25 |       - id: end-of-file-fixer
26 |       - id: requirements-txt-fixer
27 |       - id: double-quote-string-fixer
28 |       - id: check-merge-conflict
29 |       - id: fix-encoding-pragma
30 |         args: ["--remove"]
31 |       - id: mixed-line-ending
32 |         args: ["--fix=lf"]
33 |   - repo: https://github.com/executablebooks/mdformat
34 |     rev: 0.7.9
35 |     hooks:
36 |       - id: mdformat
37 |         args: ["--number"]
38 |         additional_dependencies:
39 |           - mdformat-openmmlab
40 |           - mdformat_frontmatter
41 |           - linkify-it-py
42 |   - repo: https://github.com/codespell-project/codespell
43 |     rev: v2.1.0
44 |     hooks:
45 |       - id: codespell
46 |         args: ["--skip=third_party/*,*.ipynb,*.proto,src/turbomind/kernels/gemm/transform.h"]
47 | 
48 |   - repo: https://github.com/myint/docformatter
49 |     rev: v1.4
50 |     hooks:
51 |       - id: docformatter
52 |         args: ["--in-place", "--wrap-descriptions", "79"]
53 | 
54 |   - repo: https://github.com/open-mmlab/pre-commit-hooks
55 |     rev: v0.2.0
56 |     hooks:
57 |     -   id: check-copyright
58 |         args: ["turbomind"]
59 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_simt.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch/config_simt.h"
 4 | #include "src/turbomind/kernels/gemm/operand.h"
 5 | #include "src/turbomind/kernels/gemm/registry.h"
 6 | #include "src/turbomind/kernels/gemm/transform.h"
 7 | #include "src/turbomind/kernels/gemm/types.h"
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | void Registry::f16_u4g128_f16_tnt_sm75_simt()
12 | {
13 |     using namespace simt;
14 | 
15 |     using S = cache_policy::Stream;
16 |     using D = cache_policy::Default;
17 | 
18 |     {  // quant B
19 |         using Operand_A    = typename GetOperand<HMMA_SIMT, OPERAND_A, half, kRowMajor, false>::Operand;
20 |         using Operand_B_U4 = typename GetOperand<HMMA_SIMT, OPERAND_B, uint4_t, kRowMajor, true>::Operand;
21 |         using Operand_V    = typename GetOperand<HMMA_SIMT, OPERAND_V, uint32_t, kColMajor, true>::Operand;
22 | 
23 |         using C = Sm75_Simt<Operand_A,
24 |                             Transform_Default,
25 |                             VoidOperand,
26 |                             Operand_B_U4,
27 |                             Transform_HMMA_SIMT_B,
28 |                             Operand_V,
29 |                             kRowMajor,
30 |                             half>;
31 | 
32 |         // clang-format off
33 |         Add<C::Type<128, 128, 16, 8, 1, 1, D, D, 2, true, 1, 128>>();
34 |         Add<C::Type< 64, 128, 16, 4, 1, 1, D, D, 2, true, 1, 128>>();
35 |         Add<C::Type< 64, 128, 16, 4, 1, 1, D, S, 2, true, 1, 128>>();
36 |         Add<C::Type< 32, 128, 32, 4, 1, 1, D, S, 2, true, 1, 128>>();
37 |         Add<C::Type< 16, 128, 32, 2, 2, 1, D, S, 2, true, 1, 128>>();
38 |         Add<C::Type<  8, 128, 64, 2, 2, 1, D, S, 2, true, 1, 128>>();
39 |         Add<C::Type<  4, 128, 64, 1, 4, 1, D, S, 2, true, 1, 128>>();
40 |         Add<C::Type<  1, 128, 64, 1, 4, 1, D, S, 2, true, 1, 128>>();
41 |         // clang-format on
42 |     }
43 | }
44 | 
45 | }  // namespace turbomind::gemm
46 | 


--------------------------------------------------------------------------------
/example/module.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | def get_named_linears(module):
 6 |     return {
 7 |         name: m
 8 |         for name, m in module.named_modules() if isinstance(m, nn.Linear)
 9 |     }
10 | 
11 | 
12 | def get_op_by_name(module, op_name):
13 |     # get the op by its name relative to the module
14 |     for name, m in module.named_modules():
15 |         if name == op_name:
16 |             return m
17 |     raise ValueError(f'Cannot find op {op_name} in module {module}')
18 | 
19 | 
20 | def set_op_by_name(layer, name, new_module):
21 |     levels = name.split('.')
22 |     if len(levels) > 1:
23 |         mod_ = layer
24 |         for l_idx in range(len(levels) - 1):
25 |             if levels[l_idx].isdigit():
26 |                 mod_ = mod_[int(levels[l_idx])]
27 |             else:
28 |                 mod_ = getattr(mod_, levels[l_idx])
29 |         setattr(mod_, levels[-1], new_module)
30 |     else:
31 |         setattr(layer, name, new_module)
32 | 
33 | 
34 | def get_op_name(module, op):
35 |     # get the name of the op relative to the module
36 |     for name, m in module.named_modules():
37 |         if m is op:
38 |             return name
39 |     raise ValueError(f'Cannot find op {op} in module {module}')
40 | 
41 | 
42 | def append_str_prefix(x, prefix):
43 |     if isinstance(x, str):
44 |         return prefix + x
45 |     elif isinstance(x, tuple):
46 |         return tuple([append_str_prefix(y, prefix) for y in x])
47 |     elif isinstance(x, list):
48 |         return [append_str_prefix(y, prefix) for y in x]
49 |     else:
50 |         return x
51 | 
52 | 
53 | def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
54 |     if modules_to_not_convert is None:
55 |         return linear_layers
56 | 
57 |     filtered_layers = {}
58 |     for name, linear_layer in linear_layers.items():
59 |         if not any(key in name for key in modules_to_not_convert):
60 |             filtered_layers[name] = linear_layer
61 |     return filtered_layers
62 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/mma_simt.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/array.h"
 6 | #include "src/turbomind/kernels/core/common.h"
 7 | #include "src/turbomind/kernels/gemm/desc.h"
 8 | #include "src/turbomind/kernels/gemm/simt.h"
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | template<class T>
13 | struct MMA_SIMT {
14 |     static constexpr int M = simt::OP_M;
15 |     static constexpr int N = simt::OP_N;
16 |     static constexpr int K = simt::OP_K;
17 | 
18 |     static constexpr int kThreadCount = 32;
19 | 
20 |     static constexpr auto kOpClass = OpClass::kSIMT;
21 | 
22 |     using FragA = Array<T, K>;
23 |     using FragB = Array<T, K>;
24 |     using FragC = Array<float, 1>;
25 | 
26 |     using OffsetC = Array<int2, 1>;
27 |     using FragC_  = FragC[1];
28 | 
29 |     __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c)
30 |     {
31 |         PRAGMA_UNROLL
32 |         for (int k = 0; k < K; ++k) {
33 |             d[0] = c[0] + float(a[k]) * float(b[k]);
34 |         }
35 | 
36 |         // PRAGMA_UNROLL
37 |         // for (int k = 0; k < K; ++k) {
38 |         //     d[0] = c[0] + float(a[k] * b[k]);
39 |         // }
40 | 
41 |         // T acc{};
42 |         // PRAGMA_UNROLL
43 |         // for (int k = 0; k < K; ++k) {
44 |         //     acc += a[k] * b[k];
45 |         // }
46 |         // d[0] = c[0] + float(acc);
47 |     }
48 | 
49 |     __device__ static constexpr OffsetC static_offset_C()
50 |     {
51 |         return {};
52 |     }
53 | 
54 |     __device__ static int2 thread_offset_C()  // -> (m,n)
55 |     {
56 |         const int lane_id = threadIdx.x % WARP_SIZE;
57 |         return {lane_id / N, lane_id % N};
58 |     }
59 | 
60 |     __device__ static void ReshapeC(const FragC& c, FragC_& c_)
61 |     {
62 |         c_[0] = c;
63 |     }
64 | 
65 |     __device__ static int get_group_id(int thread_idx)
66 |     {
67 |         return thread_idx / WARP_SIZE;
68 |     }
69 | };
70 | 
71 | }  // namespace turbomind::gemm
72 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm75_s16816.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch/config_sm75_s16816.h"
 4 | #include "src/turbomind/kernels/gemm/operand.h"
 5 | #include "src/turbomind/kernels/gemm/registry.h"
 6 | #include "src/turbomind/kernels/gemm/transform.h"
 7 | #include "src/turbomind/kernels/gemm/types.h"
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | void Registry::f16_u4g128_f16_tnt_sm75_s16816()
12 | {
13 |     using namespace sm75_s16816;
14 | 
15 |     {  // fp x u4
16 |         using C = Sm75_s16816<Operand_A<half, kRowMajor>,
17 |                               Transform_Default,
18 |                               VoidOperand,
19 |                               Operand_B_Pack<uint4_t, kColMajor>,
20 |                               Transform_HMMA_16816<1, 0>,
21 |                               Operand_UV_Pack<uint32_t, true>,
22 |                               kRowMajor,
23 |                               half>;
24 | 
25 |         using S = cache_policy::Stream;
26 |         using D = cache_policy::Default;
27 | 
28 |         // clang-format off
29 |         Add<C::Type<128, 256, 32, 1, 8, 1, D, D, 2, true, 1, 128, 128, 128>>();
30 |         Add<C::Type<128, 128, 32, 1, 4, 1, D, D, 2, true, 1, 128,  64, 128>>();
31 |         Add<C::Type< 96,  64, 64, 1, 2, 2, D, S, 2, true, 1, 128>>();
32 |         Add<C::Type< 64, 128, 32, 1, 4, 1, D, D, 2, true, 1, 128,  32, 128>>();
33 |         Add<C::Type< 64, 128, 32, 1, 4, 1, D, S, 2, true, 1, 128,  32, 128>>();
34 |         Add<C::Type< 64,  64, 64, 1, 2, 2, D, S, 2, true, 1, 128>>();
35 |         Add<C::Type< 48, 128, 64, 1, 4, 1, D, S, 2, true, 1, 128>>();
36 |         Add<C::Type< 48,  64, 64, 1, 2, 2, D, S, 2, true, 1, 128>>();
37 |         Add<C::Type< 32,  64, 64, 1, 2, 2, D, S, 2, true, 1, 128>>();
38 |         Add<C::Type< 16, 128, 32, 1, 4, 1, D, S, 2, true, 1, 128>>();
39 |         Add<C::Type< 16,  64, 64, 1, 2, 2, D, S, 2, true, 1, 128>>();
40 |         // clang-format on
41 |     }
42 | }
43 | 
44 | }  // namespace turbomind::gemm
45 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/format.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/array.h"
 6 | 
 7 | namespace turbomind::gemm {
 8 | 
 9 | template<class Tin, class Tout>
10 | struct Converter {
11 | };
12 | 
13 | template<class T>
14 | struct Converter<T, T> {
15 |     template<int N>
16 |     __device__ Array<T, N> operator()(Array<T, N> x)
17 |     {
18 |         return x;
19 |     }
20 | };
21 | 
22 | template<>
23 | struct Converter<uint16_t, uint4_t> {
24 | 
25 |     static __device__ Array<uint4_t, 8> pack(const Array<uint8_t, 8>& vi)
26 |     {
27 |         Array<uint32_t, 2> ui = (Array<uint32_t, 2>&)vi;
28 | 
29 |         ui[0] |= (ui[0] >> 12);
30 |         ui[1] |= (ui[1] >> 12);
31 | 
32 |         //  7 6 5 4 3 2 1 0
33 |         // _7_67564_3_23120
34 |         uint32_t uo = __byte_perm(ui[0], ui[1], 0x5140);
35 | 
36 |         return (Array<uint4_t, 8>&)uo;
37 |     }
38 | 
39 |     template<class U, int N>
40 |     __device__ Array<uint4_t, N> operator()(const Array<U, N>& x)
41 |     {
42 |         static_assert(sizeof(U) == 2);
43 |         auto&             vi = (const Array<uint16_t, N>&)x;
44 |         Array<uint8_t, N> tmp;
45 |         PRAGMA_UNROLL
46 |         for (int i = 0; i < N; ++i) {
47 |             tmp[i] = static_cast<uint8_t>(vi[i]);
48 |         }
49 |         Array<uint4_t, N> vo;
50 |         PRAGMA_UNROLL
51 |         for (int i = 0; i < N; i += 8) {
52 |             (Array<uint4_t, 8>&)vo[i] = pack((Array<uint8_t, 8>&)tmp[i]);
53 |         }
54 |         return vo;
55 |     }
56 | };
57 | 
58 | template<>
59 | struct Converter<uint16_t, uint8_t> {
60 |     template<int N>
61 |     __device__ Array<uint8_t, N> operator()(const Array<uint16_t, N>& x)
62 |     {
63 |         // static_assert(sizeof(U) == 2);
64 |         auto&             vi = (const Array<uint16_t, N>&)x;
65 |         Array<uint8_t, N> vo;
66 |         PRAGMA_UNROLL
67 |         for (int i = 0; i < N; ++i) {
68 |             vo[i] = static_cast<uint8_t>(vi[i]);
69 |         }
70 |         return vo;
71 |     }
72 | };
73 | 
74 | }  // namespace turbomind::gemm
75 | 


--------------------------------------------------------------------------------
/.github/workflows/linux-x64-gpu.yml:
--------------------------------------------------------------------------------
 1 | name: linux-x64-gpu
 2 | on:
 3 |   push:
 4 |     paths:
 5 |       - '.github/workflows/linux-x64-gpu.yml'
 6 |       - 'src/**'
 7 |       - 'CMakeLists.txt'
 8 |   pull_request:
 9 |     paths:
10 |       - '.github/workflows/linux-x64-gpu.yml'
11 |       - 'src/**'
12 |       - 'CMakeLists.txt'
13 | concurrency:
14 |   group: linux-x64-gpu-${{ github.ref }}
15 |   cancel-in-progress: true
16 | permissions:
17 |   contents: read
18 | 
19 | jobs:
20 |   build:
21 |     strategy:
22 |       matrix:
23 |         cudaver: [11.8, 12.1]
24 |     name: cuda-${{ matrix.cudaver }}
25 |     runs-on: ubuntu-latest
26 |     steps:
27 |       - name: Free disk space
28 |         uses: jlumbroso/free-disk-space@main
29 |         with:
30 |           # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
31 |           tool-cache: false
32 |           docker-images: false
33 |           # All of these default to true, but feel free to set to "false" if necessary for your workflow
34 |           android: true
35 |           dotnet: true
36 |           haskell: true
37 |           large-packages: true
38 |           swap-storage: false
39 |       - name: Checkout repository
40 |         uses: actions/checkout@v3
41 |       - name: Build
42 |         uses: addnab/docker-run-action@v3
43 |         with:
44 |           image: openmmlab/lmdeploy-builder:cuda${{ matrix.cudaver }}
45 |           options: -v ${{ github.workspace }}:/work
46 |           run: |
47 |             cd /work
48 |             source /opt/conda/bin/activate
49 |             conda activate py38
50 |             mkdir build && cd build
51 |             cmake .. \
52 |               -DCMAKE_BUILD_TYPE=RelWithDebInfo \
53 |               -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
54 |               -DCMAKE_INSTALL_PREFIX=./install \
55 |               -DCMAKE_CUDA_FLAGS="-lineinfo" \
56 |               -DUSE_NVTX=ON \
57 |               -DBUILD_TEST=ON
58 |             make -j$(nproc) && make install
59 |             cd ..
60 |             rm -rf build
61 |             python setup.py bdist_wheel --plat-name manylinux2014_x86_64 -d /tmp
62 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/gemm.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | #include <cuda_runtime.h>
 7 | #include <memory>
 8 | #include <vector>
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | class Gemm {
13 | public:
14 |     static constexpr size_t kBarriersSize = 1 << 20;
15 |     static constexpr size_t kPartialsSize = 32 << 20;
16 | 
17 |     Gemm();
18 | 
19 |     ~Gemm();
20 | 
21 |     Gemm(Gemm&& other);
22 |     Gemm& operator    =(Gemm&& other);
23 |     Gemm(const Gemm&) = delete;
24 |     Gemm& operator=(const Gemm&) = delete;
25 | 
26 |     [[nodiscard]] int Run(const Operation&    operation,
27 |                           float               alpha,
28 |                           const void*         A,
29 |                           const MatrixLayout& Adesc,
30 |                           const void*         U,
31 |                           const MatrixLayout& Udesc,
32 |                           const void*         B,
33 |                           const MatrixLayout& Bdesc,
34 |                           const void*         V,
35 |                           const MatrixLayout& Vdesc,
36 |                           float               beta,
37 |                           const void*         C,
38 |                           const MatrixLayout& Cdesc,
39 |                           void*               D,
40 |                           const MatrixLayout& Ddesc,
41 |                           const Workspace&    workspace,
42 |                           cudaStream_t        stream);
43 | 
44 |     [[maybe_unused]] int Export(std::ostream& os);
45 | 
46 |     [[maybe_unused]] int Import(std::istream& is);
47 | 
48 |     [[nodiscard]] std::vector<int> GetTuningSeq() const;
49 | 
50 | private:
51 |     struct Impl;
52 |     std::unique_ptr<Impl> impl_;
53 | };
54 | 
55 | [[nodiscard]] int
56 | Convert(const void* S, const MatrixLayout& Sdesc, void* D, const MatrixLayout& Ddesc, cudaStream_t stream);
57 | 
58 | std::tuple<Order, Pack, Order, Pack> get_weight_and_scales_layout(int sm, bool force_simt);
59 | 
60 | }  // namespace turbomind::gemm
61 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | Language: Cpp
 2 | AccessModifierOffset: -4
 3 | AlignAfterOpenBracket: Align
 4 | AllowShortEnumsOnASingleLine: false
 5 | AlignConsecutiveAssignments: true
 6 | AlignConsecutiveDeclarations: true
 7 | AlignEscapedNewlines: Right
 8 | AlignOperands: true
 9 | AlignTrailingComments: true
10 | AllowAllParametersOfDeclarationOnNextLine: true
11 | AllowAllArgumentsOnNextLine: true
12 | AllowShortBlocksOnASingleLine: Empty
13 | AllowShortCaseLabelsOnASingleLine: false
14 | AllowShortFunctionsOnASingleLine: Empty
15 | AllowShortIfStatementsOnASingleLine: Never
16 | AllowShortLoopsOnASingleLine: false
17 | AlwaysBreakAfterReturnType: None
18 | AlwaysBreakBeforeMultilineStrings: false
19 | AlwaysBreakTemplateDeclarations: true
20 | BinPackArguments: false
21 | BinPackParameters: false
22 | BreakBeforeBinaryOperators: NonAssignment
23 | BreakBeforeBraces: Stroustrup
24 | BreakBeforeTernaryOperators: false
25 | BreakConstructorInitializers: AfterColon
26 | BreakInheritanceList: AfterColon
27 | BreakStringLiterals: false
28 | ColumnLimit: 120
29 | CompactNamespaces: false
30 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
31 | ConstructorInitializerIndentWidth: 4
32 | ContinuationIndentWidth: 4
33 | Cpp11BracedListStyle: true
34 | DerivePointerAlignment: false
35 | FixNamespaceComments: true
36 | IndentCaseLabels: true
37 | IndentPPDirectives: None
38 | IndentWidth: 4
39 | IndentWrappedFunctionNames: false
40 | KeepEmptyLinesAtTheStartOfBlocks: true
41 | MaxEmptyLinesToKeep: 1
42 | NamespaceIndentation: None
43 | PointerAlignment: Left
44 | ReflowComments: true
45 | SortIncludes: true
46 | SortUsingDeclarations: false
47 | SpaceAfterCStyleCast: false
48 | SpaceAfterTemplateKeyword: false
49 | SpaceBeforeAssignmentOperators: true
50 | SpaceBeforeCtorInitializerColon: false
51 | SpaceBeforeInheritanceColon: false
52 | SpaceBeforeParens: ControlStatements
53 | SpaceInEmptyParentheses: false
54 | SpacesBeforeTrailingComments: 2
55 | SpacesInAngles: false
56 | SpacesInCStyleCastParentheses: false
57 | SpacesInContainerLiterals: false
58 | SpacesInParentheses: false
59 | SpacesInSquareBrackets: false
60 | Standard: c++17
61 | TabWidth: 4
62 | UseTab: Never
63 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/operand.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/layout.h"
 6 | #include "src/turbomind/kernels/core/meta.h"
 7 | #include "src/turbomind/kernels/gemm/iterator.h"
 8 | #include "src/turbomind/kernels/gemm/smem_copy.h"
 9 | #include "src/turbomind/kernels/gemm/types.h"
10 | #include "src/turbomind/kernels/gemm/utils.h"
11 | 
12 | namespace turbomind::gemm {
13 | 
14 | struct VoidOperand {
15 |     using Dtype = int;
16 | 
17 |     static constexpr Pack  kPack  = 0;
18 |     static constexpr Order kOrder = Order::kColMajor;
19 | 
20 |     struct GetSmemLayout {
21 |         static constexpr SmemLayoutV2<1, 1> apply(...)
22 |         {
23 |             return {};
24 |         }
25 |     };
26 | 
27 |     using SmemCopyAtom = VoidSmemCopyAtom;
28 | 
29 |     struct GetGmemIter {
30 |         static constexpr auto apply(...)
31 |         {
32 |             return type_c<VoidGmemIter>;
33 |         }
34 |     };
35 | };
36 | 
37 | /// TODO: fix AlignC, AlignS
38 | /// TODO: fix GroupSize
39 | template<class Operand, class Iterator, int M, int K, int WARPS, int GroupSize = 1>
40 | struct MakeOperand {
41 | 
42 |     using Dtype = typename Operand::Dtype;
43 | 
44 |     static constexpr Pack  kPack      = Operand::kPack;
45 |     static constexpr Order kOrder     = Operand::kOrder;
46 |     static constexpr int   kGroupSize = GroupSize;
47 | 
48 |     static constexpr int2 kPackMK = Packing_v2<kPack, kOrder>::apply({M, ceil_div(K, kGroupSize)});
49 | 
50 |     static constexpr pair<kPackMK.x, kPackMK.y> kShapeMK{};
51 | 
52 |     using SmemLayout   = decltype(Operand::GetSmemLayout::apply(kShapeMK));
53 |     using SmemAccessor = SmemAccessorV2<Dtype, SmemLayout, kOrder>;
54 | 
55 |     using GmemIter = typename decltype(Operand::GetGmemIter::apply(
56 |         type_c<Operand>, type_c<Iterator>, type_c<SmemLayout>, kShapeMK, constant<WARPS>{}))::type;
57 | 
58 |     using SmemCopyAtom = typename Operand::SmemCopyAtom;
59 | };
60 | 
61 | // CPO for getting specific operand templates
62 | template<MMA_Tag mma, Op_Tag optag, class T, Order order, bool is_pack, class SFINAE = void>
63 | struct GetOperand: std::false_type {
64 | };
65 | 
66 | }  // namespace turbomind::gemm
67 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-bug-report.yml:
--------------------------------------------------------------------------------
 1 | name: 🐞 Bug report
 2 | description: Create a report to help us reproduce and fix the bug
 3 | title: "[Bug] "
 4 | labels: ['Bug']
 5 | 
 6 | body:
 7 | - type: checkboxes
 8 |   attributes:
 9 |     label: Checklist
10 |     options:
11 |     - label: 1. I have searched related issues but cannot get the expected help.
12 |     - label: 2. The bug has not been fixed in the latest version.
13 |     - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
14 | - type: textarea
15 |   attributes:
16 |     label: Describe the bug
17 |     description: A clear and concise description of what the bug is.
18 |   validations:
19 |     required: true
20 | - type: textarea
21 |   attributes:
22 |     label: Reproduction
23 |     description: |
24 |       1. What command or script did you run?
25 |     placeholder: |
26 |       A placeholder for the command.
27 |   validations:
28 |     required: true
29 | - type: textarea
30 |   attributes:
31 |     label: Environment
32 |     description: |
33 |       1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here.
34 |       2. You may add addition that may be helpful for locating the problem, such as
35 |          - Which **model** are you using?
36 |          - How you installed PyTorch \[e.g., pip, conda, source\]
37 |          - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
38 |     placeholder: Environment here.
39 |     render: Shell
40 |   validations:
41 |     required: true
42 | - type: textarea
43 |   attributes:
44 |     label: Error traceback
45 |     description: |
46 |       If applicable, paste the error trackback here.
47 |     placeholder: Logs and traceback here.
48 |     render: Shell
49 | - type: markdown
50 |   attributes:
51 |     value: >
52 |      If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
53 | 
54 |      Thanks for your bug report. We appreciate it a lot.
55 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | add_library(gemm2
 4 |         gemm.cu
 5 |         kernel.cu
 6 |         registry.cu
 7 |         dispatch_cache.cu
 8 |         gpu_metric.cu
 9 |         convert_v2.cu
10 |         cast.cu
11 |         unpack.cu
12 |         tuner/cache_utils.cu
13 |         tuner/measurer.cu
14 |         tuner/sampler.cu
15 |         tuner/stopping_criterion.cc
16 |         tuner/params.cc
17 |         kernel/f16_u4g128_f16_tnt_sm90_s16816.cu
18 |         kernel/f16_u4g128_f16_tnt_sm80_s16816.cu
19 |         kernel/f16_u4g128_f16_tnt_sm75_s16816.cu
20 |         kernel/f16_u4g128_f16_tnt_sm70_s884.cu
21 |         kernel/f16_u4g128_f16_tnt_sm75_simt.cu
22 |         kernel/u4g128_f16_f16_nnn_sm80_s16816.cu
23 | )
24 | 
25 | target_link_libraries(gemm2 PRIVATE parser)
26 | 
27 | 
28 | target_compile_options(gemm2 PRIVATE
29 |         $<$<COMPILE_LANGUAGE:CUDA>:
30 |                 -Xptxas=-v
31 |                 --generate-line-info
32 |                 --threads 8>
33 | )
34 | set_property(TARGET gemm2 PROPERTY POSITION_INDEPENDENT_CODE ON)
35 | set_property(TARGET gemm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
36 | 
37 | if (BUILD_TEST)
38 |         add_executable(gemm_test
39 |                 test/gemm_test.cu
40 |                 test/test_utils.cu
41 |                 test/quantization.cu
42 |                 test/reference.cu)
43 |         target_link_libraries(gemm_test PRIVATE gemm2 cublas)
44 | 
45 |         if (NOT MSVC)
46 |                 FetchContent_Declare(
47 |                 repo-nvbench
48 |                 GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git
49 |                 GIT_TAG        d8dced8a64d9ce305add92fa6d274fd49b569b7e
50 |                 )
51 | 
52 |                 set(NVBench_ENABLE_EXAMPLES OFF)
53 |                 set(BUILD_SHARED_LIBS OFF)
54 | 
55 |                 FetchContent_MakeAvailable(repo-nvbench)
56 | 
57 |                 add_executable(gemm_bench
58 |                         test/gemm_bench.cu
59 |                         test/test_utils.cu
60 |                         test/quantization.cu
61 |                         test/reference.cu)
62 |                 target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas)
63 |         endif ()
64 | endif ()
65 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/mma_sm80.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/array.h"
 6 | #include "src/turbomind/kernels/core/common.h"
 7 | #include "src/turbomind/kernels/core/mma.h"
 8 | #include "src/turbomind/kernels/gemm/desc.h"
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | struct SM80_MMA_16x8x16_F32_F16_F16_F32_TN {
13 |     static constexpr int M = 16;
14 |     static constexpr int N = 8;
15 |     static constexpr int K = 16;
16 | 
17 |     static constexpr int kThreadCount = 32;
18 | 
19 |     static constexpr auto kOpClass = OpClass::kMMA_s16816;
20 | 
21 |     using FragA = Array<half, 8>;
22 |     using FragB = Array<half, 4>;
23 |     using FragC = Array<float, 4>;
24 | 
25 |     using OffsetC = Array<int2, 2>;  // (m, n)
26 |     using FragC_  = Array<float, 2>[2];
27 | 
28 |     __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c)
29 |     {
30 |         mma_m16n8k16_row_col(d, a, b, (FragC&)c);
31 |     }
32 | 
33 |     __device__ static constexpr OffsetC static_offset_C()
34 |     {
35 |         return {int2{0, 0}, int2{8, 0}};
36 |     }
37 | 
38 |     __device__ static int2 thread_offset_C()  // -> (m,n)
39 |     {
40 |         const int lane_id = threadIdx.x % WARP_SIZE;
41 |         return {lane_id / 4, lane_id % 4 * 2};
42 |     }
43 | 
44 |     __device__ static void ReshapeC(const FragC& c, FragC_& c_)
45 |     {
46 |         PRAGMA_UNROLL
47 |         for (int m = 0; m < 2; ++m) {
48 |             c_[m] = (Array<float, 2>&)c[m * 2];
49 |         }
50 |     }
51 | 
52 |     __device__ static int get_group_id(int thread_idx)
53 |     {
54 |         return thread_idx / WARP_SIZE;
55 |     }
56 | };
57 | 
58 | // This is not used yet
59 | struct SM75_MMA_16x8x8_F32_F16_F16_F32_TN: SM80_MMA_16x8x16_F32_F16_F16_F32_TN {
60 |     static constexpr int M = 16;
61 |     static constexpr int N = 8;
62 |     static constexpr int K = 8;
63 | 
64 |     using FragA = Array<half, 4>;
65 |     using FragB = Array<half, 2>;
66 |     using FragC = Array<float, 4>;
67 | 
68 |     __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c)
69 |     {
70 |         mma_m16n8k8_row_col(d, a, b, (FragC&)c);
71 |     }
72 | };
73 | 
74 | }  // namespace turbomind::gemm
75 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/desc.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/data_type.h"
 6 | #include "src/turbomind/kernels/gemm/types.h"
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | // aggregate that uniquely identifies a GEMM problem
11 | struct GemmDesc {
12 |     int       arch;
13 |     DataType  type_a;
14 |     DataType  type_b;
15 |     DataType  type_c;
16 |     Order     order_a;
17 |     Order     order_b;
18 |     Order     order_c;
19 |     Pack      pack_a;
20 |     Pack      pack_b;
21 |     Pack      pack_u;
22 |     Pack      pack_v;
23 |     QuantDesc quant_a;
24 |     QuantDesc quant_b;
25 |     Epilogue  epilogue;
26 |     int       m;
27 |     int       n;
28 |     int       k;
29 |     int       batch_dim;
30 | };
31 | 
32 | enum class OpClass
33 | {
34 |     kSIMT,
35 |     kMMA_s884,
36 |     kMMA_s16816,
37 | };
38 | 
39 | inline const char* to_string(OpClass op)
40 | {
41 |     switch (op) {
42 |         case OpClass::kSIMT:
43 |             return "simt";
44 |         case OpClass::kMMA_s884:
45 |             return "s884";
46 |         case OpClass::kMMA_s16816:
47 |             return "s16816";
48 |         default:
49 |             return "unknown_op_cls";
50 |     }
51 | }
52 | 
53 | // aggregate that uniquely identifies a kernel
54 | struct KernelDesc {
55 |     int       arch;
56 |     OpClass   op_class;
57 |     DataType  type_a;
58 |     DataType  type_b;
59 |     DataType  type_c;
60 |     Order     order_a;
61 |     Order     order_b;
62 |     Order     order_c;
63 |     Pack      pack_a;
64 |     Pack      pack_b;
65 |     Pack      pack_u;
66 |     Pack      pack_v;
67 |     QuantDesc quant_a;
68 |     QuantDesc quant_b;
69 |     int       policy_a;
70 |     int       policy_b;
71 |     int3      cta_tile;
72 |     int3      mma_tile;
73 |     int3      align;
74 |     int2      c_tile;
75 |     int       stages;
76 |     bool      split_k;
77 | 
78 |     // set by `KernelImpl`
79 |     int                max_active_ctas;
80 |     cudaFuncAttributes attr;
81 | };
82 | 
83 | class Kernel;
84 | struct LaunchSpec {
85 |     Kernel* kernel;
86 |     int     swizzle;
87 |     int     splits;
88 |     float   estimated;
89 |     float   measured;
90 | };
91 | 
92 | }  // namespace turbomind::gemm
93 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/gemm_bench.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "nvbench/main.cuh"
 4 | #include "src/turbomind/kernels/gemm/operand.h"
 5 | #include "src/turbomind/kernels/gemm/test/models.h"
 6 | #include "src/turbomind/kernels/gemm/test/testbed.h"
 7 | #include <map>
 8 | #include <nvbench/nvbench.cuh>
 9 | #include <string>
10 | 
11 | void gemm_bench(nvbench::state& state)
12 | {
13 |     const auto idx = state.get_int64("idx");
14 | 
15 |     const auto bs = state.get_int64("bs");
16 |     const auto tp = state.get_int64("tp");
17 | 
18 |     auto [output_dims, input_dims] = config[idx];
19 | 
20 |     constexpr int group_size = 128;
21 | 
22 |     if (idx % 4 == 0 || idx % 4 == 2) {
23 |         if (output_dims % tp)
24 |             return;
25 |         output_dims /= tp;
26 |     }
27 |     else {
28 |         if (input_dims % tp)
29 |             return;
30 |         input_dims /= tp;
31 |     }
32 | 
33 |     if (input_dims % group_size)
34 |         return;
35 | 
36 |     using turbomind::gemm::get_test;
37 | 
38 |     {
39 |         int m = bs;
40 |         int n = output_dims;
41 |         int k = input_dims;
42 |         if (get_test().kBatchDim == 1) {
43 |             std::swap(m, n);
44 |         }
45 |         std::cerr << "m" << m << "n" << n << "k" << k << "\n";
46 |         get_test().Initialize(m, n, k, group_size, state.get_cuda_stream());
47 |     }
48 | 
49 |     state.add_element_count((size_t)bs * output_dims * input_dims * 2);  // mul + add
50 | 
51 |     // state.collect_dram_throughput();
52 |     // state.collect_l2_hit_rates();
53 | 
54 |     if constexpr (1) {
55 |         state.add_global_memory_reads(get_test().global_memory_reads());
56 |         get_test().Run();
57 |         state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {  //
58 |             get_test().Run();
59 |         });
60 |     }
61 |     else {
62 |         state.add_global_memory_reads(sizeof(half) * (bs * input_dims + output_dims * input_dims));
63 |         state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {  //
64 |             get_test().RunCublas();
65 |         });
66 |     }
67 | }
68 | 
69 | NVBENCH_BENCH(gemm_bench)
70 |     .add_int64_axis("idx", nvbench::range(0, (int)config.size() - 1))
71 |     .add_int64_power_of_two_axis("bs", nvbench::range(0, 10))
72 |     .add_int64_axis("tp", {1, 2, 4});
73 | 
74 | int main(int argc, char* argv[])
75 | {
76 |     NVBENCH_MAIN_BODY(argc, argv);
77 | }
78 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/mma_sm70.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/array.h"
 6 | #include "src/turbomind/kernels/core/common.h"
 7 | #include "src/turbomind/kernels/core/mma.h"
 8 | #include "src/turbomind/kernels/gemm/desc.h"
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | struct SM70_MMA_884 {
13 |     // static constexpr int M = 16;
14 |     // static constexpr int N = 16;
15 |     static constexpr int M = 8;
16 |     static constexpr int N = 32;
17 |     static constexpr int K = 8;
18 | 
19 |     static constexpr int kThreadCount = 32;
20 | 
21 |     static constexpr auto kOpClass = OpClass::kMMA_s884;
22 | 
23 |     using FragA = Array<half, K>;
24 |     using FragB = Array<half, K>;
25 |     using FragC = Array<float, 8>;
26 | 
27 |     using OffsetC = Array<int2, 4>;
28 |     using FragC_  = Array<float, 2>[4];
29 | 
30 |     __device__ static void fma(FragC& d, const FragA& a, const FragB& b, const FragC& c)
31 |     {
32 |         mma_m8n8k4_row_col(d, (const Array<half, 4>&)a[0], (const Array<half, 4>&)b[0], (FragC&)c);
33 |         if constexpr (K == 8) {
34 |             mma_m8n8k4_row_col(d, (const Array<half, 4>&)a[4], (const Array<half, 4>&)b[4], (FragC&)d);
35 |         }
36 |     }
37 | 
38 |     __device__ static constexpr OffsetC static_offset_C()
39 |     {
40 |         OffsetC r{};
41 |         PRAGMA_UNROLL
42 |         for (int n = 0; n < 2; ++n) {
43 |             PRAGMA_UNROLL
44 |             for (int m = 0; m < 2; ++m) {
45 |                 r[n * 2 + m] = int2{m * 2, n * 4};
46 |             }
47 |         }
48 |         return r;
49 |     }
50 | 
51 |     __device__ static int2 thread_offset_C()  // -> (m,n)
52 |     {
53 |         const int lane_id = threadIdx.x % WARP_SIZE;
54 |         // return {
55 |         //     (lane_id & 8) * 1 + (lane_id & 1) + lane_id / 16 * 4,
56 |         //     (lane_id & 4) * 2 + (lane_id & 2),
57 |         // };
58 |         return {(lane_id & 1) + (lane_id / 16) * 4,  //
59 |                 (lane_id & 2) + (lane_id & 12) * 2};
60 |     }
61 | 
62 |     __device__ static void ReshapeC(const FragC& c, FragC_& c_)
63 |     {
64 |         PRAGMA_UNROLL
65 |         for (int m = 0; m < 4; ++m) {
66 |             c_[m] = (Array<float, 2>&)c[m * 2];
67 |         }
68 |     }
69 | 
70 |     __device__ static int get_group_id(int thread_idx)
71 |     {
72 |         return thread_idx / WARP_SIZE;
73 |     }
74 | };
75 | 
76 | }  // namespace turbomind::gemm
77 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/measurer.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/kernel.h"
 4 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h"
 5 | #include "src/turbomind/kernels/gemm/tuner/measurer.h"
 6 | #include <iostream>
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | Measurer::Measurer(std::unique_ptr<StoppingCriterion> stop_criterion): stop_criterion_{std::move(stop_criterion)}
11 | {
12 |     cudaEventCreate(&ev_beg_);
13 |     cudaEventCreate(&ev_end_);
14 | }
15 | 
16 | Measurer::~Measurer()
17 | {
18 |     cudaEventDestroy(ev_beg_);
19 |     cudaEventDestroy(ev_end_);
20 |     ev_beg_ = ev_end_ = {};
21 | }
22 | 
23 | std::vector<Measurement>
24 | Measurer::Measure(const std::vector<LaunchSpec>& specs, const Launcher& launcher, cudaStream_t stream)
25 | {
26 |     std::vector<Measurement> m;
27 |     m.reserve(specs.size());
28 |     for (const auto& spec : specs) {
29 |         auto measure = MeasureOne(spec, launcher, stream);
30 |         if (measure.sample_count) {
31 |             m.push_back(measure);
32 |         }
33 |         /// TODO: report error
34 |     }
35 |     return m;
36 | }
37 | 
38 | Measurement Measurer::MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream)
39 | {
40 |     Stats       stats{};
41 |     cudaError_t status = cudaSuccess;
42 |     while (true) {
43 |         float ms{};
44 |         std::tie(ms, status) = ColdRun(spec, launcher, stream);
45 |         if (status != cudaSuccess) {
46 |             break;
47 |         }
48 |         stats.add_sample(ms);
49 |         // std::cout << spec.kernel->name() << " " << spec.swizzle << " " << stats.count() << " " << stats.mean() << " "
50 |         //           << stats.get_variance() << "\n";
51 |         if (stop_criterion_->should_stop(stats)) {
52 |             break;
53 |         }
54 |     }
55 |     return Measurement{
56 |         status,
57 |         stats.count(),
58 |         stats.mean(),
59 |         stats.get_variance(),
60 |     };
61 | }
62 | 
63 | std::pair<float, cudaError_t> Measurer::ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream)
64 | {
65 |     CacheFlushing::flush(stream);
66 | 
67 |     cudaEventRecord(ev_beg_, stream);
68 | 
69 |     launcher(spec, stream);
70 | 
71 |     cudaEventRecord(ev_end_, stream);
72 |     cudaEventSynchronize(ev_end_);
73 | 
74 |     const auto status = cudaGetLastError();
75 |     float      ms{};
76 | 
77 |     if (status == cudaSuccess) {
78 |         cudaEventElapsedTime(&ms, ev_beg_, ev_end_);
79 |     }
80 | 
81 |     return {ms, status};
82 | }
83 | 
84 | }  // namespace turbomind::gemm
85 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/iterator.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/array.h"
 6 | #include "src/turbomind/kernels/core/data_type.h"
 7 | #include "src/turbomind/kernels/core/meta.h"
 8 | #include "src/turbomind/kernels/gemm/thread_map.h"
 9 | #include "src/turbomind/kernels/gemm/utils.h"
10 | 
11 | namespace turbomind::gemm {
12 | 
13 | struct VoidGmemIter {
14 |     static constexpr int ITER_S = 0;
15 |     using Fragments             = int;
16 |     template<class P>
17 |     __device__ VoidGmemIter(P, int, int2, int2)
18 |     {
19 |     }
20 |     __device__ void ClearSmem() {}
21 |     __device__ void Prefetch(int, int, bool) {}
22 |     __device__ void Prefetch(bool) {}
23 |     __device__ void Fetch(Fragments&, bool) {}
24 |     __device__ void Store(const Fragments&) {}
25 |     __device__ void Advance() {}
26 |     int*            smem_data_;
27 |     bool            g_mask{false};
28 | };
29 | 
30 | struct GetGmemIter {
31 |     template<class Operand, class Iterator, class SmemLayout, int M, int K, int WARPS>
32 |     static constexpr auto
33 |         apply(basic_type<Operand>, basic_type<Iterator>, basic_type<SmemLayout>, pair<M, K>, constant<WARPS>)
34 |     {
35 |         using Dtype = typename Operand::Dtype;
36 | 
37 |         constexpr int kAccessSize =
38 |             std::min<int>(128 / bitsof<Dtype>, std::max<int>(32 / bitsof<Dtype>, M * K / (WARPS * WARP_SIZE)));
39 | 
40 |         constexpr int2 kAligned = mk2cs<Operand::kOrder>(0, 1);
41 |         constexpr int2 kCS      = mk2cs<Operand::kOrder>(M, K);
42 | 
43 |         constexpr int kMaxThrS = std::min(WARP_SIZE, ceil_div(kCS.y, WARPS));
44 |         constexpr int kMaxThrC = std::min(WARP_SIZE, ceil_div(kCS.x, kAccessSize));
45 | 
46 |         constexpr int kTgtThrC = ceil_div<int>(256, sizeof(Array<Dtype, kAccessSize>));
47 | 
48 |         constexpr int kWarpThrC = std::min(kMaxThrC, std::max(WARP_SIZE / kMaxThrS, kTgtThrC));
49 | 
50 |         using GmemIter = typename Iterator::template Type<Dtype,
51 |                                                           gemm::ThreadMap_V2<kCS.x, kCS.y, kAccessSize, Blocked, WARPS>,
52 |                                                           SmemLayout,
53 |                                                           Operand::kPack,
54 |                                                           Operand::kOrder,
55 |                                                           kAligned.x,   // aligned C
56 |                                                           kAligned.y>;  // aligned S
57 |         return type_c<GmemIter>;
58 |     }
59 | };
60 | 
61 | }  // namespace turbomind::gemm
62 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/sampler.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/desc.h"
 4 | #include "src/turbomind/kernels/gemm/kernel.h"
 5 | #include "src/turbomind/kernels/gemm/tuner/sampler.h"
 6 | #include <algorithm>
 7 | #include <iostream>
 8 | #include <numeric>
 9 | #include <vector>
10 | 
11 | namespace turbomind::gemm {
12 | 
13 | template<class Cmp>
14 | static std::vector<int> ArgSort(size_t size, const Cmp& cmp)
15 | {
16 |     std::vector<int> idxs(size);
17 |     std::iota(idxs.begin(), idxs.end(), 0);
18 |     std::stable_sort(idxs.begin(), idxs.end(), cmp);
19 |     return idxs;
20 | }
21 | 
22 | std::vector<LaunchSpec> Sampler::Run(std::vector<LaunchSpec> specs, const Launcher& launcher, cudaStream_t stream)
23 | {
24 |     std::vector<std::vector<LaunchSpec>> clusters;  // ptr into `specs`
25 |     if (k_clusters_) {
26 |         clusters = Cluster(specs, ClusteringParam{true, true});
27 |     }
28 |     else {
29 |         for (auto& s : specs) {
30 |             clusters.push_back({s});
31 |         }
32 |     }
33 |     // std::cout << "k_clusters=" << k_clusters_ << ", #specs" << specs.size() << ", #clusters" << clusters.size() <<
34 |     // "\n";
35 | 
36 |     std::vector<LaunchSpec> s_1;
37 |     for (const auto& c : clusters) {
38 |         s_1.push_back(c.front());
39 |     }
40 | 
41 |     auto m_1 = measurer_.Measure(s_1, launcher, stream);
42 | 
43 |     auto idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; });
44 | 
45 |     if (k_clusters_) {
46 |         const auto top_k = std::min(k_clusters_, (int)idxs.size());
47 |         idxs.resize(top_k);
48 | 
49 |         std::vector<LaunchSpec> s_2;
50 |         for (const auto& idx : idxs) {
51 |             auto& cluster = clusters[idx];
52 |             // Skip cluster leader
53 |             for (size_t j = 1; j < cluster.size(); ++j) {
54 |                 s_2.push_back(cluster[j]);
55 |             }
56 |         }
57 | 
58 |         // std::cout << "#s_2=" << s_2.size() << "\n";
59 | 
60 |         auto m_2 = measurer_.Measure(s_2, launcher, stream);
61 |         // Merge measurements of the 2 runs
62 |         m_2.insert(m_2.end(), m_1.begin(), m_1.end());
63 |         s_2.insert(s_2.end(), s_1.begin(), s_1.end());
64 |         m_1.swap(m_2);
65 |         s_1.swap(s_2);
66 |     }
67 | 
68 |     idxs = ArgSort(m_1.size(), [&](int i, int j) { return m_1[i].mean < m_1[j].mean; });
69 | 
70 |     std::vector<LaunchSpec> ret;
71 |     for (const auto& i : idxs) {
72 |         s_1[i].measured = m_1[i].mean;
73 |         ret.push_back(s_1[i]);
74 |     }
75 | 
76 |     return ret;
77 | }
78 | 
79 | }  // namespace turbomind::gemm
80 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/gemm_test.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/attention/quantization.h"
 4 | 
 5 | #include "src/turbomind/kernels/gemm/convert_v2.h"
 6 | #include "src/turbomind/kernels/gemm/gemm.h"
 7 | #include "src/turbomind/kernels/gemm/gpu_metric.h"
 8 | #include "src/turbomind/kernels/gemm/kernel.h"
 9 | #include "src/turbomind/kernels/gemm/test/models.h"
10 | #include "src/turbomind/kernels/gemm/test/quantization.h"
11 | #include "src/turbomind/kernels/gemm/test/test_utils.h"
12 | #include "src/turbomind/kernels/gemm/test/testbed.h"
13 | #include "src/turbomind/kernels/gemm/types.h"
14 | #include <fstream>
15 | #include <limits>
16 | #include <thrust/universal_vector.h>
17 | 
18 | #include <numeric>
19 | #include <random>
20 | #include <type_traits>
21 | 
22 | using namespace turbomind;
23 | using namespace gemm;
24 | using thrust::universal_vector;
25 | 
26 | cublasHandle_t cublas_handle{};
27 | 
28 | void ComputeRefCpu(half* C, const half* A, const half* B, int m, int n, int k)
29 | {
30 |     for (int mm = 0; mm < m; ++mm) {
31 |         for (int nn = 0; nn < n; ++nn) {
32 |             float c = 0;
33 |             for (int kk = 0; kk < k; ++kk) {
34 |                 c += (float)A[mm * k + kk] * (float)B[nn * k + kk];
35 |             }
36 |             C[mm * n + nn] = c;
37 |         }
38 |     }
39 | }
40 | 
41 | static int g_check = 0;
42 | 
43 | void Run(int batch_size, int output_dims, int input_dims, int g = 128)
44 | {
45 |     auto& test = get_test();
46 |     int   m    = batch_size;
47 |     int   n    = output_dims;
48 |     int   k    = input_dims;
49 |     if (get_test().kBatchDim == 1) {
50 |         std::swap(m, n);
51 |     }
52 |     std::cerr << "m" << m << "n" << n << "k" << k << "\n";
53 |     test.Initialize(m, n, k, g, 0);
54 | 
55 |     if (g_check) {
56 |         test.Check();
57 |     }
58 |     else {
59 |         for (int i = 0; i < 10; ++i) {
60 |             test.Run();
61 |         }
62 |         test.CompareC();
63 |     }
64 | }
65 | 
66 | int main(int argc, char* argv[])
67 | {
68 |     g_check = 0;
69 |     Run(16384, 16384, 16384);
70 | 
71 |     // g_check = 1;
72 |     // std::vector<int> bsz(1024);
73 |     // {
74 |     //     std::iota(bsz.begin(), bsz.end(), 1);
75 |     //     std::random_device rd;
76 |     //     std::mt19937       g(rd());
77 |     //     std::shuffle(bsz.begin() + 1, bsz.end(), g);
78 |     // }
79 |     // for (const auto& b : bsz) {
80 |     //     for (const auto& [out, in] : config) {
81 |     //         Run(b, out, in);
82 |     //     }
83 |     // }
84 | 
85 |     if (auto ec = cudaDeviceSynchronize(); ec != cudaSuccess) {
86 |         std::cerr << "un-clean exit: " << cudaGetErrorString(ec) << "\n";
87 |     }
88 | 
89 |     return 0;
90 | }
91 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/unpack.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/core/array_ops.h"
 4 | #include "src/turbomind/kernels/core/common.h"
 5 | #include "src/turbomind/kernels/core/data_type.h"
 6 | #include <iostream>
 7 | 
 8 | namespace turbomind {
 9 | 
10 | namespace {
11 | 
12 | __device__ void atomic_assign_u4(uint32_t* address, uint32_t index, uint32_t value)
13 | {
14 |     uint32_t old = *address;
15 |     uint32_t assumed;
16 |     do {
17 |         assumed      = old;
18 |         uint32_t tmp = (assumed & ~(0xfu << (index * 4u))) | (value << (index * 4u));
19 |         old          = atomicCAS(address, assumed, tmp);
20 |     } while (assumed != old);
21 | }
22 | 
23 | __device__ uint32_t read_u4(const uint32_t* address, uint32_t index)
24 | {
25 |     return (*address >> (index * 4u)) & 0xfu;
26 | }
27 | 
28 | template<int... Ds>
29 | __global__ void permute_u4(uint* dst, const uint* src, Array<int, sizeof...(Ds)> dims)
30 | {
31 |     constexpr int N = sizeof...(Ds);
32 | 
33 |     size_t count = 1;
34 |     PRAGMA_UNROLL
35 |     for (int i = 0; i < N; ++i) {
36 |         count *= dims[i];
37 |     }
38 | 
39 |     constexpr int order[] = {Ds...};
40 | 
41 |     for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {
42 | 
43 |         int indices[N]{};
44 | 
45 |         PRAGMA_UNROLL
46 |         for (int j = N - 1, ii = i; j >= 0; --j) {
47 |             indices[j] = ii % dims[j];
48 |             ii /= dims[j];
49 |         }
50 | 
51 |         auto data = read_u4(src + i / 8, i % 8);
52 | 
53 |         int index = 0;
54 | 
55 |         PRAGMA_UNROLL
56 |         for (int j = N - 1, stride = 1; j >= 0; --j) {
57 |             index += indices[order[j]] * stride;
58 |             stride *= dims[order[j]];
59 |         }
60 | 
61 |         atomic_assign_u4(dst + index / 8, index % 8, data);
62 |     }
63 | }
64 | 
65 | }  // namespace
66 | 
67 | // col-major interleaved
68 | void unpack_awq_gemm(uint4_t* dst, const uint4_t* src, int rows, int cols, cudaStream_t st)
69 | {
70 |     Array<int, 4> shape{cols, rows / 8, 2, 4};
71 |     permute_u4<0, 1, 3, 2><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape);
72 | }
73 | 
74 | void transpose_u4(uint4_t* dst, const uint4_t* src, int s, int c, cudaStream_t st)
75 | {
76 |     if (s % 8 || c % 8) {
77 |         std::cerr << "transpose_u4: invalid shape (" << s << "," << c << "), must be multiple of 8" << std::endl;
78 |         return;
79 |     }
80 |     Array<int, 2> shape{s, c};
81 |     permute_u4<1, 0><<<512, 512, 0, st>>>((uint*)dst, (const uint*)src, shape);
82 | }
83 | 
84 | // load -> unpack -> extend_to_u8 -> manipulation -> compat_to_u4 -> store
85 | // load -> extend_to_u16 -> convert -> run
86 | 
87 | }  // namespace turbomind
88 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/cta_map.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/common.h"
 6 | #include "src/turbomind/kernels/core/math.h"
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | struct CtaMap {
11 | 
12 |     TM_HOST_DEVICE static int3 get_tiled_shape(int m, int n, int k, int cta_m, int cta_n, int split_cnt)
13 |     {
14 |         return {(m + cta_m - 1) / cta_m, (n + cta_n - 1) / cta_n, split_cnt};
15 |     }
16 | 
17 |     TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int N)
18 |     {
19 |         auto n = tiled_shape.y;
20 |         if (N >= 32 && n >= 24)
21 |             return 5;
22 |         if (N >= 16 && n >= 12)
23 |             return 4;
24 |         if (N >= 8 && n >= 6)
25 |             return 3;
26 |         if (N >= 4 && n >= 3)
27 |             return 2;
28 |         if (N >= 2 && n >= 2)
29 |             return 1;
30 |         return 0;
31 |     }
32 | 
33 |     TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile)
34 |     {
35 |         int tile = 1 << log_tile;
36 |         return {static_cast<unsigned>(tiled_shape.x * tile),
37 |                 static_cast<unsigned>((tiled_shape.y + tile - 1) / tile),
38 |                 static_cast<unsigned>(tiled_shape.z)};
39 |     }
40 | 
41 |     TM_DEVICE static int3 get_tile_offset(int log_tile)
42 |     {
43 |         int block_idx_x = blockIdx.x;
44 |         int block_idx_y = blockIdx.y;
45 |         int block_idx_z = blockIdx.z;
46 |         return {(block_idx_x >> log_tile),  //
47 |                 (block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)),
48 |                 block_idx_z};
49 |     }
50 | };
51 | 
52 | struct CtaMapN: public CtaMap {
53 |     TM_HOST_DEVICE static dim3 get_grid_shape(int3 tiled_shape, int log_tile)
54 |     {
55 |         int tile = 1 << log_tile;
56 |         return {static_cast<unsigned>(tiled_shape.y * tile),               // n * tile
57 |                 static_cast<unsigned>((tiled_shape.x + tile - 1) / tile),  // m / tile
58 |                 static_cast<unsigned>(tiled_shape.z)};
59 |     }
60 |     TM_HOST_DEVICE static int get_log_tile(int3 tiled_shape, int M)
61 |     {
62 |         auto m = tiled_shape.x;
63 |         if (M >= 32 && m >= 24)
64 |             return 5;
65 |         if (M >= 16 && m >= 12)
66 |             return 4;
67 |         if (M >= 8 && m >= 6)
68 |             return 3;
69 |         if (M >= 4 && m >= 3)
70 |             return 2;
71 |         if (M >= 2 && m >= 2)
72 |             return 1;
73 |         return 0;
74 |     }
75 |     TM_DEVICE static int3 get_tile_offset(int log_tile)
76 |     {
77 |         int block_idx_x = blockIdx.x;
78 |         int block_idx_y = blockIdx.y;
79 |         int block_idx_z = blockIdx.z;
80 |         return {(block_idx_y << log_tile) + (block_idx_x & ((1 << log_tile) - 1)),  //
81 |                 (block_idx_x >> log_tile),
82 |                 block_idx_z};
83 |     }
84 | };
85 | 
86 | }  // namespace turbomind::gemm
87 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/tensor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_fp16.h>
 4 | #include <cuda_runtime_api.h>
 5 | #include <numeric>
 6 | #include <stdlib.h>
 7 | #include <string>
 8 | // #include <sys/stat.h>
 9 | // #include <sys/types.h>
10 | #include <unordered_map>
11 | #include <vector>
12 | 
13 | namespace turbomind {
14 | 
15 | typedef enum datatype_enum
16 | {
17 |     TYPE_INVALID,
18 |     TYPE_BOOL,
19 |     TYPE_UINT8,
20 |     TYPE_UINT16,
21 |     TYPE_UINT32,
22 |     TYPE_UINT64,
23 |     TYPE_INT8,
24 |     TYPE_INT16,
25 |     TYPE_INT32,
26 |     TYPE_INT64,
27 |     TYPE_FP16,
28 |     TYPE_FP32,
29 |     TYPE_FP64,
30 |     TYPE_BYTES,
31 |     TYPE_BF16
32 | } DataType;
33 | 
34 | typedef enum memorytype_enum
35 | {
36 |     MEMORY_CPU,
37 |     MEMORY_CPU_PINNED,
38 |     MEMORY_GPU
39 | } MemoryType;
40 | 
41 | struct Tensor {
42 |     MemoryType          where;
43 |     DataType            type;
44 |     std::vector<size_t> shape;
45 |     const void*         data;
46 | 
47 |     Tensor(): where(MEMORY_CPU), type(TYPE_INVALID), shape({}), data(nullptr) {}
48 |     Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
49 |         where(_where), type(_type), shape(_shape), data(_data)
50 |     {
51 |     }
52 | 
53 |     size_t size() const
54 |     {
55 |         if (data == nullptr || shape.size() == 0) {
56 |             return 0;
57 |         }
58 |         return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
59 |     }
60 | 
61 |     size_t sizeBytes() const
62 |     {
63 |         return size() * typeSize();
64 |     }
65 | 
66 |     size_t typeSize() const
67 |     {
68 |         static const std::unordered_map<DataType, size_t> type_map{{TYPE_BOOL, sizeof(bool)},
69 |                                                                    {TYPE_BYTES, sizeof(char)},
70 |                                                                    {TYPE_UINT8, sizeof(uint8_t)},
71 |                                                                    {TYPE_UINT16, sizeof(uint16_t)},
72 |                                                                    {TYPE_UINT32, sizeof(uint32_t)},
73 |                                                                    {TYPE_UINT64, sizeof(uint64_t)},
74 |                                                                    {TYPE_INT8, sizeof(int8_t)},
75 |                                                                    {TYPE_INT16, sizeof(int16_t)},
76 |                                                                    {TYPE_INT32, sizeof(int32_t)},
77 |                                                                    {TYPE_INT64, sizeof(int64_t)},
78 | #ifdef ENABLE_BF16
79 |                                                                    {TYPE_BF16, sizeof(__nv_bfloat16)},
80 | #endif
81 |                                                                    {TYPE_FP16, sizeof(half)},
82 |                                                                    {TYPE_FP32, sizeof(float)},
83 |                                                                    {TYPE_FP64, sizeof(double)}};
84 |         return type_map.at(type);
85 |     }
86 | };
87 | }  // namespace turbomind
88 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/smem_copy_simt.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/array_ops.h"
  6 | #include "src/turbomind/kernels/core/common.h"
  7 | #include "src/turbomind/kernels/gemm/simt.h"
  8 | #include "src/turbomind/kernels/gemm/smem_copy.h"
  9 | #include "src/turbomind/kernels/gemm/types.h"
 10 | 
 11 | namespace turbomind::gemm {
 12 | 
 13 | template<class T, int K_>
 14 | struct SmemCopy_MMA_SIMT_A {
 15 |     static constexpr int M = simt::OP_M;
 16 |     static constexpr int K = simt::OP_K;
 17 | 
 18 |     static constexpr int OP_N = simt::OP_N;
 19 | 
 20 |     static constexpr int kFragNum = 1;
 21 | 
 22 |     using Frag = Array<T, K>;
 23 | 
 24 |     __device__ static int2 get_offset(int thread_idx)
 25 |     {
 26 |         const int lane_id = thread_idx % WARP_SIZE;
 27 |         return {lane_id / OP_N, 0};
 28 |     }
 29 | 
 30 |     template<class S, class D>
 31 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool)  // -> (m, k)
 32 |     {
 33 |         Lds(*(Frag*)dst_ptr, (S &&) src_ptr);
 34 |     }
 35 | 
 36 |     __device__ static int2 unique(int thread_idx, int pack_idx)  // -> (unique id, repeat id)
 37 |     {
 38 |         const int lane_id = thread_idx % WARP_SIZE;
 39 |         return {pack_idx * M + lane_id / OP_N, lane_id % OP_N};
 40 |     }
 41 | };
 42 | 
 43 | template<class T, int K_>
 44 | struct SmemCopy_MMA_SIMT_B {
 45 |     static constexpr int M = simt::OP_N;
 46 |     static constexpr int K = simt::OP_K;
 47 | 
 48 |     static constexpr int OP_N = simt::OP_N;
 49 | 
 50 |     static constexpr int kFragNum = 1;
 51 | 
 52 |     using Frag = Array<T, K>;
 53 | 
 54 |     __device__ static int2 get_offset(int thread_idx)  // -> (m, k)
 55 |     {
 56 |         const int lane_id = thread_idx % WARP_SIZE;
 57 |         return {lane_id % OP_N, 0};
 58 |     }
 59 | 
 60 |     template<class S, class D>
 61 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool)
 62 |     {
 63 |         Lds(*(Frag*)dst_ptr, (S &&) src_ptr);
 64 |     }
 65 | 
 66 |     __device__ static int2 unique(int thread_idx, int pack_idx)  // -> (unique id, repeat id)
 67 |     {
 68 |         const int lane_id = thread_idx % WARP_SIZE;
 69 |         return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N};
 70 |     }
 71 | };
 72 | 
 73 | template<class T, int K_>
 74 | struct SmemCopy_MMA_SIMT_V {
 75 |     static constexpr int M = simt::OP_N;
 76 |     static constexpr int K = K_;
 77 | 
 78 |     static constexpr int OP_N = simt::OP_N;
 79 | 
 80 |     static constexpr int kFragNum = 1;
 81 | 
 82 |     using Frag = Array<T, 1>;
 83 | 
 84 |     __device__ static int2 unique(int thread_idx, int pack_idx)
 85 |     {
 86 |         const int lane_id = thread_idx % WARP_SIZE;
 87 |         return {pack_idx * OP_N + lane_id % OP_N, lane_id / OP_N};
 88 |     }
 89 | 
 90 |     __device__ static int2 get_offset(int thread_idx)  // -> (m, k)
 91 |     {
 92 |         return {unique(thread_idx, 0).x, 0};
 93 |     }
 94 | 
 95 |     template<class S, class D>
 96 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool mask)
 97 |     {
 98 |         Lds(*(Frag*)dst_ptr, src_ptr);
 99 |     }
100 | };
101 | 
102 | }  // namespace turbomind::gemm
103 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/config_sm70_s884.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/arch.h"
 6 | #include "src/turbomind/kernels/gemm/arch/mma_sm70.h"
 7 | #include "src/turbomind/kernels/gemm/arch/operand_sm70_s884.h"
 8 | #include "src/turbomind/kernels/gemm/cta_map.h"
 9 | #include "src/turbomind/kernels/gemm/epilogue.h"
10 | #include "src/turbomind/kernels/gemm/gemm_universal.h"
11 | #include "src/turbomind/kernels/gemm/iterator_sm70.h"
12 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h"
13 | #include "src/turbomind/kernels/gemm/thread_group_map.h"
14 | #include "src/turbomind/kernels/gemm/tiled_mma.h"
15 | #include "src/turbomind/kernels/gemm/types.h"
16 | 
17 | namespace turbomind::gemm::sm70_s884 {
18 | 
19 | template<class A, class TransformA, class U, class B, class TransformB, class V, Order order_c, class Tc>
20 | struct Sm70_s884 {
21 | 
22 |     static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K);
23 | 
24 |     static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum;
25 |     static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum;
26 |     static constexpr int SMEM_K = A::SmemCopyAtom::K;
27 | 
28 |     template<int CTA_M,
29 |              int CTA_N,
30 |              int CTA_K,
31 |              int TG_M,
32 |              int TG_N,
33 |              int TG_K,
34 |              class PolicyA,
35 |              class PolicyB,
36 |              int  Stages,
37 |              bool SplitK,
38 |              int  GroupSizeU = 1,
39 |              int  GroupSizeV = 1,
40 |              int  TILE_C_M_  = -1,
41 |              int  TILE_C_N_  = -1>
42 |     struct Type {
43 | 
44 |         // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom)
45 |         using MMA_Atom = SM70_MMA_884;
46 | 
47 |         using Partition = Blocked<TG_M, TG_N, kColMajor>;
48 |         using MMA_Map   = MMA_Map<CTA_M, CTA_N, CTA_K, SMEM_M, SMEM_N, SMEM_K, Partition, TG_K>;
49 | 
50 |         using MMA = Tiled_MMA_v2<MMA_Atom, MMA_Map>;
51 | 
52 |         using Mainloop = MainloopSm70<MMA,
53 |                                       A,
54 |                                       IteratorSm70<PolicyA>,
55 |                                       TransformA,
56 |                                       U,
57 |                                       GroupSizeU,
58 |                                       B,
59 |                                       IteratorSm70<PolicyB>,
60 |                                       TransformB,
61 |                                       V,
62 |                                       GroupSizeV,
63 |                                       Stages,
64 |                                       true>;  // FusePrefetch_
65 | 
66 |         static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_;
67 |         static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_;
68 | 
69 |         using Epilogue = gemm::Epilogue_<Tc,
70 |                                          CTA_M,
71 |                                          CTA_N,
72 |                                          TILE_C_M,
73 |                                          TILE_C_N,
74 |                                          MMA::kThreadCount,
75 |                                          Rearrange<MMA>,
76 |                                          Operand_C<float, order_c>,
77 |                                          SplitK>;
78 | 
79 |         using Kernel = GemmUniversal<Sm70, Mainloop, Epilogue, CtaMap>;
80 |     };
81 | };
82 | 
83 | }  // namespace turbomind::gemm::sm70_s884
84 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/gemm/desc.h"
  6 | #include "src/turbomind/kernels/gemm/types.h"
  7 | #include <array>
  8 | #include <cuda_runtime.h>
  9 | #include <string>
 10 | #include <utility>
 11 | #include <vector>
 12 | 
 13 | namespace turbomind::gemm {
 14 | 
 15 | struct KernelMetric {
 16 |     int64_t mio_cost;
 17 |     int64_t mma_cost;
 18 | };
 19 | 
 20 | class Kernel {
 21 | public:
 22 |     virtual ~Kernel() = default;
 23 | 
 24 |     virtual int Launch(const Operation&    operation,
 25 |                        float               alpha,
 26 |                        const void*         A,
 27 |                        const MatrixLayout& Adesc,
 28 |                        const void*         U,
 29 |                        const MatrixLayout& Udesc,
 30 |                        const void*         B,
 31 |                        const MatrixLayout& Bdesc,
 32 |                        const void*         V,
 33 |                        const MatrixLayout& Vdesc,
 34 |                        float               beta,
 35 |                        const void*         C,
 36 |                        const MatrixLayout& Cdesc,
 37 |                        void*               D,
 38 |                        const MatrixLayout& Ddesc,
 39 |                        int                 swizzle,
 40 |                        int                 splits,
 41 |                        Workspace&          workspace,
 42 |                        cudaStream_t        stream) = 0;
 43 | 
 44 |     // virtual because different implementation may have different workspace requeirements
 45 |     virtual int GetMaxSplits(int m, int n, int k, size_t barrier_size, size_t partials_size) = 0;
 46 | 
 47 |     // true if this kernel can be used to compute the gemm
 48 |     bool is_feasible(const GemmDesc& desc) const noexcept;
 49 | 
 50 |     std::vector<std::pair<int, KernelMetric>>
 51 |     Estimate_v2(std::array<int, 3> size, int max_splits, int max_waves, int sm_count) const;
 52 | 
 53 |     virtual int GetSwizzle(int m, int n, int k, int splits, int swizzle) = 0;
 54 | 
 55 |     const KernelDesc& desc() const noexcept
 56 |     {
 57 |         return desc_;
 58 |     }
 59 | 
 60 |     int3 cta_tile_size() const noexcept
 61 |     {
 62 |         return desc_.cta_tile;
 63 |     }
 64 | 
 65 |     int3 warp_tile_size() const noexcept
 66 |     {
 67 |         return desc_.mma_tile;
 68 |     }
 69 | 
 70 |     int chunk_size_k() const noexcept
 71 |     {
 72 |         return chunk_size_k_;
 73 |     }
 74 | 
 75 |     int stages() const noexcept
 76 |     {
 77 |         return desc_.stages;
 78 |     }
 79 | 
 80 |     bool split_k() const noexcept
 81 |     {
 82 |         return desc_.split_k;
 83 |     }
 84 | 
 85 |     int arch() const noexcept
 86 |     {
 87 |         return desc_.arch;
 88 |     }
 89 | 
 90 |     int smem_size() const noexcept
 91 |     {
 92 |         return smem_size_;
 93 |     }
 94 | 
 95 |     std::string name() const
 96 |     {
 97 |         return name_;
 98 |     }
 99 | 
100 | protected:
101 |     std::string GetName() const;
102 | 
103 |     KernelDesc desc_;
104 | 
105 |     int chunk_size_k_;
106 |     int smem_size_;
107 | 
108 |     std::string name_;
109 | };
110 | 
111 | struct ClusteringParam {
112 |     bool cache_policy;
113 |     bool max_active_ctas;
114 | };
115 | 
116 | std::vector<std::vector<LaunchSpec>> Cluster(const std::vector<LaunchSpec>& specs, const ClusteringParam& param);
117 | 
118 | }  // namespace turbomind::gemm
119 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/config_sm75_s16816.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch.h"
 4 | #include "src/turbomind/kernels/gemm/arch/mma_sm80.h"
 5 | #include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h"
 6 | #include "src/turbomind/kernels/gemm/cta_map.h"
 7 | #include "src/turbomind/kernels/gemm/epilogue.h"
 8 | #include "src/turbomind/kernels/gemm/gemm_universal.h"
 9 | #include "src/turbomind/kernels/gemm/iterator_sm70.h"
10 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h"
11 | #include "src/turbomind/kernels/gemm/thread_group_map.h"
12 | #include "src/turbomind/kernels/gemm/tiled_mma.h"
13 | #include "src/turbomind/kernels/gemm/types.h"
14 | 
15 | namespace turbomind::gemm {
16 | 
17 | namespace sm75_s16816 {
18 | 
19 | using namespace sm80_s16816;
20 | 
21 | template<class A, class TransformA, class U, class B, class TransformB, class V, Order order_c, class Tc>
22 | struct Sm75_s16816 {
23 | 
24 |     static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K);
25 | 
26 |     static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum;
27 |     static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum;
28 |     static constexpr int SMEM_K = A::SmemCopyAtom::K;
29 | 
30 |     template<int CTA_M,
31 |              int CTA_N,
32 |              int CTA_K,
33 |              int TG_M,
34 |              int TG_N,
35 |              int TG_K,
36 |              class PolicyA,
37 |              class PolicyB,
38 |              int  Stages,
39 |              bool SplitK,
40 |              int  GroupSizeU = 1,
41 |              int  GroupSizeV = 1,
42 |              int  TILE_C_M_  = -1,
43 |              int  TILE_C_N_  = -1>
44 |     struct Type {
45 |         // Raked partition dont support `Pack_M > 1`
46 |         using Partition = Blocked<TG_M, TG_N, kColMajor>;
47 |         using MMA_Map   = MMA_Map<CTA_M, CTA_N, CTA_K, SMEM_M, SMEM_N, SMEM_K, Partition, TG_K>;
48 |         using MMA       = Tiled_MMA_v2<SM80_MMA_16x8x16_F32_F16_F16_F32_TN, MMA_Map>;
49 | 
50 |         using Mainloop = MainloopSm70<MMA,
51 |                                       A,
52 |                                       IteratorSm70<PolicyA>,
53 |                                       TransformA,
54 |                                       U,
55 |                                       GroupSizeU,
56 |                                       B,
57 |                                       IteratorSm70<PolicyB>,
58 |                                       TransformB,
59 |                                       V,
60 |                                       GroupSizeV,
61 |                                       Stages,
62 |                                       true>;  // FusePrefetch_
63 | 
64 |         static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_;
65 |         static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_;
66 | 
67 |         using Epilogue = gemm::Epilogue_<Tc,
68 |                                          CTA_M,
69 |                                          CTA_N,
70 |                                          TILE_C_M,
71 |                                          TILE_C_N,
72 |                                          MMA::kThreadCount,
73 |                                          Rearrange<MMA>,
74 |                                          Operand_C<float, order_c>,
75 |                                          SplitK>;
76 | 
77 |         using Kernel = GemmUniversal<Sm75, Mainloop, Epilogue, CtaMap>;
78 |     };
79 | };
80 | 
81 | }  // namespace sm75_s16816
82 | 
83 | }  // namespace turbomind::gemm
84 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/smem.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/array.h"
  6 | #include "src/turbomind/kernels/core/common.h"
  7 | #include <cassert>
  8 | 
  9 | namespace turbomind {
 10 | 
 11 | __inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr)
 12 | {
 13 |     return (uint32_t)__cvta_generic_to_shared(ptr);
 14 | }
 15 | 
 16 | __inline__ __device__ void ldmatrix_m8n8_x4_b16(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr)
 17 | {
 18 | #if TURBOMIND_ARCH_SM75
 19 |     asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
 20 |                  : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
 21 |                  : "r"(smem_int_ptr));
 22 | #else
 23 |     assert(TURBOMIND_ARCH_SM75);
 24 | #endif
 25 | }
 26 | 
 27 | __inline__ __device__ void ldsm_x4_trans(uint& d0, uint& d1, uint& d2, uint& d3, uint32_t smem_int_ptr)
 28 | {
 29 | #if TURBOMIND_ARCH_SM75
 30 |     asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n"
 31 |                  : "=r"(d0), "=r"(d1), "=r"(d2), "=r"(d3)
 32 |                  : "r"(smem_int_ptr));
 33 | #else
 34 |     assert(TURBOMIND_ARCH_SM75);
 35 | #endif
 36 | }
 37 | 
 38 | __inline__ __device__ void ldmatrix_m8n8_x2_b16(uint& d0, uint& d1, uint32_t smem_int_ptr)
 39 | {
 40 | #if TURBOMIND_ARCH_SM75
 41 |     asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(d0), "=r"(d1) : "r"(smem_int_ptr));
 42 | #else
 43 |     assert(TURBOMIND_ARCH_SM75);
 44 | #endif
 45 | }
 46 | 
 47 | __inline__ __device__ void ldsm_x2_trans(uint& d0, uint& d1, uint32_t smem_int_ptr)
 48 | {
 49 | #if TURBOMIND_ARCH_SM75
 50 |     asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0,%1}, [%2];\n"
 51 |                  : "=r"(d0), "=r"(d1)
 52 |                  : "r"(smem_int_ptr));
 53 | #else
 54 |     assert(TURBOMIND_ARCH_SM75);
 55 | #endif
 56 | }
 57 | 
 58 | __inline__ __device__ void ldmatrix_m8n8_x1_b16(uint& d0, uint32_t smem_int_ptr)
 59 | {
 60 | #if TURBOMIND_ARCH_SM75
 61 |     asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr));
 62 | #else
 63 |     assert(TURBOMIND_ARCH_SM75);
 64 | #endif
 65 | }
 66 | 
 67 | __inline__ __device__ void ldsm_x1_trans(uint& d0, uint32_t smem_int_ptr)
 68 | {
 69 | #if TURBOMIND_ARCH_SM75
 70 |     asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 %0, [%1];\n" : "=r"(d0) : "r"(smem_int_ptr));
 71 | #else
 72 |     assert(TURBOMIND_ARCH_SM75);
 73 | #endif
 74 | }
 75 | 
 76 | __inline__ __device__ void ldsm_x4(Array<uint32_t, 4>& d, uint32_t smem_int_ptr)
 77 | {
 78 |     ldmatrix_m8n8_x4_b16(d[0], d[1], d[2], d[3], smem_int_ptr);
 79 | }
 80 | 
 81 | __inline__ __device__ void ldsm_x2(Array<uint32_t, 2>& d, uint32_t smem_int_ptr)
 82 | {
 83 |     ldmatrix_m8n8_x2_b16(d[0], d[1], smem_int_ptr);
 84 | }
 85 | 
 86 | __inline__ __device__ void ldsm_x1(Array<uint32_t, 1>& d, uint32_t smem_int_ptr)
 87 | {
 88 |     ldmatrix_m8n8_x1_b16(d[0], smem_int_ptr);
 89 | }
 90 | 
 91 | __inline__ __device__ void ldsm_x4_trans(Array<uint32_t, 4>& d, uint32_t smem_int_ptr)
 92 | {
 93 |     ldsm_x4_trans(d[0], d[1], d[2], d[3], smem_int_ptr);
 94 | }
 95 | 
 96 | __inline__ __device__ void ldsm_x2_trans(Array<uint32_t, 2>& d, uint32_t smem_int_ptr)
 97 | {
 98 |     ldsm_x2_trans(d[0], d[1], smem_int_ptr);
 99 | }
100 | 
101 | __inline__ __device__ void ldsm_x1_trans(Array<uint32_t, 1>& d, uint32_t smem_int_ptr)
102 | {
103 |     ldsm_x1_trans(d[0], smem_int_ptr);
104 | }
105 | 
106 | }  // namespace turbomind
107 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/utils.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/gemm/simt.h"
  6 | #include "src/turbomind/kernels/gemm/types.h"
  7 | 
  8 | namespace turbomind::gemm {
  9 | 
 10 | __host__ __device__ constexpr Order transpose(Order order)
 11 | {
 12 |     return order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
 13 | }
 14 | 
 15 | __host__ __device__ constexpr MatrixLayout transpose(MatrixLayout x)
 16 | {
 17 |     auto tmp = x.cols;  // `std::swap` is not constexpr
 18 |     x.cols   = x.rows;
 19 |     x.rows   = tmp;
 20 |     x.order  = transpose(x.order);
 21 |     return x;
 22 | }
 23 | 
 24 | template<Order order>
 25 | __host__ __device__ constexpr int2 mk2cs(int m, int k)
 26 | {
 27 |     if constexpr (order == Order::kRowMajor) {
 28 |         return {k, m};
 29 |     }
 30 |     else {
 31 |         return {m, k};
 32 |     }
 33 | }
 34 | 
 35 | template<Order order>
 36 | __host__ __device__ constexpr int2 mk2cs(int2 mk)
 37 | {
 38 |     return mk2cs<order>(mk.x, mk.y);
 39 | }
 40 | 
 41 | template<Order order>
 42 | __host__ __device__ constexpr int2 cs2mk(int c, int s)
 43 | {
 44 |     if constexpr (order == Order::kRowMajor) {
 45 |         return {s, c};
 46 |     }
 47 |     else {
 48 |         return {c, s};
 49 |     }
 50 | }
 51 | 
 52 | template<Order order>
 53 | __host__ __device__ constexpr int2 _kn2cs(int k, int n)
 54 | {
 55 |     if constexpr (order == Order::kColMajor) {
 56 |         return {k, n};
 57 |     }
 58 |     else {
 59 |         return {n, k};
 60 |     }
 61 | }
 62 | 
 63 | template<class Index>
 64 | __host__ __device__ constexpr Index cs2idx(int2 cs, Index ld)
 65 | {
 66 |     return ld * cs.y + cs.x;
 67 | }
 68 | 
 69 | template<MMA_Tag mma, Op_Tag op, int num, Order order>
 70 | struct PackingImpl {
 71 |     __host__ __device__ static constexpr int2 apply(int2 mk)
 72 |     {
 73 |         return mk;
 74 |     }
 75 | };
 76 | 
 77 | template<Pack pack, Order order>
 78 | struct Packing_v2: PackingImpl<get_mma_tag(pack), get_operand_tag(pack), get_pack_num(pack), order> {
 79 | };
 80 | 
 81 | /// TODO: move packing utility to arch/smem_copy_xxx
 82 | 
 83 | template<int num>
 84 | struct PackingImpl<HMMA_16816, OPERAND_A, num, kRowMajor> {
 85 |     __host__ __device__ static constexpr int2 apply(int2 mk)
 86 |     {
 87 |         return {mk.x / 16 / num, mk.y * 16 * num};
 88 |     }
 89 | };
 90 | 
 91 | template<int num>
 92 | struct PackingImpl<HMMA_16816, OPERAND_A, num, kColMajor> {
 93 |     __host__ __device__ static constexpr int2 apply(int2 mk)
 94 |     {
 95 |         return {mk.x * 16, mk.y / 16};
 96 |     }
 97 | };
 98 | 
 99 | template<int num, Order order>
100 | struct PackingImpl<HMMA_16816, OPERAND_B, num, order>: PackingImpl<HMMA_16816, OPERAND_A, num, order> {
101 | };
102 | 
103 | template<int num>
104 | struct PackingImpl<HMMA_SIMT, OPERAND_A, num, kRowMajor> {
105 |     __host__ __device__ static constexpr int2 apply(int2 mk)
106 |     {
107 |         return {mk.x / (simt::OP_M * num), mk.y * simt::OP_M * num};
108 |     }
109 | };
110 | 
111 | template<int num>
112 | struct PackingImpl<HMMA_SIMT, OPERAND_B, num, kRowMajor> {
113 |     __host__ __device__ static constexpr int2 apply(int2 mk)
114 |     {
115 |         return {mk.x / (simt::OP_N * num), mk.y * simt::OP_N * num};
116 |     }
117 | };
118 | 
119 | template<int num>
120 | struct PackingImpl<HMMA_884, OPERAND_B, num, kRowMajor> {
121 |     __host__ __device__ static constexpr int2 apply(int2 mk)
122 |     {
123 |         // return {mk.x / (16 * num), mk.y * 16 * num};
124 |         return {mk.x / (32 * num), mk.y * 32 * num};
125 |     }
126 | };
127 | 
128 | }  // namespace turbomind::gemm
129 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm90_s16816.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h"
 4 | #include "src/turbomind/kernels/gemm/registry.h"
 5 | #include "src/turbomind/kernels/gemm/transform.h"
 6 | #include "src/turbomind/kernels/gemm/types.h"
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | void Registry::f16_u4g128_f16_tnt_sm90_s16816()
11 | {
12 |     using namespace sm80_s16816;
13 |     using namespace cache_policy;
14 |     //////////////////////////////////////////////////////////////////////////////
15 |     // ! sm_90 + cp.async + evict policy = warp illegal instruction
16 |     //////////////////////////////////////////////////////////////////////////////
17 |     using D = cache_policy::Default;
18 | 
19 |     using C = Sm80_s16816<Sm90,
20 |                           Operand_A<half, kRowMajor>,          // A
21 |                           Transform_Default,                   // tarnsform A
22 |                           VoidOperand,                         // U
23 |                           Operand_B_Pack<uint4_t, kColMajor>,  // B
24 |                           Transform_HMMA_16816<1, 0>,          // transform B
25 |                           Operand_UV_Pack<uint32_t, true>,     // V
26 |                           kRowMajor,                           // order_C
27 |                           half>;                               // Tc
28 | 
29 |     // clang-format off
30 |     Add<C::Type<128, 256,  64, 1, 8, 1, D, D, 3, true, 1, 128>>();
31 |     Add<C::Type<128, 256,  32, 1, 8, 1, D, D, 3, true, 1, 128, 128, 128>>();
32 |     Add<C::Type<128, 256,  32, 1, 8, 1, D, D, 4, true, 1, 128, 128, 128>>();
33 |     Add<C::Type<128, 128,  32, 1, 4, 1, D, D, 3, true, 1, 128, 64, 128>>();
34 |     Add<C::Type<128, 128,  32, 1, 4, 1, D, D, 4, true, 1, 128, 64, 128>>();
35 |     Add<C::Type<128, 128,  64, 1, 4, 2, D, D, 3, true, 1, 128, 64, 128>>();
36 | 
37 |     Add<C::Type<96, 256,  32, 1, 8, 1, D, D, 4, true, 1, 128>>();
38 |     Add<C::Type<96, 256,  32, 1, 8, 1, D, D, 3, true, 1, 128>>();
39 |     Add<C::Type<96, 128,  32, 1, 4, 1, D, D, 4, true, 1, 128>>();
40 |     Add<C::Type<96, 128, 128, 1, 4, 2, D, D, 3, true, 1, 128>>();
41 | 
42 |     Add<C::Type<64, 256,  32, 1, 4, 1, D, D, 3, true, 1, 128, 64, 128>>();
43 |     Add<C::Type<64, 256,  32, 1, 4, 1, D, D, 4, true, 1, 128, 64, 128>>();
44 |     Add<C::Type<64, 128,  32, 1, 4, 1, D, D, 4, true, 1, 128>>();
45 |     Add<C::Type<64, 128,  64, 1, 4, 1, D, D, 3, true, 1, 128>>();
46 |     Add<C::Type<64, 128, 128, 1, 4, 2, D, D, 3, true, 1, 128>>();
47 |     Add<C::Type<64,  64,  64, 1, 2, 2, D, D, 6, true, 1, 128>>();
48 | 
49 |     Add<C::Type<48, 256,  64, 1, 4, 1, D, D, 3, true, 1, 128, 48, 128>>();
50 |     Add<C::Type<48, 128,  64, 1, 4, 1, D, D, 4, true, 1, 128>>();
51 |     Add<C::Type<48, 128, 128, 1, 4, 2, D, D, 3, true, 1, 128>>();
52 |     Add<C::Type<48,  64, 128, 1, 2, 2, D, D, 4, true, 1, 128>>();
53 | 
54 |     Add<C::Type<32, 256,  64, 1, 4, 1, D, D, 3, true, 1, 128>>();
55 |     Add<C::Type<32, 128,  64, 1, 4, 1, D, D, 4, true, 1, 128>>();
56 |     Add<C::Type<32, 128, 128, 1, 4, 2, D, D, 3, true, 1, 128>>();
57 |     Add<C::Type<32,  64, 128, 1, 2, 2, D, D, 3, true, 1, 128>>();
58 |     Add<C::Type<32,  64, 128, 1, 2, 2, D, D, 4, true, 1, 128>>();
59 | 
60 |     Add<C::Type<16, 128,  64, 1, 4, 1, D, D, 4, true, 1, 128>>();
61 |     Add<C::Type<16, 128, 128, 1, 4, 2, D, D, 3, true, 1, 128>>();
62 |     Add<C::Type<16, 128, 128, 1, 4, 2, D, D, 4, true, 1, 128>>();
63 |     Add<C::Type<16,  64, 128, 1, 2, 2, D, D, 3, true, 1, 128>>();
64 |     Add<C::Type<16,  64, 128, 1, 2, 2, D, D, 4, true, 1, 128>>();
65 |     // clang-format on
66 | }
67 | 
68 | }  // namespace turbomind::gemm
69 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/config_sm80_s16816.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/arch.h"
 6 | #include "src/turbomind/kernels/gemm/arch/mma_sm80.h"
 7 | #include "src/turbomind/kernels/gemm/arch/operand_sm80_s16816.h"
 8 | #include "src/turbomind/kernels/gemm/cta_map.h"
 9 | #include "src/turbomind/kernels/gemm/epilogue.h"
10 | #include "src/turbomind/kernels/gemm/gemm_universal.h"
11 | #include "src/turbomind/kernels/gemm/iterator_sm80.h"
12 | #include "src/turbomind/kernels/gemm/mainloop_sm80_v2.h"
13 | #include "src/turbomind/kernels/gemm/thread_group_map.h"
14 | #include "src/turbomind/kernels/gemm/tiled_mma.h"
15 | #include "src/turbomind/kernels/gemm/types.h"
16 | 
17 | namespace turbomind::gemm::sm80_s16816 {
18 | 
19 | template<class Arch,
20 |          class A,
21 |          class TransformA,
22 |          class U,
23 |          class B,
24 |          class TransformB,
25 |          class V,
26 |          Order order_c,
27 |          class Tc,
28 |          class CtaMap_ = CtaMap>
29 | struct Sm80_s16816 {
30 | 
31 |     static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K);
32 | 
33 |     static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum;
34 |     static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum;
35 |     static constexpr int SMEM_K = A::SmemCopyAtom::K;
36 | 
37 |     template<int CTA_M,
38 |              int CTA_N,
39 |              int CTA_K,
40 |              int TG_M,
41 |              int TG_N,
42 |              int TG_K,
43 |              class PolicyA,
44 |              class PolicyB,
45 |              int  Stages,
46 |              bool SplitK,
47 |              int  GroupSizeU   = 1,
48 |              int  GroupSizeV   = 1,
49 |              int  TILE_C_M_    = -1,
50 |              int  TILE_C_N_    = -1,
51 |              bool FusePrefecth = true>
52 | 
53 |     struct Type {
54 | 
55 |         // Raked partition dont support `Pack_M > 1`
56 |         using Partition = Blocked<TG_M, TG_N, kColMajor>;
57 |         using MMA_Map   = MMA_Map<CTA_M, CTA_N, CTA_K, SMEM_M, SMEM_N, SMEM_K, Partition, TG_K>;
58 |         using MMA       = Tiled_MMA_v2<SM80_MMA_16x8x16_F32_F16_F16_F32_TN, MMA_Map>;
59 | 
60 |         using Mainloop = MainloopSm80_v2<MMA,
61 |                                          A,
62 |                                          IteratorSm80<PolicyA>,
63 |                                          TransformA,
64 |                                          U,
65 |                                          GroupSizeU,
66 |                                          B,
67 |                                          IteratorSm80<PolicyB>,
68 |                                          TransformB,
69 |                                          V,
70 |                                          GroupSizeV,
71 |                                          Stages,
72 |                                          FusePrefecth>;
73 | 
74 |         static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_;
75 |         static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_;
76 | 
77 |         using Epilogue = gemm::Epilogue_<Tc,
78 |                                          CTA_M,
79 |                                          CTA_N,
80 |                                          TILE_C_M,
81 |                                          TILE_C_N,
82 |                                          MMA::kThreadCount,
83 |                                          Rearrange<MMA>,
84 |                                          Operand_C<float, order_c>,
85 |                                          SplitK>;
86 | 
87 |         using Kernel = GemmUniversal<Arch, Mainloop, Epilogue, CtaMap_>;
88 |     };
89 | };
90 | 
91 | }  // namespace turbomind::gemm::sm80_s16816
92 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/config_simt.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/arch.h"
 6 | #include "src/turbomind/kernels/gemm/arch/mma_simt.h"
 7 | #include "src/turbomind/kernels/gemm/arch/operand_simt.h"
 8 | #include "src/turbomind/kernels/gemm/cta_map.h"
 9 | #include "src/turbomind/kernels/gemm/gemm_universal.h"
10 | #include "src/turbomind/kernels/gemm/iterator_sm70.h"
11 | #include "src/turbomind/kernels/gemm/mainloop_sm70.h"
12 | #include "src/turbomind/kernels/gemm/thread_group_map.h"
13 | #include "src/turbomind/kernels/gemm/tiled_mma.h"
14 | #include "src/turbomind/kernels/gemm/types.h"
15 | 
16 | namespace turbomind::gemm {
17 | 
18 | namespace simt {
19 | 
20 | template<class A, class TransformA, class U, class B, class TransformB, class V, Order order_c, class Tc>
21 | struct Sm75_Simt {
22 | 
23 |     static_assert(A::SmemCopyAtom::K == B::SmemCopyAtom::K);
24 | 
25 |     static constexpr int SMEM_M = A::SmemCopyAtom::M / A::SmemCopyAtom::kFragNum;
26 |     static constexpr int SMEM_N = B::SmemCopyAtom::M / B::SmemCopyAtom::kFragNum;
27 |     static constexpr int SMEM_K = A::SmemCopyAtom::K;
28 | 
29 |     template<int CTA_M,
30 |              int CTA_N,
31 |              int CTA_K,
32 |              int TG_M,
33 |              int TG_N,
34 |              int TG_K,
35 |              class PolicyA,
36 |              class PolicyB,
37 |              int  Stages,
38 |              bool SplitK,
39 |              int  GroupSizeU = 1,
40 |              int  GroupSizeV = 1,
41 |              int  TILE_C_M_  = -1,
42 |              int  TILE_C_N_  = -1>
43 |     struct Type {
44 | 
45 |         // (TM, TN, TK) = R(MMA_Atom, SmemCopy_Atom)
46 |         using MMA_Atom = MMA_SIMT<half>;
47 | 
48 |         static constexpr int TM = MMA_Atom::M;
49 |         static constexpr int TN = MMA_Atom::N;
50 |         static constexpr int TK = MMA_Atom::K;
51 | 
52 |         using Partition = Blocked<TG_M, TG_N, kColMajor>;
53 | 
54 |         using MMA_Map = MMA_Map<CTA_M, CTA_N, CTA_K, SMEM_M, SMEM_N, SMEM_K, Partition, TG_K>;
55 |         using MMA     = Tiled_MMA_v2<MMA_Atom, MMA_Map>;
56 | 
57 |         // using MMA_Map = RakedThreadGroupMap<CTA_M, CTA_N, CTA_K, TM, TN, TK, WARP_CNT_M, WARP_CNT_N, WARP_CNT_K>;
58 | 
59 |         using Mainloop = MainloopSm70<MMA,
60 |                                       A,
61 |                                       IteratorSm70<PolicyA>,
62 |                                       TransformA,
63 |                                       U,
64 |                                       GroupSizeU,
65 |                                       B,
66 |                                       IteratorSm70<PolicyB>,
67 |                                       TransformB,
68 |                                       V,
69 |                                       GroupSizeV,
70 |                                       Stages,
71 |                                       true>;
72 | 
73 |         static constexpr int TILE_C_M = TILE_C_M_ == -1 ? CTA_M : TILE_C_M_;
74 |         static constexpr int TILE_C_N = TILE_C_N_ == -1 ? CTA_N : TILE_C_N_;
75 | 
76 |         using Epilogue = gemm::Epilogue_<Tc,
77 |                                          CTA_M,
78 |                                          CTA_N,
79 |                                          TILE_C_M,
80 |                                          TILE_C_N,
81 |                                          MMA::kThreadCount,
82 |                                          Rearrange<MMA>,
83 |                                          Operand_C<float, order_c>,
84 |                                          SplitK>;
85 | 
86 |         using Kernel = GemmUniversal<Sm75, Mainloop, Epilogue, CtaMap>;
87 |     };
88 | };
89 | 
90 | }  // namespace simt
91 | 
92 | }  // namespace turbomind::gemm
93 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/params.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #include "src/turbomind/kernels/gemm/tuner/params.h"
  4 | #include "src/turbomind/utils/parser.h"
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <regex>
  8 | 
  9 | namespace turbomind::gemm {
 10 | 
 11 | void ParseTuningParams(TuningParams& params, const std::string& str)
 12 | {
 13 |     const auto list = ParseArgsList(str);
 14 | 
 15 |     auto try_parse = [&](auto& value, auto name) {
 16 |         auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == name; });
 17 |         if (it != list.end()) {
 18 |             std::cout << name << " " << it->second << "\n";
 19 |             Parse(value, it->second);
 20 |         }
 21 |     };
 22 | 
 23 |     try_parse(params.max_splits, "max_splits");
 24 |     try_parse(params.max_waves, "max_waves");
 25 |     try_parse(params.swizzle, "swizzle");
 26 |     try_parse(params.top_k, "top_k");
 27 |     try_parse(params.clusters, "clusters");
 28 |     try_parse(params.min_iter, "min_iter");
 29 |     try_parse(params.max_iter, "max_iter");
 30 |     try_parse(params.max_time, "max_time");
 31 | 
 32 |     if (auto it = std::find_if(list.begin(), list.end(), [&](auto a) { return a.first == "seq"; }); it != list.end()) {
 33 |         params.seq = ParseTuningSequence(it->second);
 34 |     }
 35 | }
 36 | 
 37 | std::vector<int> ParseTuningSequence(const std::string& str)
 38 | {
 39 |     const std::regex triplet(R"((\d+)-(\d+)-(\d+))");
 40 | 
 41 |     std::vector<std::array<int, 3>> generators;
 42 | 
 43 |     const auto tokens = ParseListOrTuple(str);
 44 | 
 45 |     for (const auto& token : tokens) {
 46 |         std::smatch match;
 47 |         if (std::regex_match(token, match, triplet)) {
 48 |             generators.push_back({std::stoi(match[1].str()),  //
 49 |                                   std::stoi(match[2].str()),
 50 |                                   std::stoi(match[3].str())});
 51 |         }
 52 |         else {  // must be an integer string
 53 |             generators.push_back({std::stoi(token), 0, 0});
 54 |         }
 55 |     }
 56 | 
 57 |     if (generators.size() == 1) {  // Replace sentinel of the default generators
 58 |         auto fallback   = GetDefaultTuningGenerators();
 59 |         fallback.back() = {generators.front().front(), 0, 0};
 60 |         generators      = std::move(fallback);
 61 |     }
 62 | 
 63 |     return GenerateTuningSequence(generators);
 64 | }
 65 | 
 66 | std::vector<int> GenerateTuningSequence(const std::vector<std::array<int, 3>>& generators)
 67 | {
 68 |     std::vector<int> ret;
 69 |     if (generators.empty()) {
 70 |         return ret;
 71 |     }
 72 |     const int last = generators.back().front();
 73 |     // The last generator is a sentinel `(max_bs, 0, 0)`
 74 |     for (int i = 0; i < (int)generators.size() - 1; ++i) {
 75 |         auto [curr, next, step] = generators[i];
 76 |         if (curr >= last) {
 77 |             break;
 78 |         }
 79 |         if (next == 0 && step == 0) {  // single value
 80 |             ret.push_back(curr);
 81 |         }
 82 |         else {  // generator
 83 |             const int end = std::min(generators[i + 1][0], last);
 84 |             while (curr < end) {
 85 |                 ret.push_back(curr);
 86 |                 if (curr == next) {
 87 |                     step *= 2;
 88 |                     next *= 2;
 89 |                 }
 90 |                 curr += step;
 91 |             }
 92 |         }
 93 |     }
 94 |     ret.push_back(last);
 95 |     return ret;
 96 | }
 97 | 
 98 | std::vector<std::array<int, 3>> GetDefaultTuningGenerators()
 99 | {
100 |     /// TODO: set generators based on device
101 |     return {{8, 16, 8}, {16, 64, 16}, {8192}};
102 | }
103 | 
104 | }  // namespace turbomind::gemm
105 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/smem_copy_sm70.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/common.h"
  6 | #include "src/turbomind/kernels/gemm/smem_copy.h"
  7 | 
  8 | namespace turbomind::gemm {
  9 | 
 10 | template<class T>
 11 | struct SmemCopy_MMA_884_A {
 12 |     // static constexpr int M = 16;
 13 |     // static constexpr int K = 8;
 14 |     static constexpr int M = 8;
 15 |     static constexpr int K = 8;
 16 | 
 17 |     static constexpr int kFragNum = 1;
 18 | 
 19 |     using Frag = Array<T, K>;
 20 | 
 21 |     __device__ static int2 unique(int thread_idx, int pack_idx)
 22 |     {
 23 |         const int lane_id = thread_idx % WARP_SIZE;
 24 |         //                   4                3               01
 25 |         // const int m = lane_id / 16 * 4 + (lane_id & 8) + lane_id % 4;
 26 |         // return {pack_idx * M + m, (lane_id & 4) >> 2};
 27 | 
 28 |         //                   4                01
 29 |         const int m = lane_id / 16 * 4 + lane_id % 4;
 30 |         return {pack_idx * M + m, (lane_id & 12) >> 2};
 31 |     }
 32 | 
 33 |     __device__ static int2 get_offset(int thread_idx)
 34 |     {
 35 |         return int2{unique(thread_idx, 0).x, 0};
 36 |     }
 37 | 
 38 |     template<class S, class D>
 39 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool)
 40 |     {
 41 |         Lds(*(Frag*)dst_ptr, src_ptr);
 42 |     }
 43 | };
 44 | 
 45 | template<class T>
 46 | struct SmemCopy_MMA_884_B {
 47 |     // static constexpr int M = 16;
 48 |     // static constexpr int K = 8;
 49 |     static constexpr int M = 32;
 50 |     static constexpr int K = 8;
 51 | 
 52 |     static constexpr int kFragNum = 1;
 53 | 
 54 |     using Frag = Array<T, K>;
 55 | 
 56 |     __device__ static int2 unique(int thread_idx, int pack_idx)
 57 |     {
 58 |         const int lane_id = thread_idx % WARP_SIZE;
 59 |         //                4                     2                 01
 60 |         // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4;
 61 |         // return {pack_idx * M + m, (lane_id & 8) >> 3};
 62 | 
 63 |         //                  4                  23                  01
 64 |         const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4;
 65 |         return {pack_idx * M + m, 0};
 66 |     }
 67 | 
 68 |     __device__ static int2 get_offset(int thread_idx)
 69 |     {
 70 |         return int2{unique(thread_idx, 0).x, 0};
 71 |     }
 72 | 
 73 |     template<class S, class D>
 74 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool)
 75 |     {
 76 |         Lds(*(Frag*)dst_ptr, src_ptr);
 77 |     }
 78 | };
 79 | 
 80 | template<class T, int K_>
 81 | struct SmemCopy_MMA_884_V {
 82 |     // static constexpr int M = 16;
 83 |     static constexpr int M = 32;
 84 |     static constexpr int K = K_;
 85 | 
 86 |     static constexpr int kFragNum = 1;
 87 | 
 88 |     using Frag = Array<T, 1>;
 89 | 
 90 |     __device__ static int2 unique(int thread_idx, int pack_idx)
 91 |     {
 92 |         const int lane_id = thread_idx % WARP_SIZE;
 93 |         //                4                     2                 01
 94 |         // const int m = lane_id / 16 * 4 + (lane_id & 4) * 2 + lane_id % 4;
 95 |         // return {pack_idx * 16 + m, (lane_id & 8) >> 3};
 96 | 
 97 |         const int m = lane_id / 16 * 4 + (lane_id & 12) * 2 + lane_id % 4;
 98 |         return {pack_idx * M + m, 0};
 99 |     }
100 | 
101 |     __device__ static int2 get_offset(int thread_idx)
102 |     {
103 |         return int2{unique(thread_idx, 0).x, 0};
104 |     }
105 | 
106 |     template<class S, class D>
107 |     __device__ static void copy(S&& src_ptr, D&& dst_ptr, bool)
108 |     {
109 |         Lds(*(Frag*)dst_ptr, src_ptr);
110 |     }
111 | };
112 | 
113 | }  // namespace turbomind::gemm
114 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/reference.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #include "src/turbomind/kernels/gemm/test/reference.h"
  4 | #include <cstdio>
  5 | 
  6 | namespace turbomind::gemm {
  7 | 
  8 | #define CHECK(cond)                                                                                                    \
  9 |     do {                                                                                                               \
 10 |         if (!(cond)) {                                                                                                 \
 11 |             fprintf(stderr, "*** Check failed: (%s) @ %s:%d\n", #cond, __FILE__, __LINE__);                            \
 12 |             std::abort();                                                                                              \
 13 |         }                                                                                                              \
 14 |     } while (0)
 15 | 
 16 | namespace {
 17 | 
 18 | MatrixLayout transpose(MatrixLayout x)
 19 | {
 20 |     std::swap(x.rows, x.cols);
 21 |     x.order = x.order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor;
 22 |     return x;
 23 | }
 24 | 
 25 | cudaDataType to_cuda_dtype(DataType dtype)
 26 | {
 27 |     switch (dtype) {
 28 |         case DataType::F16:
 29 |             return CUDA_R_16F;
 30 |         case DataType::BF16:
 31 |             return CUDA_R_16BF;
 32 |         default:
 33 |             CHECK("unsupported data type" && 0);
 34 |     }
 35 |     return {};
 36 | }
 37 | 
 38 | }  // namespace
 39 | 
 40 | Reference::Reference()
 41 | {
 42 |     cublasCreate(&handle_);
 43 | }
 44 | 
 45 | Reference::~Reference()
 46 | {
 47 |     if (handle_) {
 48 |         cublasDestroy(handle_);
 49 |         handle_ = {};
 50 |     }
 51 | }
 52 | 
 53 | void Reference::set_stream(cudaStream_t stream)
 54 | {
 55 |     cublasSetStream(handle_, stream);
 56 | }
 57 | 
 58 | void Reference::gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc)
 59 | {
 60 | 
 61 |     // Transpose the problem for C to be column major
 62 |     if (Cdesc.order == Order::kRowMajor) {
 63 |         std::swap(A, B);
 64 |         std::swap(Adesc, Bdesc);
 65 |         Adesc = transpose(Adesc);
 66 |         Bdesc = transpose(Bdesc);
 67 |         Cdesc = transpose(Cdesc);
 68 |         // (n, k) (k, m)
 69 |     }
 70 | 
 71 |     CHECK(Adesc.cols == Bdesc.rows);
 72 | 
 73 |     // (m, k) (k, n)
 74 |     int m = Cdesc.rows;
 75 |     int n = Cdesc.cols;
 76 |     int k = Adesc.cols;
 77 |     CHECK(Adesc.rows == m);
 78 |     CHECK(Bdesc.cols == n);
 79 |     CHECK(Bdesc.rows == k);
 80 | 
 81 |     float alpha = 1.f;
 82 |     float beta  = 0.f;
 83 | 
 84 |     auto to_cublas_op = [](Order o) { return o == Order::kColMajor ? CUBLAS_OP_N : CUBLAS_OP_T; };
 85 | 
 86 |     auto status = cublasGemmEx(handle_,
 87 |                                to_cublas_op(Adesc.order),
 88 |                                to_cublas_op(Bdesc.order),
 89 |                                m,
 90 |                                n,
 91 |                                k,
 92 |                                &alpha,
 93 |                                A,
 94 |                                to_cuda_dtype(Adesc.type),
 95 |                                Adesc.ld,
 96 |                                B,
 97 |                                to_cuda_dtype(Bdesc.type),
 98 |                                Bdesc.ld,
 99 |                                &beta,
100 |                                C,
101 |                                to_cuda_dtype(Cdesc.type),
102 |                                Cdesc.ld,
103 |                                CUBLAS_COMPUTE_32F,
104 |                                CUBLAS_GEMM_DEFAULT_TENSOR_OP);
105 | 
106 |     CHECK(status == CUBLAS_STATUS_SUCCESS);
107 | }
108 | 
109 | }  // namespace turbomind::gemm
110 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/array.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/common.h"
  6 | #include "src/turbomind/kernels/core/data_type.h"
  7 | #include "src/turbomind/kernels/core/sub_byte_ptr.h"
  8 | 
  9 | namespace turbomind {
 10 | 
 11 | template<typename T, int N>
 12 | struct Array {
 13 |     using value_type      = T;
 14 |     using size_type       = int;
 15 |     using difference_type = int;
 16 |     using reference       = value_type&;
 17 |     using const_reference = const value_type&;
 18 |     using pointer         = value_type*;
 19 |     using const_pointer   = const value_type*;
 20 |     using iterator        = pointer;
 21 |     using const_iterator  = const_pointer;
 22 | 
 23 |     static_assert(N > 0);
 24 | 
 25 |     T __a[N];
 26 | 
 27 |     TM_HOST_DEVICE constexpr reference operator[](size_type i) noexcept
 28 |     {
 29 |         return __a[i];
 30 |     }
 31 | 
 32 |     TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept
 33 |     {
 34 |         return __a[i];
 35 |     }
 36 | 
 37 |     TM_HOST_DEVICE constexpr reference front() noexcept
 38 |     {
 39 |         return *begin();
 40 |     }
 41 | 
 42 |     TM_HOST_DEVICE constexpr const_reference front() const noexcept
 43 |     {
 44 |         return *begin();
 45 |     }
 46 | 
 47 |     TM_HOST_DEVICE constexpr reference back() noexcept
 48 |     {
 49 |         return *(end() - 1);
 50 |     }
 51 | 
 52 |     TM_HOST_DEVICE constexpr const_reference back() const noexcept
 53 |     {
 54 |         return *(end() - 1);
 55 |     }
 56 | 
 57 |     TM_HOST_DEVICE constexpr pointer data() noexcept
 58 |     {
 59 |         return &__a[0];
 60 |     }
 61 | 
 62 |     TM_HOST_DEVICE constexpr const_pointer data() const noexcept
 63 |     {
 64 |         return &__a[0];
 65 |     }
 66 | 
 67 |     TM_HOST_DEVICE constexpr iterator begin() noexcept
 68 |     {
 69 |         return data();
 70 |     }
 71 | 
 72 |     TM_HOST_DEVICE constexpr const_iterator begin() const noexcept
 73 |     {
 74 |         return data();
 75 |     }
 76 | 
 77 |     TM_HOST_DEVICE constexpr iterator end() noexcept
 78 |     {
 79 |         return data() + N;
 80 |     }
 81 | 
 82 |     TM_HOST_DEVICE constexpr const_iterator end() const noexcept
 83 |     {
 84 |         return data() + N;
 85 |     }
 86 | 
 87 |     TM_HOST_DEVICE static constexpr std::integral_constant<int, N> size() noexcept
 88 |     {
 89 |         return {};
 90 |     }
 91 | 
 92 |     TM_HOST_DEVICE static constexpr std::false_type empty() noexcept
 93 |     {
 94 |         return {};
 95 |     }
 96 | };
 97 | 
 98 | template<int N>
 99 | struct Array<uint4_t, N> {
100 |     using value_type      = detail::__uint4_t;
101 |     using size_type       = int;
102 |     using difference_type = int;
103 |     using reference       = value_type&;
104 |     using const_reference = const value_type&;
105 |     using pointer         = SubBytePtr<uint4_t>;
106 |     using const_pointer   = SubBytePtr<const uint4_t>;
107 | 
108 |     // static_assert(N % 8 == 0);
109 | 
110 |     detail::__uint4_t __a[N / 8];
111 | 
112 |     TM_HOST_DEVICE constexpr reference operator[](size_type i) noexcept
113 |     {
114 |         return __a[i / 8];
115 |     }
116 | 
117 |     TM_HOST_DEVICE constexpr const_reference operator[](size_type i) const noexcept
118 |     {
119 |         return __a[i / 8];
120 |     }
121 | 
122 |     TM_HOST_DEVICE static constexpr std::integral_constant<int, N> size() noexcept
123 |     {
124 |         return {};
125 |     }
126 | 
127 |     TM_HOST_DEVICE static constexpr std::false_type empty() noexcept
128 |     {
129 |         return {};
130 |     }
131 | 
132 |     TM_HOST_DEVICE constexpr pointer data() noexcept
133 |     {
134 |         return {(char*)&__a[0]};
135 |     }
136 | };
137 | 
138 | static_assert(sizeof(Array<uint4_t, 8>) == 4);
139 | static_assert(sizeof(Array<uint4_t, 16>) == 8);
140 | static_assert(sizeof(Array<uint4_t, 24>) == 12);
141 | static_assert(sizeof(Array<uint4_t, 32>) == 16);
142 | 
143 | }  // namespace turbomind
144 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
  1 | name: publish to pypi
  2 | 
  3 | on:
  4 |   push:
  5 |     branches:
  6 |       - main
  7 |     paths:
  8 |       - "turbomind/version.py"
  9 |   workflow_dispatch:
 10 | 
 11 | 
 12 | jobs:
 13 |   linux-build:
 14 |     strategy:
 15 |       matrix:
 16 |         pyver: [py38, py39, py310, py311, py312]
 17 |     runs-on: ubuntu-latest
 18 |     env:
 19 |       PYTHON_VERSION: ${{ matrix.pyver }}
 20 |       PLAT_NAME: manylinux2014_x86_64
 21 |       DOCKER_TAG: cuda12.1
 22 |       OUTPUT_FOLDER: cuda12.1_dist
 23 |     steps:
 24 |       - name: Free disk space
 25 |         uses: jlumbroso/free-disk-space@main
 26 |         with:
 27 |           # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
 28 |           tool-cache: false
 29 |           docker-images: false
 30 |           # All of these default to true, but feel free to set to "false" if necessary for your workflow
 31 |           android: true
 32 |           dotnet: true
 33 |           haskell: true
 34 |           large-packages: true
 35 |           swap-storage: false
 36 |       - name: Checkout repository
 37 |         uses: actions/checkout@v3
 38 |       - name: Build
 39 |         run: |
 40 |           echo ${PYTHON_VERSION}
 41 |           echo ${PLAT_NAME}
 42 |           echo ${DOCKER_TAG}
 43 |           echo ${OUTPUT_FOLDER}
 44 |           # remove -it
 45 |           sed -i 's/docker run --rm -it/docker run --rm/g' builder/manylinux/build_wheel.sh
 46 |           bash builder/manylinux/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
 47 |       - name: Upload Artifacts
 48 |         uses: actions/upload-artifact@v4
 49 |         with:
 50 |           if-no-files-found: error
 51 |           path: builder/manylinux/${{ env.OUTPUT_FOLDER }}/*
 52 |           retention-days: 1
 53 |           name: linux-${{ matrix.pyver }}
 54 | 
 55 |   windows-build:
 56 |     strategy:
 57 |       matrix:
 58 |         pyver: ['3.8', '3.9', '3.10', '3.11', '3.12']
 59 |     runs-on: windows-latest
 60 |     steps:
 61 |       - name: Checkout repository
 62 |         uses: actions/checkout@v3
 63 |       - name: Set up python
 64 |         uses: actions/setup-python@v4
 65 |         with:
 66 |           python-version: ${{ matrix.pyver }}
 67 |       - name: Install python packages
 68 |         run: |
 69 |           pip install -r requirements/build.txt
 70 |           pip install wheel
 71 |       - name: Setup CUDA Toolkit
 72 |         id: cuda-toolkit
 73 |         shell: pwsh
 74 |         run: ./builder/windows/setup_cuda.ps1
 75 |         env:
 76 |             INPUT_CUDA_VERSION: '12.1.0'
 77 |       - name: Build wheel
 78 |         run: |
 79 |           mkdir build
 80 |           cd build
 81 |           # https://github.com/pypa/setuptools/issues/1631
 82 |           pip install -U setuptools
 83 |           ..\builder\windows\generate.ps1
 84 |           cmake --build . --config Release -- /m /v:q
 85 |           if (-Not $?) {
 86 |             echo "build failed"
 87 |             exit 1
 88 |           }
 89 |           cmake --install . --config Release
 90 |           cd ..
 91 |           rm build -Force -Recurse
 92 |           python setup.py bdist_wheel -d build/wheel
 93 |       - name: Upload Artifacts
 94 |         uses: actions/upload-artifact@v4
 95 |         with:
 96 |           if-no-files-found: error
 97 |           path: build/wheel/*
 98 |           retention-days: 1
 99 |           name: windows-${{ matrix.pyver }}
100 | 
101 |   publish:
102 |     runs-on: ubuntu-latest
103 |     environment: 'prod'
104 |     needs:
105 |       - linux-build
106 |       - windows-build
107 |     steps:
108 |       - name: Download artifacts
109 |         uses: actions/download-artifact@v4
110 |         with:
111 |           path: artifact
112 |           merge-multiple: true
113 |       - name: Display artifacts
114 |         run: ls artifact/ -lh
115 |       - name: Set up python3.8
116 |         uses: actions/setup-python@v4
117 |         with:
118 |           python-version: '3.8'
119 |       - name: Upload to pypi
120 |         run: |
121 |           pip install twine
122 |           twine upload artifact/* -u __token__ -p ${{ secrets.pypi_password }}
123 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/layout.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/data_type.h"
  6 | namespace turbomind {
  7 | 
  8 | template<int Bits, int Base, int Shift>
  9 | struct Swizzle {
 10 | 
 11 |     using bit_mask = std::integral_constant<int, (1 << Bits) - 1>;
 12 |     using yyy_mask = std::integral_constant<int, bit_mask{} << (Base + Shift)>;
 13 |     using shift    = std::integral_constant<int, Shift>;
 14 | 
 15 |     template<class Offset>
 16 |     __host__ __device__ constexpr static auto apply(Offset offset)
 17 |     {
 18 |         return offset ^ ((offset & yyy_mask{}) >> shift{});
 19 |     }
 20 | 
 21 |     template<class Offset>
 22 |     __host__ __device__ constexpr auto operator()(Offset offset)
 23 |     {
 24 |         return apply(offset);
 25 |     }
 26 | };
 27 | 
 28 | struct Identity {
 29 | 
 30 |     template<class Offset>
 31 |     __device__ constexpr static auto apply(Offset offset)
 32 |     {
 33 |         return offset;
 34 |     }
 35 | 
 36 |     template<class Offset>
 37 |     __device__ Offset operator()(Offset offset)
 38 |     {
 39 |         return apply(offset);
 40 |     }
 41 | 
 42 |     template<int D>
 43 |     __device__ int AdvanceS(int offset, int s0, int s1)
 44 |     {
 45 |         return offset;
 46 |     }
 47 | };
 48 | 
 49 | template<int S_, int C_, int S0_ = -1, int C0_ = -1, class Swizzle_ = Identity>
 50 | struct SmemLayoutV2 {
 51 | 
 52 |     // (C0,S0),(   C1,       S1)
 53 |     // ( 1,C0),(C0*S0, C0*S0*C1)
 54 | 
 55 |     static constexpr int S = S_;
 56 |     static constexpr int C = C_;
 57 | 
 58 |     static constexpr int S0 = S0_ < 0 ? S : S0_;
 59 |     static constexpr int C0 = C0_ < 0 ? C : C0_;
 60 | 
 61 |     static_assert(S % S0 == 0);
 62 |     static_assert(C % C0 == 0);
 63 | 
 64 |     static constexpr int S1 = S / S0;
 65 |     static constexpr int C1 = C / C0;
 66 | 
 67 |     static constexpr int kSize = S * C;
 68 | 
 69 |     static constexpr int kSize0 = S0 * C0;
 70 |     static constexpr int kSize1 = S1 * C1;
 71 | 
 72 |     using Swizzle = Swizzle_;
 73 | 
 74 |     static constexpr int kIsTrivial = S == S0 && C == C0 && std::is_same_v<Swizzle, Identity>;
 75 | 
 76 |     __forceinline__ __device__ static int apply(int s, int c, int offset = 0)
 77 |     {
 78 |         int s1 = s / S0;
 79 |         int s0 = s % S0;
 80 |         int c1 = c / C0;
 81 |         int c0 = c % C0;
 82 |         //            variable             | uniform |         constant
 83 |         // return Swizzle::apply(s0 * C0 + c0) + offset + (s1 * C1 + c1) * kSize0;
 84 | 
 85 |         // return offset + Swizzle::apply(s0 * C0 + c0) + (s1 * C1 + c1) * kSize0;
 86 | 
 87 |         return Swizzle::apply(s0 * C0 + c0) + (s1 * C1 + c1) * kSize0 + offset;
 88 |     }
 89 | 
 90 |     __forceinline__ __device__ int operator()(int s, int c, int offset = 0)
 91 |     {
 92 |         return apply(s, c, offset);
 93 |     }
 94 | };
 95 | 
 96 | struct Offset {
 97 |     __device__ explicit Offset(int value): value_{value} {};
 98 |     __device__ int& operator()()
 99 |     {
100 |         return value_;
101 |     }
102 |     __device__ const int& operator()() const
103 |     {
104 |         return value_;
105 |     }
106 |     int value_;
107 | };
108 | 
109 | template<class T, class Layout>
110 | struct SmemAccessor {
111 |     using Pointer = get_pointer_type<T>;
112 |     Pointer ptr_;
113 |     Layout  layout_;
114 | 
115 |     __device__ SmemAccessor(Pointer ptr): ptr_{ptr} {}
116 | 
117 |     __device__ T& operator()(int s, int c)
118 |     {
119 |         return ptr_[layout_(s, c)];
120 |     }
121 | 
122 |     __device__ T& operator()(int s, int c, int offset)
123 |     {
124 |         return ptr_[layout_(s, c, offset)];
125 |     }
126 | 
127 |     __device__ T& operator()(int idx)
128 |     {
129 |         return ptr_[idx];
130 |     }
131 | };
132 | 
133 | template<class T0, class T1>
134 | struct Stride {
135 |     T0 v0;
136 |     T1 v1;
137 | 
138 |     // CTAD
139 |     __host__ __device__ Stride(T0 v0, T1 v1): v0{v0}, v1{v1} {}
140 | 
141 |     template<class I0, class I1>
142 |     __host__ __device__ constexpr auto operator()(I0 i0, I1 i1) const
143 |     {
144 |         return v0 * i0 + v1 * i1;
145 |     }
146 | };
147 | 
148 | }  // namespace turbomind
149 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda11.8-whl-release.yml:
--------------------------------------------------------------------------------
  1 | name: cuda11.8-whl-release
  2 | 
  3 | on:
  4 |   push:
  5 |     tags:
  6 |       - '*'
  7 |   workflow_dispatch:
  8 | 
  9 | permissions:
 10 |   contents: write
 11 | 
 12 | jobs:
 13 |   linux-build:
 14 |     strategy:
 15 |       matrix:
 16 |         pyver: [py38, py39, py310, py311, py312]
 17 |     runs-on: ubuntu-latest
 18 |     env:
 19 |       PYTHON_VERSION: ${{ matrix.pyver }}
 20 |       PLAT_NAME: manylinux2014_x86_64
 21 |       DOCKER_TAG: cuda11.8
 22 |       OUTPUT_FOLDER: cuda11.8_dist
 23 |       CUDA_VER: 11.8
 24 |     steps:
 25 |       - name: Free disk space
 26 |         uses: jlumbroso/free-disk-space@main
 27 |         with:
 28 |           # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
 29 |           tool-cache: false
 30 |           docker-images: false
 31 |           # All of these default to true, but feel free to set to "false" if necessary for your workflow
 32 |           android: true
 33 |           dotnet: true
 34 |           haskell: true
 35 |           large-packages: true
 36 |           swap-storage: false
 37 |       - name: Checkout repository
 38 |         uses: actions/checkout@v3
 39 |       - name: Build
 40 |         run: |
 41 |           echo ${PYTHON_VERSION}
 42 |           echo ${PLAT_NAME}
 43 |           echo ${DOCKER_TAG}
 44 |           echo ${OUTPUT_FOLDER}
 45 |           # remove -it
 46 |           sed -i 's/docker run --rm -it/docker run --rm/g' builder/manylinux/build_wheel.sh
 47 |           bash builder/manylinux/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
 48 |       - name: Upload Artifacts
 49 |         uses: actions/upload-artifact@v4
 50 |         with:
 51 |           if-no-files-found: error
 52 |           path: builder/manylinux/${{ env.OUTPUT_FOLDER }}/*
 53 |           retention-days: 1
 54 |           name: linux-${{ matrix.pyver }}
 55 | 
 56 |   windows-build:
 57 |     strategy:
 58 |       matrix:
 59 |         pyver: ['3.8', '3.9', '3.10', '3.11', '3.12']
 60 |     runs-on: windows-latest
 61 |     steps:
 62 |       - name: Checkout repository
 63 |         uses: actions/checkout@v3
 64 |       - name: Set up python
 65 |         uses: actions/setup-python@v4
 66 |         with:
 67 |           python-version: ${{ matrix.pyver }}
 68 |       - name: Install python packages
 69 |         run: |
 70 |           pip install pybind11 wheel
 71 |       - name: Setup CUDA Toolkit
 72 |         id: cuda-toolkit
 73 |         shell: pwsh
 74 |         run: ./builder/windows/setup_cuda.ps1
 75 |         env:
 76 |             INPUT_CUDA_VERSION: '11.8.0'
 77 |       - name: Build wheel
 78 |         run: |
 79 |           mkdir build
 80 |           cd build
 81 |           pip install -U setuptools
 82 |           ..\builder\windows\generate.ps1
 83 |           cmake --build . --config Release -- /m /v:q
 84 |           if (-Not $?) {
 85 |             echo "build failed"
 86 |             exit 1
 87 |           }
 88 |           cmake --install . --config Release
 89 |           cd ..
 90 |           rm build -Force -Recurse
 91 |           python setup.py bdist_wheel -d build/wheel
 92 |       - name: Upload Artifacts
 93 |         uses: actions/upload-artifact@v4
 94 |         with:
 95 |           if-no-files-found: error
 96 |           path: build/wheel/*
 97 |           retention-days: 1
 98 |           name: windows-${{ matrix.pyver }}
 99 | 
100 |   publish:
101 |     runs-on: ubuntu-latest
102 |     environment: 'prod'
103 |     needs:
104 |       - linux-build
105 |       - windows-build
106 |     steps:
107 |       - name: Checkout repository
108 |         uses: actions/checkout@v3
109 |       - name: Download artifacts
110 |         uses: actions/download-artifact@v4
111 |         with:
112 |           path: artifact
113 |           merge-multiple: true
114 |       - name: Add cuda version to package name
115 |         run: |
116 |           ver=$(cat turbomind/version.py | grep '__version__ =' | cut -d\' -f2)
117 |           cuver=$ver+cu118
118 |           ls -lh
119 |           cd artifact
120 |           for file in *; do
121 |             mv "$file" "`echo $file | sed "s/$ver/$cuver/g"`";
122 |           done
123 |       - name: Display artifacts
124 |         run: ls artifact/ -lh
125 |       - name: Publish
126 |         uses: softprops/action-gh-release@v1
127 |         if: startsWith(github.ref, 'refs/tags/')
128 |         with:
129 |           files: artifact/*
130 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm70_s884.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch/config_sm70_s884.h"
 4 | #include "src/turbomind/kernels/gemm/operand.h"
 5 | #include "src/turbomind/kernels/gemm/registry.h"
 6 | #include "src/turbomind/kernels/gemm/transform.h"
 7 | #include "src/turbomind/kernels/gemm/types.h"
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | void Registry::f16_u4g128_f16_tnt_sm70_s884()
12 | {
13 |     using namespace sm70_s884;
14 |     {  // quant B
15 |         using Config = Sm70_s884<typename GetOperand<HMMA_884, OPERAND_A, half, kRowMajor, false>::Operand,
16 |                                  Transform_Default,
17 |                                  VoidOperand,
18 |                                  typename GetOperand<HMMA_884, OPERAND_B, uint4_t, kRowMajor, true>::Operand,
19 |                                  Transform_HMMA_SIMT_B,
20 |                                  typename GetOperand<HMMA_884, OPERAND_V, uint32_t, kColMajor, true>::Operand,
21 |                                  kRowMajor,
22 |                                  half>;
23 | 
24 |         using namespace cache_policy;
25 | 
26 |         // m8n32k8: pack_bv=1
27 |         // (8,226.234),(16,192.248),(32,120.564),(64,103.483),(96,98.209),(128,54.537),(192,13.739)
28 |         // (256,-6.61),(4096,-16.622),(8192,-16.021)
29 |         Add<Config::Type<128, 256, 16, 2, 4, 1, Default, Default, 2, true, 1, 128, 128, 128>>();  // 50.631
30 |         Add<Config::Type<128, 128, 16, 2, 2, 1, Default, Default, 2, true, 1, 128, 64, 128>>();
31 |         Add<Config::Type<128, 128, 16, 2, 2, 1, Default, Stream, 2, true, 1, 128, 64, 128>>();  // 50.698
32 |         Add<Config::Type<96, 128, 32, 2, 2, 1, Default, Stream, 2, true, 1, 128, 48, 128>>();   // 93.395
33 |         Add<Config::Type<64, 128, 32, 2, 2, 1, Default, Default, 2, true, 1, 128, 32, 128>>();
34 |         Add<Config::Type<64, 128, 32, 2, 2, 1, Default, Stream, 2, true, 1, 128, 32, 128>>();  // 93.482
35 |         Add<Config::Type<64, 128, 16, 1, 4, 1, Default, Stream, 2, true, 1, 128, 32, 128>>();  // 82.113
36 |         Add<Config::Type<64, 256, 16, 1, 4, 1, Default, Stream, 2, true, 1, 128, 64, 128>>();  // 80.686
37 |         Add<Config::Type<32, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();           // 92.014
38 |         Add<Config::Type<32, 256, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128, 32, 128>>();  // 110.979
39 |         Add<Config::Type<16, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();           // 147.616
40 |         Add<Config::Type<16, 256, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();           // 186.569
41 |         Add<Config::Type<8, 128, 64, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();            // 218.194
42 |         Add<Config::Type<8, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();            // 209.224
43 |         Add<Config::Type<8, 256, 64, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();            // 219.651
44 | 
45 |         // m16n16k8: pack_bv=2
46 |         // (8,179.471),(16,174.246),(32,114.659),(64,100.813),(96,96.822),(128,53.423),(192,12.433),(256,-7.601),(4096,-17.335)
47 |         // Add<Config::Type<128, 256, 16, 1, 8, 1, Default, Default, 2, true, 1, 128, 128, 128>>(); // 50.934
48 |         // Add<Config::Type<128, 128, 16, 1, 4, 1, Default, Default, 2, true, 1, 128, 64, 128>>();  // 47.874
49 |         // Add<Config::Type<128, 128, 16, 1, 4, 1, Default, Stream, 2, true, 1, 128, 64, 128>>();  // 47.874
50 |         // Add<Config::Type<96, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>(); // 95.303
51 |         // Add<Config::Type<64, 128, 32, 1, 4, 1, Default, Default, 2, true, 1, 128>>();
52 |         // Add<Config::Type<64, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();  // 97.095
53 |         // Add<Config::Type<64, 128, 16, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();  // 86.559
54 |         // Add<Config::Type<64, 256, 16, 1, 4, 1, Default, Stream, 2, true, 1, 128, 64, 128>>(); // 73.869
55 |         // Add<Config::Type<32, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();  // 115.205
56 |         // Add<Config::Type<32, 256, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128, 32, 128>>();  // 96.151
57 |         // Add<Config::Type<16, 128, 64, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();  // 175.285
58 |         // Add<Config::Type<16, 128, 32, 1, 4, 1, Default, Stream, 2, true, 1, 128>>();
59 |     }
60 | }
61 | 
62 | }  // namespace turbomind::gemm
63 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/thread_group_map.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/common.h"
  6 | #include "src/turbomind/kernels/core/math.h"
  7 | #include "src/turbomind/kernels/core/meta.h"
  8 | #include "src/turbomind/kernels/gemm/thread_map.h"
  9 | 
 10 | #include <iostream>
 11 | 
 12 | namespace turbomind::gemm {
 13 | 
 14 | template<int M_, int N_, int K_, int TM, int TN, int TK, int GM, int GN, int GK>
 15 | struct RakedThreadGroupMap {
 16 |     static constexpr int M = M_;
 17 |     static constexpr int N = N_;
 18 |     static constexpr int K = K_;
 19 | 
 20 |     static constexpr int TileM = TM;
 21 |     static constexpr int TileN = TN;
 22 |     static constexpr int TileK = TK;
 23 | 
 24 |     static constexpr int kGroupM = GM;
 25 |     static constexpr int kGroupN = GN;
 26 |     static constexpr int kGroupK = GK;
 27 | 
 28 |     static constexpr int kGroupCount = GM * GN * GK;
 29 | 
 30 |     static constexpr int M1 = GM * TM;
 31 |     static constexpr int N1 = GN * TN;
 32 |     static constexpr int K1 = GK * TK;
 33 | 
 34 |     static constexpr int kIterM = M / M1;
 35 |     static constexpr int kIterN = N / N1;
 36 |     static constexpr int kIterK = K / K1;
 37 | 
 38 |     static constexpr int kFootprintM = kIterM * TM;
 39 |     static constexpr int kFootprintN = kIterN * TN;
 40 |     static constexpr int kFootprintK = kIterK * TK;
 41 | 
 42 |     static constexpr int kDeltaM = TM;
 43 |     static constexpr int kDeltaN = TN;
 44 |     static constexpr int kDeltaK = TK;
 45 | 
 46 |     __device__ static int3 get_offset(int group_id)
 47 |     {
 48 |         const int m = group_id % GM;
 49 |         const int n = group_id / GM % GN;
 50 |         const int k = group_id / GM / GN;
 51 |         return {m * kFootprintM, n * kFootprintN, k * kFootprintK};
 52 |     }
 53 | };
 54 | 
 55 | template<int M_, int N_, int K_, int tM_, int tN_, int tK_, class ArrangementMN, int gK, bool rK = 0>
 56 | struct MMA_Map {
 57 |     static constexpr int M = M_;
 58 |     static constexpr int N = N_;
 59 |     static constexpr int K = K_;
 60 | 
 61 |     static constexpr int TileM = tM_;
 62 |     static constexpr int TileN = tN_;
 63 |     static constexpr int TileK = tK_;
 64 | 
 65 |     static constexpr int kGroupM = ArrangementMN::gM;
 66 |     static constexpr int kGroupN = ArrangementMN::gN;
 67 |     static constexpr int kGroupK = gK;
 68 | 
 69 |     static constexpr int kGroupCount = kGroupM * kGroupN * kGroupK;
 70 | 
 71 |     static constexpr int kIterM = M / tM_ / kGroupM;
 72 |     static constexpr int kIterN = N / tN_ / kGroupN;
 73 |     static constexpr int kIterK = K / tK_ / kGroupK;
 74 | 
 75 |     static constexpr int kFootprintM = kIterM * tM_;
 76 |     static constexpr int kFootprintN = kIterN * tN_;
 77 |     static constexpr int kFootprintK = kIterK * tK_;
 78 | 
 79 |     static constexpr int kDeltaM = tM_ * ArrangementMN::dM;
 80 |     static constexpr int kDeltaN = tN_ * ArrangementMN::dN;
 81 |     static constexpr int kDeltaK = tK_ * (rK ? gK : 1);
 82 | 
 83 |     static constexpr auto kPartitionM = ArrangementMN::pM;
 84 |     static constexpr auto kPartitionN = ArrangementMN::pN;
 85 |     static constexpr auto kPartitionK = rK ? Partition::kRaked : Partition::kBlocked;
 86 | 
 87 |     __device__ static int3 get_offset(int group_id)
 88 |     {
 89 |         constexpr int kGroupMN = kGroupM * kGroupN;
 90 | 
 91 |         const auto mn = ArrangementMN::get_offset(group_id % kGroupMN, pair<M / TileM, N / TileN>{});
 92 |         const int  k  = group_id / kGroupMN;
 93 | 
 94 |         return {mn.x * tM_, mn.y * tN_, k * tK_ * (rK ? 1 : kIterK)};
 95 |     }
 96 | };
 97 | 
 98 | namespace {
 99 | 
100 | template<class TMap>
101 | void Print_(TMap)
102 | {
103 |     std::cout << "M, N, K = " << TMap::M << " " << TMap::N << " " << TMap::K << "\n";
104 |     std::cout << "TM, TN, TK = " << TMap::TileM << " " << TMap::TileN << " " << TMap::TileK << "\n";
105 |     std::cout << "group count = " << TMap::kGroupCount << "\n";
106 |     // std::cout << "M1, N1, K1 = " << TMap::M1 << " " << TMap::N1 << " " << TMap::K1 << "\n";
107 |     std::cout << "itM, itN, itK = " << TMap::kIterM << " " << TMap::kIterN << " " << TMap::kIterK << "\n";
108 |     std::cout << "fpM, fpN, fpK = " << TMap::kFootprintM << " " << TMap::kFootprintN << " " << TMap::kFootprintK
109 |               << "\n";
110 |     std::cout << "dM, dN, dK = " << TMap::kDeltaM << " " << TMap::kDeltaN << " " << TMap::kDeltaK << "\n";
111 | }
112 | 
113 | }  // namespace
114 | 
115 | /// TODO: Striped partition?
116 | 
117 | }  // namespace turbomind::gemm
118 | 


--------------------------------------------------------------------------------
/example/test_linear.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | 
  4 | import turbomind as tm
  5 | from turbomind.utils import unpack_awq_gemm
  6 | 
  7 | torch.manual_seed(0)
  8 | 
  9 | 
 10 | def i32x8_to_i4x8(w):
 11 |     """merge 8 integers (range from 0 to 15) into one 32-bit integer."""
 12 |     assert w.shape[-1] % 8 == 0
 13 |     shape = (w.shape[0], w.numel() // (w.shape[0] * 8), 8)
 14 |     shape = shape[:-1] + (1, )
 15 |     result = torch.zeros(shape, dtype=w.dtype, device=w.device)
 16 |     mask = torch.tensor([15], dtype=w.dtype, device=w.device)
 17 |     for i in range(8):
 18 |         shift = 4 * (7 - i)
 19 |         result[..., 0] |= (w[..., i] & mask) << shift
 20 |     result = result.view(w.shape[0], -1)
 21 |     return result
 22 | 
 23 | 
 24 | def makeup_weights(in_features: int, out_features: int, group_size: int = 128):
 25 |     # make up qweight
 26 |     assert out_features % 8 == 0
 27 |     qweight = torch.randint(0,
 28 |                             16, (in_features, out_features // 8, 8),
 29 |                             dtype=torch.int32,
 30 |                             device='cuda')
 31 |     print(f'-- makeup qweight: shape {qweight.shape}')
 32 |     print(qweight.view(in_features, -1))
 33 |     qweight = i32x8_to_i4x8(qweight)
 34 |     print(f'-- merge qweight: shape {qweight.shape}')
 35 |     print(qweight)
 36 | 
 37 |     # make up qzeros
 38 |     assert in_features % group_size == 0 and in_features // group_size >= 1
 39 |     qzeros = torch.randint(0,
 40 |                            16,
 41 |                            (in_features // group_size, out_features // 8, 8),
 42 |                            dtype=torch.int32,
 43 |                            device='cuda')
 44 |     print(f'-- makeup qzero: shape {qzeros.shape}')
 45 |     print(qzeros.view(in_features // group_size, -1))
 46 |     qzeros = i32x8_to_i4x8(qzeros)
 47 |     print(f'-- merge qzero: shape {qzeros.shape}\n{qzeros}')
 48 | 
 49 |     # make up scales
 50 |     scales = torch.rand((in_features // group_size, out_features),
 51 |                         dtype=torch.float16,
 52 |                         device='cuda')
 53 |     print(f'-- makeup scales: shape {scales.shape}\n{scales}')
 54 |     return qweight, qzeros, scales
 55 | 
 56 | 
 57 | def dequantize(qweight, qzeros, scales, group_size: int = 128):
 58 |     _qweight = unpack_awq_gemm(qweight)
 59 |     _qzeros = unpack_awq_gemm(qzeros)
 60 |     _qzeros = _qzeros.float()
 61 |     _qweight = _qweight.float()
 62 |     _scales = scales.float()
 63 |     for i in range(qzeros.shape[0]):
 64 |         start = i * group_size
 65 |         end = start + group_size
 66 |         _qweight[start:end] = (_qweight[start:end, :] -
 67 |                                _qzeros[i:i + 1, :]) * _scales[i:i + 1, :]
 68 |     return _qweight.half()
 69 | 
 70 | 
 71 | group_size = 128
 72 | batch_size = 16384
 73 | in_features = 16384
 74 | out_features = 16384
 75 | qweight, qzeros, scales = makeup_weights(in_features, out_features, group_size)
 76 | 
 77 | x = torch.randn((batch_size, in_features),
 78 |                 device=qweight.device,
 79 |                 dtype=torch.float16)
 80 | 
 81 | weight = dequantize(qweight, qzeros, scales, group_size)
 82 | print(f'-- dequantization: weight.shape={weight.shape}, weight: \n{weight}')
 83 | ref_linear = nn.Linear(in_features, out_features, bias=False, device='cuda')
 84 | with torch.no_grad():
 85 |     ref_linear.weight = nn.Parameter(weight.T)
 86 |     ref_res = ref_linear(x)
 87 |     print(f'nn.linear.res: {ref_res}')
 88 | 
 89 | model = tm.Linear(in_features=in_features,
 90 |                   out_features=out_features,
 91 |                   bias=False,
 92 |                   quant_method='awq',
 93 |                   w_bit=4,
 94 |                   group_size=group_size)
 95 | 
 96 | model.qweight = qweight
 97 | model.qzeros = qzeros
 98 | model.scales = scales
 99 | 
100 | model.post_init()
101 | 
102 | stream = torch.cuda.Stream()
103 | with torch.cuda.stream(stream):
104 |     res = model(x)
105 | stream.synchronize()
106 | 
107 | print(f'tm.linear.res: {res}')
108 | abs_diff = torch.abs(res - ref_res).float()
109 | rel_diff = abs_diff / torch.max(torch.abs(ref_res), torch.abs(res))
110 | rtol = 0.01
111 | atol = 0.0001
112 | outliers = abs_diff > atol + rtol * torch.abs(ref_res)
113 | abs_diff = torch.sum(abs_diff) / abs_diff.numel()
114 | rel_diff = torch.sum(rel_diff) / rel_diff.numel()
115 | outliers = torch.sum(outliers) / outliers.shape[0]
116 | print(f'abs_diff {abs_diff:4f}, '
117 |       f'rel_diff {rel_diff:4f}, '
118 |       f'outliers {outliers:4f}')
119 | 
120 | tm.Linear.clear_workspaces()
121 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/transform.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/attention/quantization.h"
  6 | #include "src/turbomind/kernels/core/common.h"
  7 | #include "src/turbomind/kernels/core/meta.h"
  8 | #include "src/turbomind/kernels/gemm/smem_copy.h"
  9 | #include "src/turbomind/kernels/gemm/tiled_mma.h"
 10 | #include <iterator>
 11 | 
 12 | namespace turbomind::gemm {
 13 | 
 14 | struct Transform_Default {
 15 |     template<class T, int Nf, int Mf, int K, int Nd, int Md, class S>
 16 |     __device__ static void apply(Array<T, Nf> (&frag)[K][Mf], int k, Array<T, Nd> (&data)[K][Md], S&, int div)
 17 |     {
 18 |         static_assert(Nf * Mf == Nd * Md);
 19 |         static_assert(Nd % Nf == 0 && Mf % Md == 0);
 20 |         static_assert(sizeof(frag) == sizeof(data));
 21 | 
 22 |         // Alignment must be manually enforced for `reinterpret_cast`
 23 |         auto& frag_k = reinterpret_cast<Array<T, Nd>(&)[Md]>(frag[k]);
 24 |         auto& data_k = data[k];
 25 | 
 26 |         PRAGMA_UNROLL
 27 |         for (int i = 0; i < std::size(frag_k); ++i) {
 28 |             frag_k[i] = data_k[i];
 29 |         }
 30 |     }
 31 | };
 32 | 
 33 | template<int StatStepS, int StatStepC>
 34 | struct Transform_HMMA_16816 {
 35 |     template<class F, int Nf, int Mf, int K, class D, int Nd, int Md, class S, int Ns, int Ms, int Ks>
 36 |     __device__ static void
 37 |     apply(Array<F, Nf> (&frag)[K][Mf], int k, Array<D, Nd> (&data)[K][Md], Array<S, Ns> (&stat)[Ks][Ms], int div)
 38 |     {
 39 |         static_assert(Nf * Mf == Nd * Md);
 40 |         static_assert(Nd % Nf == 0 && Mf % Md == 0);
 41 |         static_assert(Nf * Mf == Ns * Ms * 4);
 42 | 
 43 |         // static_assert(Nf != Nf);
 44 | 
 45 |         auto& frag_k = reinterpret_cast<Array<F, Nd>(&)[Md]>(frag[k]);
 46 |         auto& stat_k = reinterpret_cast<Array<S, 1>(&)[Ns * Ms]>(stat[k / div]);
 47 |         auto& data_k = data[k];
 48 | 
 49 |         PRAGMA_UNROLL
 50 |         for (int m = 0; m < Md; ++m) {
 51 |             // if (threadIdx.x == 0) {
 52 |             //     printf("m = %d\n", m);
 53 |             // }
 54 |             auto tmp = ConvertKvCache<D, F>::convert(data_k[m]);
 55 |             PRAGMA_UNROLL
 56 |             for (int i = 0; i < Nd; i += 8) {
 57 |                 PRAGMA_UNROLL
 58 |                 for (int s = 0; s < 2; ++s) {
 59 |                     PRAGMA_UNROLL
 60 |                     for (int c = 0; c < 2; ++c) {
 61 |                         const int idx = (m * Nd + i) / 8 * 2 + s * StatStepS + c * StatStepC;
 62 |                         // if (threadIdx.x == 0) {
 63 |                         //     printf("idx=%d\n", idx);
 64 |                         // }
 65 |                         dequant((Array<F, 2>&)tmp[i + s * 4 + c * 2], stat_k[idx]);
 66 |                     }
 67 |                 }
 68 |             }
 69 | 
 70 |             frag_k[m] = tmp;
 71 |         }
 72 |     }
 73 | 
 74 |     template<class F>
 75 |     __device__ static void dequant(Array<F, 2>& x, Array<uint32_t, 1> s)
 76 |     {
 77 |         Array<F, 2>& _s = (Array<F, 2>&)s;
 78 |         // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)_s[0], (float)_s[1]);
 79 |         // printf("tidx=%d %f %f\n", (int)threadIdx.x, (float)x[0], (float)x[1]);
 80 |         x[0] = __hfma(x[0], _s[0], _s[1]);
 81 |         x[1] = __hfma(x[1], _s[0], _s[1]);
 82 |     }
 83 | };
 84 | 
 85 | struct Transform_HMMA_SIMT_B {
 86 |     template<class F, int Nf, int Mf, int K, class D, int Nd, int Md, class S, int Ns, int Ms, int Ks>
 87 |     __device__ static void
 88 |     apply(Array<F, Nf> (&frag)[K][Mf], int k, Array<D, Nd> (&data)[K][Md], Array<S, Ns> (&stat)[Ks][Ms], int div)
 89 |     {
 90 |         static_assert(Nf * Mf == Nd * Md);
 91 |         static_assert(Nd % Nf == 0 && Mf % Md == 0);
 92 | 
 93 |         auto& frag_k = reinterpret_cast<Array<F, Nd>(&)[Md]>(frag[k]);
 94 |         auto& stat_k = reinterpret_cast<Array<S, 1>(&)[Ns * Ms]>(stat[k / div]);
 95 |         auto& data_k = data[k];
 96 | 
 97 |         // static_assert(Nf != Nf);
 98 | 
 99 |         PRAGMA_UNROLL
100 |         for (int m = 0; m < Md; ++m) {
101 |             auto tmp = ConvertKvCache<D, F>::convert(data_k[m]);
102 |             PRAGMA_UNROLL
103 |             for (int i = 0; i < Nd; i += 2) {
104 |                 dequant((Array<F, 2>&)tmp[i], stat_k[(m * Nd + i) / Nf]);
105 |             }
106 |             frag_k[m] = tmp;
107 |         }
108 |     }
109 | 
110 |     template<class F>
111 |     __device__ static void dequant(Array<F, 2>& x, Array<uint32_t, 1> s)
112 |     {
113 |         Array<F, 2>& _s = (Array<F, 2>&)s;
114 | 
115 |         x[0] = __hfma(x[0], _s[0], _s[1]);
116 |         x[1] = __hfma(x[1], _s[0], _s[1]);
117 |     }
118 | };
119 | 
120 | }  // namespace turbomind::gemm
121 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/operand_sm70_s884.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/layout.h"
  6 | #include "src/turbomind/kernels/core/meta.h"
  7 | #include "src/turbomind/kernels/gemm/arch/smem_copy_sm70.h"
  8 | #include "src/turbomind/kernels/gemm/iterator.h"
  9 | #include "src/turbomind/kernels/gemm/operand.h"
 10 | #include "src/turbomind/kernels/gemm/smem_copy.h"
 11 | #include "src/turbomind/kernels/gemm/types.h"
 12 | 
 13 | namespace turbomind::gemm {
 14 | 
 15 | namespace sm70_s884 {
 16 | 
 17 | template<Order order>
 18 | struct GetSmemLayout {
 19 |     template<int M, int K>
 20 |     static constexpr auto apply(pair<M, K>)
 21 |     {
 22 |         constexpr int2 cs = mk2cs<order>(M, K);
 23 |         return SmemLayoutV2<cs.y, cs.x, 1, 1>{};
 24 |     }
 25 | };
 26 | 
 27 | template<class T>
 28 | struct Operand_A {
 29 |     using Dtype = T;
 30 | 
 31 |     static constexpr Pack  kPack  = 0;
 32 |     static constexpr Order kOrder = kRowMajor;
 33 | 
 34 |     using SmemCopyAtom = SmemCopy_MMA_884_A<T>;
 35 | 
 36 |     using GetSmemLayout = GetSmemLayout<kOrder>;
 37 |     using GetGmemIter   = GetGmemIter;
 38 | };
 39 | 
 40 | template<class T>
 41 | struct Operand_B {
 42 |     using Dtype = T;
 43 | 
 44 |     static constexpr Pack  kPack  = 0;
 45 |     static constexpr Order kOrder = kRowMajor;  // (n,k)
 46 | 
 47 |     using SmemCopyAtom = SmemCopy_MMA_884_B<T>;
 48 | 
 49 |     using GetSmemLayout = GetSmemLayout<kOrder>;
 50 |     using GetGmemIter   = GetGmemIter;
 51 | };
 52 | 
 53 | template<class T>
 54 | struct Operand_V {
 55 |     using Dtype = T;
 56 | 
 57 |     static constexpr Pack  kPack  = 0;
 58 |     static constexpr Order kOrder = kColMajor;  // (n,k)
 59 | 
 60 |     using SmemCopyAtom = SmemCopy_MMA_884_V<T, 1>;
 61 | 
 62 |     struct GetSmemLayout {  // m-major
 63 |         template<int M, int K>
 64 |         static constexpr auto apply(pair<M, K>)
 65 |         {
 66 |             return SmemLayoutV2<K, M>{};
 67 |         }
 68 |     };
 69 | 
 70 |     using GetGmemIter = GetGmemIter;
 71 | };
 72 | 
 73 | template<Order order>
 74 | struct _GetSmemLayoutC {
 75 |     template<int M, int N>
 76 |     static constexpr auto apply(pair<M, N>)
 77 |     {
 78 |         constexpr auto cs = mk2cs<order>(M, N);
 79 |         return SmemLayoutV2<cs.y, cs.x, 1, 1>{};
 80 |     }
 81 | };
 82 | 
 83 | template<Order order>
 84 | struct _GetThreadMapC {
 85 |     template<int M, int N, int THREADS>
 86 |     static constexpr auto apply(pair<M, N>, constant<THREADS>)
 87 |     {
 88 |         constexpr auto cs    = mk2cs<order>(M, N);
 89 |         constexpr int  WARPS = THREADS / WARP_SIZE;
 90 | 
 91 |         return ThreadMap_V2<cs.x, cs.y, 4, Raked, WARPS>{};
 92 |     }
 93 | };
 94 | 
 95 | template<class T, Order order>
 96 | struct Operand_C {
 97 |     using Dtype = T;
 98 | 
 99 |     static constexpr Order kOrder = order;
100 | 
101 |     using GetSmemLayout = _GetSmemLayoutC<order>;
102 |     using GetThreadMap  = _GetThreadMapC<order>;
103 | };
104 | 
105 | template<class T>
106 | struct Operand_B_Pack {
107 |     using Dtype = T;
108 | 
109 |     static constexpr int Pack_M = 1;
110 | 
111 |     static constexpr Pack  kPack  = HMMA_884 | OPERAND_B | Pack_M;
112 |     static constexpr Order kOrder = kRowMajor;
113 | 
114 |     using SmemCopyAtom = SmemCopyAtom_Pack_v3<T, SmemCopy_MMA_884_B<T>, kOrder, Pack_M>;
115 | 
116 |     using GetSmemLayout = GetSmemLayout<kOrder>;
117 |     using GetGmemIter   = GetGmemIter;
118 | };
119 | 
120 | template<class T>
121 | struct Operand_V_Pack {
122 |     using Dtype = T;
123 | 
124 |     static constexpr int Pack_M = 1;
125 | 
126 |     static constexpr Pack  kPack  = HMMA_884 | OPERAND_V | Pack_M;
127 |     static constexpr Order kOrder = kColMajor;
128 | 
129 |     using SmemCopyAtom = SmemCopyAtom_Pack_v3<T, SmemCopy_MMA_884_V<T, 8>, kColMajor, Pack_M>;
130 | 
131 |     struct GetSmemLayout {  // m-major
132 |         template<int M, int K>
133 |         static constexpr auto apply(pair<M, K>)
134 |         {
135 |             return SmemLayoutV2<K, M>{};
136 |         }
137 |     };
138 | 
139 |     using GetGmemIter = GetGmemIter;
140 | };
141 | 
142 | }  // namespace sm70_s884
143 | 
144 | template<class T>
145 | struct GetOperand<HMMA_884, OPERAND_A, T, kRowMajor, false>: std::true_type {
146 |     using Operand = sm70_s884::Operand_A<T>;
147 | };
148 | 
149 | template<class T>
150 | struct GetOperand<HMMA_884, OPERAND_B, T, kRowMajor, false>: std::true_type {
151 |     using Operand = sm70_s884::Operand_B<T>;
152 | };
153 | 
154 | template<class T>
155 | struct GetOperand<HMMA_884, OPERAND_V, T, kColMajor, false>: std::true_type {
156 |     using Operand = sm70_s884::Operand_V<T>;
157 | };
158 | 
159 | template<class T>
160 | struct GetOperand<HMMA_884, OPERAND_B, T, kRowMajor, true>: std::true_type {
161 |     using Operand = sm70_s884::Operand_B_Pack<T>;
162 | };
163 | 
164 | template<class T>
165 | struct GetOperand<HMMA_884, OPERAND_V, T, kColMajor, true>: std::true_type {
166 |     using Operand = sm70_s884::Operand_V_Pack<T>;
167 | };
168 | 
169 | }  // namespace turbomind::gemm
170 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/gpu_metric.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #include "src/turbomind/kernels/core/array.h"
  4 | #include "src/turbomind/kernels/core/common.h"
  5 | #include "src/turbomind/kernels/core/math.h"
  6 | #include "src/turbomind/kernels/gemm/gpu_metric.h"
  7 | #include <thrust/device_vector.h>
  8 | 
  9 | #include <cublas_v2.h>
 10 | 
 11 | namespace turbomind::gemm {
 12 | 
 13 | using thrust::device_vector;
 14 | 
 15 | namespace {
 16 | 
 17 | template<int BLOCK_NUM, int BLOCK_DIM, int LOG_TILE>
 18 | __global__ void l2_bw(float* dsink, const float* array, int count)
 19 | {
 20 |     int    tid = threadIdx.x + (blockIdx.x >> LOG_TILE) * blockDim.x;
 21 |     float4 sink{};
 22 | 
 23 |     constexpr int NUM_THREADS = BLOCK_NUM * BLOCK_DIM;
 24 | 
 25 |     for (int i = 0; i < count; i += NUM_THREADS * 4) {
 26 |         const float* ptr    = array + i;
 27 |         const int    offset = tid * 4;
 28 |         float4       data   = __ldcg(reinterpret_cast<const float4*>(ptr + offset));
 29 |         sink.x += data.x;
 30 |         sink.y += data.y;
 31 |         sink.z += data.z;
 32 |         sink.w += data.w;
 33 |     }
 34 | 
 35 |     dsink[threadIdx.x] = sink.x + sink.y + sink.z + sink.w;
 36 | }
 37 | 
 38 | }  // namespace
 39 | 
 40 | float MeasureL2CacheThroughput()
 41 | {
 42 |     cudaDeviceProp prop{};
 43 |     int            device{};
 44 |     cudaGetDevice(&device);
 45 |     cudaGetDeviceProperties(&prop, device);
 46 | 
 47 |     size_t size = static_cast<size_t>(prop.l2CacheSize) * 64;
 48 | 
 49 |     std::cout << size << std::endl;
 50 | 
 51 |     constexpr int BLOCK_X  = 128;  // blocks participating single sweep
 52 |     constexpr int BLOCK_Y  = 128;  // full sweep iters
 53 |     constexpr int LOG_TILE = 5;    // swizzling factor to bring up L2 hit rate, set to 0 will minimize hit rate
 54 | 
 55 |     constexpr int BLOCK_DIM = 256;
 56 | 
 57 |     constexpr int CHUNK_SIZE = BLOCK_X * BLOCK_DIM * 4;  // x4 for float4 load pattern
 58 | 
 59 |     device_vector<float> data(ceil_div(size, sizeof(float)) / CHUNK_SIZE * CHUNK_SIZE);
 60 |     device_vector<float> dsink(BLOCK_DIM);
 61 | 
 62 |     cudaStream_t stream;
 63 |     cudaStreamCreate(&stream);
 64 | 
 65 |     cudaMemsetAsync(data.data().get(), 0, sizeof(float) * data.size(), stream);
 66 | 
 67 |     cudaEvent_t ev_start, ev_end;
 68 | 
 69 |     cudaEventCreate(&ev_start);
 70 |     cudaEventCreate(&ev_end);
 71 | 
 72 |     cudaEventRecord(ev_start, stream);
 73 | 
 74 |     l2_bw<BLOCK_X, BLOCK_DIM, LOG_TILE><<<dim3(BLOCK_X << LOG_TILE, BLOCK_Y >> LOG_TILE), BLOCK_DIM, 0, stream>>>(
 75 |         dsink.data().get(), data.data().get(), data.size());
 76 | 
 77 |     cudaEventRecord(ev_end, stream);
 78 | 
 79 |     cudaEventSynchronize(ev_end);
 80 | 
 81 |     float ms{};
 82 |     cudaEventElapsedTime(&ms, ev_start, ev_end);
 83 | 
 84 |     size_t bytes = BLOCK_Y * sizeof(float) * data.size();
 85 | 
 86 |     const float bytes_per_second = bytes / ms * 1e3;
 87 |     std::cout << bytes_per_second / 1e9 << " GB/s" << std::endl;
 88 | 
 89 |     cudaEventDestroy(ev_start);
 90 |     cudaEventDestroy(ev_end);
 91 | 
 92 |     cudaStreamDestroy(stream);
 93 | 
 94 |     return bytes_per_second;
 95 | }
 96 | 
 97 | float MeasureMmaThroughput(int problem_size)
 98 | {
 99 |     device_vector<half> a(problem_size * problem_size);
100 |     device_vector<half> b(a.size());
101 |     device_vector<half> c(a.size());
102 | 
103 |     cublasHandle_t cublas{};
104 |     cublasCreate(&cublas);
105 | 
106 |     cudaStream_t stream;
107 |     cudaStreamCreate(&stream);
108 | 
109 |     cublasSetStream(cublas, stream);
110 | 
111 |     cudaEvent_t ev_start, ev_end;
112 | 
113 |     cudaEventCreate(&ev_start);
114 |     cudaEventCreate(&ev_end);
115 | 
116 |     cudaEventRecord(ev_start, stream);
117 | 
118 |     float alpha = 1.f;
119 |     float beta  = 0.f;
120 |     cublasGemmEx(cublas,
121 |                  CUBLAS_OP_N,
122 |                  CUBLAS_OP_N,
123 |                  problem_size,
124 |                  problem_size,
125 |                  problem_size,
126 |                  &alpha,
127 |                  a.data().get(),
128 |                  CUDA_R_16F,
129 |                  problem_size,
130 |                  b.data().get(),
131 |                  CUDA_R_16F,
132 |                  problem_size,
133 |                  &beta,
134 |                  c.data().get(),
135 |                  CUDA_R_16F,
136 |                  problem_size,
137 |                  CUBLAS_COMPUTE_32F,
138 |                  CUBLAS_GEMM_DEFAULT);
139 | 
140 |     cudaEventRecord(ev_end, stream);
141 | 
142 |     cudaEventSynchronize(ev_end);
143 | 
144 |     float ms{};
145 |     cudaEventElapsedTime(&ms, ev_start, ev_end);
146 | 
147 |     cudaEventDestroy(ev_start);
148 |     cudaEventDestroy(ev_end);
149 | 
150 |     cudaStreamDestroy(stream);
151 | 
152 |     cublasDestroy(cublas);
153 | 
154 |     const size_t ops = (size_t)problem_size * problem_size * problem_size;
155 | 
156 |     float fma_per_second = ops / ms * 1e3;
157 | 
158 |     std::cout << 2 * fma_per_second / 1e9 << " FLOPS/s" << std::endl;
159 | 
160 |     return fma_per_second;
161 | }
162 | 
163 | }  // namespace turbomind::gemm
164 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch/operand_simt.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/layout.h"
  6 | #include "src/turbomind/kernels/core/meta.h"
  7 | #include "src/turbomind/kernels/gemm/arch/smem_copy_simt.h"
  8 | #include "src/turbomind/kernels/gemm/iterator.h"
  9 | #include "src/turbomind/kernels/gemm/operand.h"
 10 | #include "src/turbomind/kernels/gemm/simt.h"
 11 | #include "src/turbomind/kernels/gemm/smem_copy.h"
 12 | #include "src/turbomind/kernels/gemm/types.h"
 13 | 
 14 | namespace turbomind::gemm {
 15 | 
 16 | namespace simt {
 17 | 
 18 | struct GetSmemLayout {
 19 |     template<int M, int K>
 20 |     static constexpr auto apply(pair<M, K>)
 21 |     {
 22 |         return SmemLayoutV2<M, K>{};
 23 |     }
 24 | };
 25 | 
 26 | template<class T, int K>
 27 | struct Operand_A {
 28 |     using Dtype = T;
 29 | 
 30 |     static constexpr Pack  kPack  = 0;
 31 |     static constexpr Order kOrder = kRowMajor;
 32 | 
 33 |     using SmemCopyAtom = SmemCopy_MMA_SIMT_A<T, K>;
 34 | 
 35 |     using GetSmemLayout = GetSmemLayout;
 36 |     using GetGmemIter   = GetGmemIter;
 37 | };
 38 | 
 39 | template<class T, int K>
 40 | struct Operand_B {
 41 |     using Dtype = T;
 42 | 
 43 |     static constexpr Pack  kPack  = 0;
 44 |     static constexpr Order kOrder = kRowMajor;
 45 | 
 46 |     using SmemCopyAtom = SmemCopy_MMA_SIMT_B<T, K>;
 47 | 
 48 |     using GetSmemLayout = GetSmemLayout;
 49 |     using GetGmemIter   = GetGmemIter;
 50 | };
 51 | 
 52 | template<Order order>
 53 | struct _GetSmemLayoutC {
 54 |     template<int M, int N>
 55 |     static constexpr auto apply(pair<M, N>)
 56 |     {
 57 |         constexpr auto cs = mk2cs<order>(M, N);
 58 |         return SmemLayoutV2<cs.y, cs.x, 1, 1>{};
 59 |     }
 60 | };
 61 | 
 62 | template<Order order>
 63 | struct _GetThreadMapC {
 64 |     template<int M, int N, int THREADS>
 65 |     static constexpr auto apply(pair<M, N>, constant<THREADS>)
 66 |     {
 67 |         constexpr auto cs    = mk2cs<order>(M, N);
 68 |         constexpr int  WARPS = THREADS / WARP_SIZE;
 69 | 
 70 |         return ThreadMap_V2<cs.x, cs.y, 4, Raked, WARPS>{};
 71 |     }
 72 | };
 73 | 
 74 | template<class T, Order order>
 75 | struct Operand_C {
 76 |     using Dtype = T;
 77 | 
 78 |     static constexpr Order kOrder = order;
 79 | 
 80 |     using GetSmemLayout = _GetSmemLayoutC<order>;
 81 |     using GetThreadMap  = _GetThreadMapC<order>;
 82 | };
 83 | 
 84 | template<class T>
 85 | struct Operand_V {
 86 |     using Dtype = T;
 87 | 
 88 |     static constexpr Pack  kPack  = 0;
 89 |     static constexpr Order kOrder = kColMajor;
 90 | 
 91 |     using SmemCopyAtom = SmemCopy_MMA_SIMT_V<T, 1>;
 92 | 
 93 |     struct GetSmemLayout {  // m-major
 94 |         template<int M, int K>
 95 |         static constexpr auto apply(pair<M, K>)
 96 |         {
 97 |             return SmemLayoutV2<K, M>{};
 98 |         }
 99 |     };
100 | 
101 |     using GetGmemIter = GetGmemIter;
102 | };
103 | 
104 | struct GetSmemLayout_Pack {
105 |     template<int M, int K>
106 |     static constexpr auto apply(pair<M, K>)
107 |     {
108 |         return SmemLayoutV2<M, K>{};
109 |     }
110 | };
111 | 
112 | template<class T, int K>
113 | struct Operand_B_Pack {
114 |     using Dtype = T;
115 | 
116 |     static constexpr int Pack_M = 1;
117 | 
118 |     static constexpr Pack  kPack  = HMMA_SIMT | OPERAND_B | Pack_M;
119 |     static constexpr Order kOrder = kRowMajor;
120 | 
121 |     using SmemCopyAtom  = SmemCopyAtom_Pack_v3<T, typename Operand_B<T, K>::SmemCopyAtom, kRowMajor, Pack_M>;
122 |     using GetSmemLayout = GetSmemLayout_Pack;
123 |     using GetGmemIter   = GetGmemIter;
124 | };
125 | 
126 | template<class T>
127 | struct Operand_V_Pack {
128 |     using Dtype = T;
129 | 
130 |     static constexpr int Pack_M = 1;
131 | 
132 |     static constexpr Pack  kPack  = HMMA_SIMT | OPERAND_V | Pack_M;
133 |     static constexpr Order kOrder = kColMajor;
134 | 
135 |     using SmemCopyAtom = SmemCopyAtom_Pack_v3<T, SmemCopy_MMA_SIMT_V<T, OP_K>, kColMajor, Pack_M>;
136 | 
137 |     struct GetSmemLayout {  // m-major
138 |         template<int M, int K>
139 |         static constexpr auto apply(pair<M, K>)
140 |         {
141 |             return SmemLayoutV2<K, M>{};
142 |         }
143 |     };
144 | 
145 |     using GetGmemIter = GetGmemIter;
146 | };
147 | 
148 | }  // namespace simt
149 | 
150 | template<class T>
151 | struct GetOperand<HMMA_SIMT, OPERAND_A, T, kRowMajor, false>: std::true_type {
152 |     using Operand = simt::Operand_A<T, simt::OP_K>;
153 | };
154 | 
155 | template<class T>
156 | struct GetOperand<HMMA_SIMT, OPERAND_B, T, kRowMajor, false>: std::true_type {
157 |     using Operand = simt::Operand_B<T, simt::OP_K>;
158 | };
159 | 
160 | template<class T>
161 | struct GetOperand<HMMA_SIMT, OPERAND_V, T, kColMajor, false>: std::true_type {
162 |     using Operand = simt::Operand_V<T>;
163 | };
164 | 
165 | template<class T>
166 | struct GetOperand<HMMA_SIMT, OPERAND_B, T, kRowMajor, true>: std::true_type {
167 |     using Operand = simt::Operand_B_Pack<T, simt::OP_K>;
168 | };
169 | 
170 | template<class T>
171 | struct GetOperand<HMMA_SIMT, OPERAND_V, T, kColMajor, true>: std::true_type {
172 |     using Operand = simt::Operand_V_Pack<T>;
173 | };
174 | 
175 | }  // namespace turbomind::gemm
176 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/types.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/data_type.h"
  6 | #include <cuda_fp16.h>
  7 | #if ENABLE_BF16
  8 | #include <cuda_bf16.h>
  9 | #endif
 10 | 
 11 | namespace turbomind::gemm {
 12 | 
 13 | enum class Order : int
 14 | {
 15 |     kColMajor = 0,
 16 |     kRowMajor = 1,
 17 | };
 18 | 
 19 | inline constexpr Order kColMajor = Order::kColMajor;
 20 | inline constexpr Order kRowMajor = Order::kRowMajor;
 21 | 
 22 | constexpr Order operator~(Order a)
 23 | {
 24 |     return a == kColMajor ? kRowMajor : kColMajor;
 25 | }
 26 | 
 27 | using Pack = uint32_t;
 28 | 
 29 | typedef enum MMA_Tag
 30 | {
 31 |     HMMA_16816 = 0x100,  // sm80+
 32 |     HMMA_1688  = 0x200,  // sm75
 33 |     HMMA_884   = 0x300,  // sm70
 34 |     HMMA_SIMT  = 0x400,  // sm75-
 35 | } MMA_Tag;
 36 | 
 37 | typedef enum Op_Tag
 38 | {
 39 |     OPERAND_A = 0x010,
 40 |     OPERAND_B = 0x020,
 41 |     OPERAND_U = 0x030,
 42 |     OPERAND_V = 0x040,
 43 | } Op_Tag;
 44 | 
 45 | constexpr MMA_Tag get_mma_tag(Pack pack)
 46 | {
 47 |     return static_cast<MMA_Tag>(pack & 0xf00);
 48 | }
 49 | 
 50 | constexpr Op_Tag get_operand_tag(Pack pack)
 51 | {
 52 |     return static_cast<Op_Tag>(pack & 0x0f0);
 53 | }
 54 | 
 55 | constexpr int get_pack_num(Pack pack)
 56 | {
 57 |     return pack & 0x00f;
 58 | }
 59 | 
 60 | enum class QuantType : int
 61 | {
 62 |     kNone,
 63 |     kDefault,
 64 | };
 65 | 
 66 | enum class Epilogue : int
 67 | {
 68 |     kNone               = 0,
 69 |     kChannelCombination = 0x1,
 70 |     kGatedSilu          = 0x2,
 71 | };
 72 | 
 73 | enum class DataType : int
 74 | {
 75 |     U4,
 76 |     U8,
 77 |     U16,
 78 |     U32,
 79 |     U64,
 80 |     F8_E4M3,
 81 |     F8_E5M2,
 82 |     F16,
 83 |     F32,
 84 |     BF16,
 85 |     TF32,
 86 | };
 87 | 
 88 | inline const char* to_string(DataType data_type)
 89 | {
 90 |     switch (data_type) {
 91 |         case DataType::U4:
 92 |             return "u4";
 93 |         case DataType::U8:
 94 |             return "u8";
 95 |         case DataType::F16:
 96 |             return "f16";
 97 |         case DataType::F32:
 98 |             return "f32";
 99 |         case DataType::BF16:
100 |             return "bf16";
101 |         case DataType::TF32:
102 |             return "tf32";
103 |         default:
104 |             return "unknown";
105 |     }
106 | }
107 | 
108 | inline int64_t get_size(DataType type, int64_t size)
109 | {
110 |     if (!size) {
111 |         return 0;
112 |     }
113 |     switch (type) {
114 |         case DataType::U64:
115 |             return size * 8;
116 |         case DataType::F32:
117 |         case DataType::U32:
118 |             return size * 4;
119 |         case DataType::BF16:
120 |         case DataType::F16:
121 |         case DataType::U16:
122 |             return size * 2;
123 |         case DataType::U8:
124 |         case DataType::F8_E4M3:
125 |         case DataType::F8_E5M2:
126 |             return size;
127 |         case DataType::U4:
128 |             return size / 2;
129 |         default:
130 |             // std::cerr << to_string(type) << "\n";
131 |             return -1;
132 |     }
133 | }
134 | 
135 | template<class T>
136 | struct get_data_type {
137 | };
138 | 
139 | template<>
140 | struct get_data_type<half> {
141 |     static constexpr auto value = DataType::F16;
142 | };
143 | 
144 | #if ENABLE_BF16
145 | template<>
146 | struct get_data_type<nv_bfloat16> {
147 |     static constexpr auto value = DataType::BF16;
148 | };
149 | #endif
150 | 
151 | template<>
152 | struct get_data_type<uint4_t> {
153 |     static constexpr auto value = DataType::U4;
154 | };
155 | 
156 | template<>
157 | struct get_data_type<uint8_t> {
158 |     static constexpr auto value = DataType::U8;
159 | };
160 | 
161 | template<class T>
162 | inline constexpr auto get_data_type_v = get_data_type<T>::value;
163 | 
164 | template<DataType dtype>
165 | struct get_dtype {
166 | };
167 | 
168 | template<>
169 | struct get_dtype<DataType::F16> {
170 |     using type = half;
171 | };
172 | 
173 | template<>
174 | struct get_dtype<DataType::U4> {
175 |     using type = uint4_t;
176 | };
177 | 
178 | template<>
179 | struct get_dtype<DataType::U8> {
180 |     using type = uint8_t;
181 | };
182 | 
183 | template<>
184 | struct get_dtype<DataType::U16> {
185 |     using type = uint16_t;
186 | };
187 | 
188 | template<>
189 | struct get_dtype<DataType::U32> {
190 |     using type = uint32_t;
191 | };
192 | 
193 | struct QuantDesc {
194 |     QuantType type;
195 |     int       group_size;
196 | };
197 | 
198 | enum class DispatchPolicy : int
199 | {
200 |     kDefault = 0,
201 |     kMeasure = 1,
202 |     kReuse   = 2,
203 |     kAppend  = 3,
204 | };
205 | 
206 | constexpr bool operator&(const DispatchPolicy& a, const DispatchPolicy& b)
207 | {
208 |     return ((int)a & (int)b);
209 | }
210 | 
211 | struct Operation {
212 |     DispatchPolicy dispatch;
213 |     Epilogue       epilogue;
214 |     QuantDesc      quant_a;
215 |     QuantDesc      quant_b;
216 |     int            batch_dim;
217 |     void*          reserved;
218 | };
219 | 
220 | struct MatrixLayout {
221 |     DataType type;
222 |     Order    order;
223 |     int      rows;
224 |     int      cols;
225 |     int      ld;
226 |     Pack     pack;
227 | };
228 | 
229 | inline int64_t get_size(const MatrixLayout& m)
230 | {
231 |     return get_size(m.type, (int64_t)m.rows * m.cols);
232 | }
233 | 
234 | struct Workspace {
235 |     void*  barriers;
236 |     size_t barriers_size;
237 |     void*  partials;
238 |     size_t partials_size;
239 | };
240 | 
241 | }  // namespace turbomind::gemm
242 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/thread_map.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) OpenMMLab. All rights reserved.
  2 | 
  3 | #pragma once
  4 | 
  5 | #include "src/turbomind/kernels/core/common.h"
  6 | 
  7 | #include <iostream>
  8 | 
  9 | namespace turbomind {
 10 | 
 11 | template<int C, int S, int AccessC, int WarpCount>
 12 | struct ThreadMapQ {
 13 |     static constexpr int kWarpCount = WarpCount;
 14 |     static constexpr int kAccessC   = AccessC;
 15 | 
 16 |     static constexpr int kWarpThreadC = C / kAccessC;
 17 |     static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC;
 18 | 
 19 |     static_assert(kWarpThreadC <= WARP_SIZE);
 20 | 
 21 |     static constexpr int kWarpAccessC = kWarpThreadC * kAccessC;  // C
 22 |     static constexpr int kWarpAccessS = kWarpThreadS;
 23 | 
 24 |     static constexpr int kWarpIterC = C / kWarpAccessC;  // 1
 25 |     static constexpr int kWarpIterS = S / kWarpAccessS;
 26 | 
 27 |     static constexpr int kWarpC = 1;
 28 |     static constexpr int kWarpS = kWarpCount;
 29 | 
 30 |     static constexpr int kIterC = kWarpIterC / kWarpC;  // 1
 31 |     static constexpr int kIterS = std::max(kWarpIterS / kWarpS, 1);
 32 | 
 33 |     static constexpr int kFootprintC = kWarpAccessC * kIterC;  // C
 34 |     static constexpr int kFootprintS = kWarpAccessS * kIterS;
 35 | 
 36 |     static constexpr int kDeltaC = kWarpAccessC;
 37 |     static constexpr int kDeltaS = kWarpAccessS;
 38 | 
 39 |     __device__ static int2 get_offset(int warp_id, int lane_id)
 40 |     {
 41 |         int warp_offset_c = warp_id % kWarpC;
 42 |         int warp_offset_s = warp_id / kWarpC;
 43 | 
 44 |         int warp_thread_offset_c = lane_id % kWarpThreadC;
 45 |         int warp_thread_offset_s = lane_id / kWarpThreadC;
 46 | 
 47 |         int cta_thread_offset_c = kFootprintC * warp_offset_c + warp_thread_offset_c * kAccessC;
 48 |         int cta_thread_offset_s = kFootprintS * warp_offset_s + warp_thread_offset_s;
 49 | 
 50 |         return {cta_thread_offset_c, cta_thread_offset_s};
 51 |     }
 52 | };
 53 | 
 54 | template<int DimC, int DimS, int AccessC, int WarpCount, int WarpThreadC = DimC / AccessC>
 55 | struct RakedThreadMap {
 56 |     static constexpr int kDimC = DimC;
 57 |     static constexpr int kDimS = DimS;
 58 | 
 59 |     static constexpr int kWarpCount = WarpCount;
 60 |     static constexpr int kAccessC   = AccessC;
 61 | 
 62 |     static constexpr int kWarpThreadC = WarpThreadC;
 63 |     static constexpr int kWarpThreadS = WARP_SIZE / kWarpThreadC;
 64 | 
 65 |     static_assert(kWarpThreadC <= WARP_SIZE);
 66 | 
 67 |     static constexpr int kWarpAccessC = kWarpThreadC * kAccessC;
 68 |     static constexpr int kWarpAccessS = kWarpThreadS;
 69 | 
 70 |     static constexpr int kWarpIterC = (kDimC + kWarpAccessC - 1) / kWarpAccessC;
 71 |     static constexpr int kWarpIterS = kDimS / kWarpAccessS;
 72 | 
 73 |     static constexpr int kWarpC = 1;
 74 |     static constexpr int kWarpS = kWarpCount;
 75 | 
 76 |     static constexpr int kIterC = kWarpIterC / kWarpC;
 77 |     static constexpr int kIterS = std::max(kWarpIterS / kWarpS, 1);
 78 | 
 79 |     // Allow partial tile when there is ONLY 1 iteration
 80 |     static_assert(kDimC % kWarpAccessC == 0 || kIterC == 1);
 81 | 
 82 |     static_assert(kIterC > 0);
 83 |     static_assert(kIterS > 0);
 84 | 
 85 |     static constexpr bool kPartialC = kDimC % kWarpAccessC != 0;
 86 | 
 87 |     static constexpr int kFootprintC = kWarpAccessC * kIterC;
 88 |     static constexpr int kFootprintS = kWarpAccessS * kIterS;
 89 | 
 90 |     static constexpr int kDeltaC = kWarpAccessC;
 91 |     static constexpr int kDeltaS = kWarpAccessS;
 92 | 
 93 |     // static constexpr int kDeltaC = kWarpAccessC * kWarpC;
 94 |     // static constexpr int kDeltaS = kWarpAccessS * kWarpS;
 95 | 
 96 |     __device__ static int2 get_offset(int warp_id, int lane_id)
 97 |     {
 98 |         int warp_offset_c = warp_id % kWarpC;
 99 |         int warp_offset_s = warp_id / kWarpC;
100 | 
101 |         int warp_thread_offset_c = lane_id % kWarpThreadC;
102 |         int warp_thread_offset_s = lane_id / kWarpThreadC;
103 | 
104 |         int cta_thread_offset_c = kFootprintC * warp_offset_c + warp_thread_offset_c * kAccessC;
105 |         int cta_thread_offset_s = kFootprintS * warp_offset_s + warp_thread_offset_s;
106 | 
107 |         // int cta_thread_offset_c = kWarpAccessC * warp_offset_c + warp_thread_offset_c * kAccessC;
108 |         // int cta_thread_offset_s = kWarpAccessS * warp_offset_s + warp_thread_offset_s;
109 | 
110 |         return {cta_thread_offset_c, cta_thread_offset_s};
111 |     }
112 | };
113 | 
114 | namespace {
115 | 
116 | template<class TMap>
117 | void Print(TMap)
118 | {
119 |     std::cout << "     warps: " << TMap::kWarpCount << "\n";
120 |     std::cout << "     shape: (" << TMap::kDimC << ", " << TMap::kDimS << ")\n";
121 |     std::cout << "    access: (" << TMap::kAccessC << ", " << 1 << ")\n";
122 |     std::cout << "warpThread: (" << TMap::kWarpThreadC << ", " << TMap::kWarpThreadS << ")\n";
123 |     std::cout << "warpAccess: (" << TMap::kWarpAccessC << ", " << TMap::kWarpAccessS << ")\n";
124 |     std::cout << "  warpIter: (" << TMap::kWarpIterC << ", " << TMap::kWarpIterS << ")\n";
125 |     std::cout << "      warp: (" << TMap::kWarpC << ", " << TMap::kWarpS << ")\n";
126 |     std::cout << "      iter: (" << TMap::kIterC << ", " << TMap::kIterS << ")\n";
127 |     std::cout << " footprint: (" << TMap::kFootprintC << ", " << TMap::kFootprintS << ")\n";
128 |     std::cout << "     delta: (" << TMap::kDeltaC << ", " << TMap::kDeltaS << ")\n";
129 |     std::cout << "  partialC: " << TMap::kPartialC << "\n";
130 | }
131 | 
132 | }  // namespace
133 | 
134 | }  // namespace turbomind
135 | 


--------------------------------------------------------------------------------
/example/modeling_turbomind.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) OpenMMLab. All rights reserved.
  2 | from typing import Dict, Union
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | import transformers
  7 | from accelerate.big_modeling import (init_empty_weights,
  8 |                                      load_checkpoint_and_dispatch)
  9 | from module import get_named_linears, set_op_by_name
 10 | from tqdm import tqdm
 11 | from transformers import AutoConfig, PretrainedConfig, PreTrainedModel
 12 | from typing_extensions import Annotated, Doc
 13 | 
 14 | # from turbomind import Linear
 15 | import turbomind
 16 | 
 17 | 
 18 | class TurbomindForCausalLM(nn.Module):
 19 | 
 20 |     def __init__(
 21 |         self,
 22 |         model,
 23 |         is_quantized,
 24 |         config,
 25 |         quant_config,
 26 |     ):
 27 |         """The base model for all AutoAWQ models.
 28 | 
 29 |         Args:
 30 |             model: The pretrained or quantized model.
 31 |             is_quantized: Indicates if the current model is quantized
 32 |             config: The config of the model.
 33 |             quant_config: The quantization config of the model.
 34 |         """
 35 |         super().__init__()
 36 |         self.model: PreTrainedModel = model
 37 |         self.is_quantized: bool = is_quantized
 38 |         self.search_result = None
 39 |         self.config: PretrainedConfig = config
 40 |         self.quant_config = quant_config
 41 | 
 42 |     def to(self, device: Annotated[str,
 43 |                                    Doc('The device to move your model to.')]):
 44 |         """A utility function for moving the model to a device."""
 45 |         return self.model.to(device)
 46 | 
 47 |     def forward(self, *args, **kwargs):
 48 |         """A forward function that mimics the torch forward."""
 49 |         return self.model(*args, **kwargs)
 50 | 
 51 |     def generate(self, *args, **kwargs):
 52 |         """A generate function that mimics the HF generate function."""
 53 |         with torch.inference_mode():
 54 |             return self.model.generate(*args, **kwargs)
 55 | 
 56 |     @classmethod
 57 |     def from_quantized(self,
 58 |                        model_path: str,
 59 |                        torch_dtype: torch.dtype = torch.float16,
 60 |                        device_map: Union[str, Dict] = 'balanced',
 61 |                        **config_kwargs: Dict):
 62 |         """A method for initialization of a quantized model, usually in INT4.
 63 | 
 64 |         Args:
 65 |             model_path (str): The model path
 66 |             max_seq_len (int): The maximum sequence cached sequence length of
 67 |                 the model. Larger values may increase loading time and
 68 |                 memory usage.
 69 |             torch_dtype: The dtype to load the model as. May not work with
 70 |                 other values than float16.
 71 |             device_map: A device map that will be passed onto the model
 72 |                 loading method from transformers.
 73 |         **config_kwargs: Additional kwargs that are passed to the config
 74 |             during initialization
 75 |         """
 76 |         config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
 77 |         quant_config = config.quantization_config
 78 | 
 79 |         target_cls = getattr(transformers, config.architectures[0])
 80 | 
 81 |         # Load model
 82 |         with init_empty_weights():
 83 |             model = target_cls._from_config(config=config,
 84 |                                             torch_dtype=torch_dtype)
 85 |         # Prepare quantized linear layers, replace nn.Linear
 86 |         self._load_quantized_modules(
 87 |             self,
 88 |             model,
 89 |             quant_config,
 90 |         )
 91 | 
 92 |         model.tie_weights()
 93 | 
 94 |         # loads the weights into modules and distributes
 95 |         # across available devices automatically
 96 |         load_checkpoint_and_dispatch(
 97 |             model,
 98 |             checkpoint=model_path,
 99 |             device_map=device_map,
100 |             no_split_module_classes=[model.model.layers[0].__class__.__name__],
101 |             dtype=torch_dtype,
102 |         )
103 | 
104 |         # model = turbomind_post_init(model)
105 |         for _, submodule in model.named_modules():
106 |             if isinstance(submodule, turbomind.Linear):
107 |                 submodule.post_init()
108 | 
109 |         model.eval()
110 | 
111 |         return self(
112 |             model,
113 |             is_quantized=True,
114 |             config=config,
115 |             quant_config=quant_config,
116 |         )
117 | 
118 |     def _load_quantized_modules(self, model, quant_config):
119 |         assert quant_config['quant_method'] in ['awq', 'gptq']
120 |         if quant_config['quant_method'] == 'awq':
121 |             assert quant_config['version'] == 'gemm'
122 | 
123 |         # Get blocks of model
124 |         layers = model.model.layers
125 | 
126 |         for i in tqdm(range(len(layers)), desc='Replacing layers...'):
127 |             layer = layers[i]
128 | 
129 |             # Get every linear layer in a block
130 |             named_linears = get_named_linears(layer)
131 | 
132 |             # # Filter out the linear layers we don't want to include
133 |             # named_linears = exclude_layers_to_not_quantize(
134 |             #     named_linears, quant_config.modules_to_not_convert)
135 | 
136 |             # Replace nn.Linear with turbomind Linear
137 |             for name, module in named_linears.items():
138 |                 q_linear_module = turbomind.Linear
139 |                 q_linear = q_linear_module.from_linear(
140 |                     module, quant_config['bits'], quant_config['group_size'],
141 |                     quant_config['quant_method'], True)
142 |                 q_linear.to(next(layer.parameters()).device)
143 |                 set_op_by_name(layer, name, q_linear)
144 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/kernel/f16_u4g128_f16_tnt_sm80_s16816.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/arch/config_sm80_s16816.h"
 4 | #include "src/turbomind/kernels/gemm/registry.h"
 5 | #include "src/turbomind/kernels/gemm/transform.h"
 6 | #include "src/turbomind/kernels/gemm/types.h"
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | void Registry::f16_u4g128_f16_tnt_sm80_s16816()
11 | {
12 |     using namespace sm80_s16816;
13 |     using namespace cache_policy;
14 |     using S = cache_policy::Stream;
15 |     using D = cache_policy::Default;
16 | 
17 |     using C = Sm80_s16816<Sm80,
18 |                           Operand_A<half, kRowMajor>,          // A
19 |                           Transform_Default,                   // tarnsform A
20 |                           VoidOperand,                         // U
21 |                           Operand_B_Pack<uint4_t, kColMajor>,  // B
22 |                           Transform_HMMA_16816<1, 0>,          // transform B
23 |                           Operand_UV_Pack<uint32_t, true>,     // V
24 |                           kRowMajor,                           // order_C
25 |                           half>;                               // Tc
26 | 
27 |     // clang-format off
28 |     // Add<C::Type<128, 256,  64, 1, 8, 1, D, S, 3, true, 1, 128>>(); // 0/0
29 |     Add<C::Type<128, 256,  32, 1, 8, 1, D, D, 3, true, 1, 128, 128, 128>>(); // 30/3
30 |     Add<C::Type<128, 256,  32, 1, 8, 1, D, D, 4, true, 1, 128, 128, 128>>(); // --/20
31 |     Add<C::Type<128, 128,  32, 1, 4, 1, D, D, 3, true, 1, 128, 64, 128>>();  // --/13
32 |     Add<C::Type<128, 128,  32, 1, 4, 1, D, S, 4, true, 1, 128, 64, 128>>();  // 21/13
33 |     Add<C::Type<128, 128,  64, 1, 4, 2, D, S, 3, true, 1, 128, 64, 128>>();  // 6/6
34 | 
35 |     Add<C::Type<96, 256,  32, 1, 8, 1, D, D, 4, true, 1, 128>>();  // --/3
36 |     Add<C::Type<96, 256,  32, 1, 8, 1, D, S, 3, true, 1, 128>>();  // 13/13
37 |     Add<C::Type<96, 128,  32, 1, 4, 1, D, S, 4, true, 1, 128>>();  // 14/10
38 |     Add<C::Type<96, 128, 128, 1, 4, 2, D, S, 3, true, 1, 128>>();  // 2/2
39 | 
40 |     Add<C::Type<64, 256,  32, 1, 4, 1, D, D, 3, true, 1, 128, 64, 128>>(); // --/21
41 |     Add<C::Type<64, 256,  32, 1, 4, 1, D, S, 4, true, 1, 128, 64, 128>>(); // 27/13
42 |     Add<C::Type<64, 128,  32, 1, 4, 1, D, S, 4, true, 1, 128>>();  // 8/5
43 |     Add<C::Type<64, 128,  64, 1, 4, 1, D, S, 3, true, 1, 128>>();  // 7/5
44 |     Add<C::Type<64, 128, 128, 1, 4, 2, D, S, 3, true, 1, 128>>();  // 6/7
45 |     Add<C::Type<64,  64,  64, 1, 2, 2, D, S, 6, true, 1, 128>>();
46 | 
47 |     Add<C::Type<48, 256,  64, 1, 4, 1, D, S, 3, true, 1, 128, 48, 128>>(); // 1/1
48 |     Add<C::Type<48, 128,  64, 1, 4, 1, D, S, 4, true, 1, 128>>();  // 1/1
49 |     Add<C::Type<48, 128, 128, 1, 4, 2, D, S, 3, true, 1, 128>>();  // 4/4
50 |     Add<C::Type<48,  64, 128, 1, 2, 2, D, S, 4, true, 1, 128>>();
51 | 
52 |     Add<C::Type<32, 256,  64, 1, 4, 1, D, S, 3, true, 1, 128>>();
53 |     Add<C::Type<32, 128,  64, 1, 4, 1, D, S, 4, true, 1, 128>>();
54 |     Add<C::Type<32, 128, 128, 1, 4, 2, D, S, 3, true, 1, 128>>();
55 |     Add<C::Type<32,  64, 128, 1, 2, 2, D, S, 3, true, 1, 128>>();
56 |     Add<C::Type<32,  64, 128, 1, 2, 2, D, S, 4, true, 1, 128>>();
57 | 
58 |     Add<C::Type<16, 128,  64, 1, 4, 1, D, S, 4, true, 1, 128>>();
59 |     Add<C::Type<16, 128, 128, 1, 4, 2, D, S, 3, true, 1, 128>>();
60 |     Add<C::Type<16, 128, 128, 1, 4, 2, D, S, 4, true, 1, 128>>();
61 |     Add<C::Type<16,  64, 128, 1, 2, 2, D, S, 3, true, 1, 128>>();
62 |     Add<C::Type<16,  64, 128, 1, 2, 2, D, S, 4, true, 1, 128>>();
63 |     // clang-format on
64 | }
65 | 
66 | // sm80_f16_u4g128_f16_ttt_128x256x32_4_s16816_1x8x1_c128x128_a1x32x32_00: 46
67 | // sm80_f16_u4g128_f16_ttt_128x128x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 27
68 | // sm80_f16_u4g128_f16_ttt_64x256x32_3_s16816_1x4x1_c64x128_a1x32x32_00: 21
69 | // sm80_f16_u4g128_f16_ttt_64x256x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 19
70 | // sm80_f16_u4g128_f16_ttt_16x128x128_4_s16816_1x4x2_c16x128_a1x32x128_01: 17
71 | // sm80_f16_u4g128_f16_ttt_32x128x128_3_s16816_1x4x2_c32x128_a1x32x128_01: 16
72 | // sm80_f16_u4g128_f16_ttt_64x128x128_3_s16816_1x4x2_c64x128_a1x32x128_01: 16
73 | // sm80_f16_u4g128_f16_ttt_96x128x32_4_s16816_1x4x1_c96x128_a1x32x32_01: 16
74 | // sm80_f16_u4g128_f16_ttt_96x256x32_4_s16816_1x8x1_c96x256_a1x32x32_00: 15
75 | // sm80_f16_u4g128_f16_ttt_16x64x128_3_s16816_1x2x2_c16x64_a1x32x128_01: 13
76 | // sm80_f16_u4g128_f16_ttt_16x128x64_4_s16816_1x4x1_c16x128_a1x32x64_01: 13
77 | // sm80_f16_u4g128_f16_ttt_48x128x128_3_s16816_1x4x2_c48x128_a1x32x128_01: 13
78 | // sm80_f16_u4g128_f16_ttt_48x256x64_3_s16816_1x4x1_c48x128_a1x32x64_01: 13
79 | // sm80_f16_u4g128_f16_ttt_16x64x128_4_s16816_1x2x2_c16x64_a1x32x128_01: 11
80 | // sm80_f16_u4g128_f16_ttt_64x128x64_3_s16816_1x4x1_c64x128_a1x32x64_01: 9
81 | // sm80_f16_u4g128_f16_ttt_128x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 9
82 | // sm80_f16_u4g128_f16_ttt_96x128x128_3_s16816_1x4x2_c96x128_a1x32x128_01: 7
83 | // sm80_f16_u4g128_f16_ttt_96x256x32_3_s16816_1x8x1_c96x256_a1x32x32_01: 7
84 | // sm80_f16_u4g128_f16_ttt_48x128x64_4_s16816_1x4x1_c48x128_a1x32x64_01: 6
85 | // sm80_f16_u4g128_f16_ttt_32x64x128_4_s16816_1x2x2_c32x64_a1x32x128_01: 5
86 | // sm80_f16_u4g128_f16_ttt_32x256x64_3_s16816_1x4x1_c32x256_a1x32x64_01: 5
87 | // sm80_f16_u4g128_f16_ttt_64x64x64_6_s16816_1x2x2_c64x64_a1x32x64_01: 5
88 | // sm80_f16_u4g128_f16_ttt_16x128x128_3_s16816_1x4x2_c16x128_a1x32x128_01: 4
89 | // sm80_f16_u4g128_f16_ttt_32x128x64_4_s16816_1x4x1_c32x128_a1x32x64_01: 4
90 | // sm80_f16_u4g128_f16_ttt_48x64x128_4_s16816_1x2x2_c48x64_a1x32x128_01: 4
91 | // sm80_f16_u4g128_f16_ttt_64x128x32_4_s16816_1x4x1_c64x128_a1x32x32_01: 4
92 | // sm80_f16_u4g128_f16_ttt_128x128x64_3_s16816_1x4x2_c64x128_a1x32x64_01: 4
93 | // sm80_f16_u4g128_f16_ttt_128x256x32_3_s16816_1x8x1_c128x128_a1x32x32_00: 4
94 | // sm80_f16_u4g128_f16_ttt_32x64x128_3_s16816_1x2x2_c32x64_a1x32x128_01: 3
95 | // sm80_f16_u4g128_f16_ttt_128x256x64_3_s16816_1x8x1_c128x256_a1x32x64_01: 0
96 | 
97 | }  // namespace turbomind::gemm
98 | 


--------------------------------------------------------------------------------