├── .gitignore
├── LICENSE
├── README.md
├── docs
    └── figures
    │   ├── Accuracy.png
    │   ├── Speedup_to_4bit.png
    │   ├── Speedup_to_8bit.png
    │   ├── banner.png
    │   ├── designs.png
    │   └── e2e_inference.png
├── examples
    └── README.md
├── lowbit_kernel
    ├── Makefile
    ├── all_benchmark
    ├── bgemm
    ├── csrc
    │   ├── benchmark.cu
    │   ├── fp6_linear.cu
    │   ├── fp6_linear.cuh
    │   ├── fp6_llm.cu
    │   ├── include
    │   │   ├── configs.h
    │   │   ├── kernel_matmul.cuh
    │   │   ├── kernel_reduction.cuh
    │   │   ├── ptx_cp.async.cuh
    │   │   ├── ptx_mma.cuh
    │   │   ├── utils_core.cuh
    │   │   ├── utils_gmem.cuh
    │   │   └── utils_parallel_dequant.cuh
    │   ├── kernel_test.h
    │   ├── pybind.cpp
    │   └── utils
    │   │   ├── helper.h
    │   │   ├── weight_dequant.h
    │   │   ├── weight_prepacking.h
    │   │   └── weight_quant.h
    └── run.sh
├── setup.py
└── tests
    ├── cpp
        ├── Makefile
        ├── benchmark.cu
        ├── bgemm
        ├── kernel_test.cu
        ├── kernel_test.h
        └── run.sh
    └── python
        ├── Test accuracy_x=Iterations_y=Accuracy [%].pdf
        ├── Training loss_x=Iterations_y=Loss.pdf
        ├── __pycache__
            └── bgemm_linear.cpython-38.pyc
        ├── bert
            ├── bert.py
            ├── configuration_bert.py
            ├── configuration_utils.py
            └── file_utils.py
        ├── bgemm_linear.py
        ├── run.sh
        ├── test_kernel.py
        └── test_model_demo.py


/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/.gitignore


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/LICENSE


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/README.md


--------------------------------------------------------------------------------
/docs/figures/Accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Accuracy.png


--------------------------------------------------------------------------------
/docs/figures/Speedup_to_4bit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Speedup_to_4bit.png


--------------------------------------------------------------------------------
/docs/figures/Speedup_to_8bit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Speedup_to_8bit.png


--------------------------------------------------------------------------------
/docs/figures/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/banner.png


--------------------------------------------------------------------------------
/docs/figures/designs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/designs.png


--------------------------------------------------------------------------------
/docs/figures/e2e_inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/e2e_inference.png


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/examples/README.md


--------------------------------------------------------------------------------
/lowbit_kernel/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/Makefile


--------------------------------------------------------------------------------
/lowbit_kernel/all_benchmark:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/all_benchmark


--------------------------------------------------------------------------------
/lowbit_kernel/bgemm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/bgemm


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/benchmark.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/benchmark.cu


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/fp6_linear.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_linear.cu


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/fp6_linear.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_linear.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/fp6_llm.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_llm.cu


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/configs.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/configs.h


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/kernel_matmul.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/kernel_matmul.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/kernel_reduction.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/kernel_reduction.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/ptx_cp.async.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/ptx_cp.async.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/ptx_mma.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/ptx_mma.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/utils_core.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_core.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/utils_gmem.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_gmem.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/include/utils_parallel_dequant.cuh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_parallel_dequant.cuh


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/kernel_test.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/kernel_test.h


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/pybind.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/pybind.cpp


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/utils/helper.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/helper.h


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/utils/weight_dequant.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_dequant.h


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/utils/weight_prepacking.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_prepacking.h


--------------------------------------------------------------------------------
/lowbit_kernel/csrc/utils/weight_quant.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_quant.h


--------------------------------------------------------------------------------
/lowbit_kernel/run.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/run.sh


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/setup.py


--------------------------------------------------------------------------------
/tests/cpp/Makefile:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/Makefile


--------------------------------------------------------------------------------
/tests/cpp/benchmark.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/benchmark.cu


--------------------------------------------------------------------------------
/tests/cpp/bgemm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/bgemm


--------------------------------------------------------------------------------
/tests/cpp/kernel_test.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/kernel_test.cu


--------------------------------------------------------------------------------
/tests/cpp/kernel_test.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/kernel_test.h


--------------------------------------------------------------------------------
/tests/cpp/run.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/run.sh


--------------------------------------------------------------------------------
/tests/python/Test accuracy_x=Iterations_y=Accuracy [%].pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/Test accuracy_x=Iterations_y=Accuracy [%].pdf


--------------------------------------------------------------------------------
/tests/python/Training loss_x=Iterations_y=Loss.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/Training loss_x=Iterations_y=Loss.pdf


--------------------------------------------------------------------------------
/tests/python/__pycache__/bgemm_linear.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/__pycache__/bgemm_linear.cpython-38.pyc


--------------------------------------------------------------------------------
/tests/python/bert/bert.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/bert.py


--------------------------------------------------------------------------------
/tests/python/bert/configuration_bert.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/configuration_bert.py


--------------------------------------------------------------------------------
/tests/python/bert/configuration_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/configuration_utils.py


--------------------------------------------------------------------------------
/tests/python/bert/file_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/file_utils.py


--------------------------------------------------------------------------------
/tests/python/bgemm_linear.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bgemm_linear.py


--------------------------------------------------------------------------------
/tests/python/run.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/run.sh


--------------------------------------------------------------------------------
/tests/python/test_kernel.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/test_kernel.py


--------------------------------------------------------------------------------
/tests/python/test_model_demo.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/test_model_demo.py


--------------------------------------------------------------------------------