├── .gitignore ├── LICENSE ├── README.md ├── docs └── figures │ ├── Accuracy.png │ ├── Speedup_to_4bit.png │ ├── Speedup_to_8bit.png │ ├── banner.png │ ├── designs.png │ └── e2e_inference.png ├── examples └── README.md ├── lowbit_kernel ├── Makefile ├── all_benchmark ├── bgemm ├── csrc │ ├── benchmark.cu │ ├── fp6_linear.cu │ ├── fp6_linear.cuh │ ├── fp6_llm.cu │ ├── include │ │ ├── configs.h │ │ ├── kernel_matmul.cuh │ │ ├── kernel_reduction.cuh │ │ ├── ptx_cp.async.cuh │ │ ├── ptx_mma.cuh │ │ ├── utils_core.cuh │ │ ├── utils_gmem.cuh │ │ └── utils_parallel_dequant.cuh │ ├── kernel_test.h │ ├── pybind.cpp │ └── utils │ │ ├── helper.h │ │ ├── weight_dequant.h │ │ ├── weight_prepacking.h │ │ └── weight_quant.h └── run.sh ├── setup.py └── tests ├── cpp ├── Makefile ├── benchmark.cu ├── bgemm ├── kernel_test.cu ├── kernel_test.h └── run.sh └── python ├── Test accuracy_x=Iterations_y=Accuracy [%].pdf ├── Training loss_x=Iterations_y=Loss.pdf ├── __pycache__ └── bgemm_linear.cpython-38.pyc ├── bert ├── bert.py ├── configuration_bert.py ├── configuration_utils.py └── file_utils.py ├── bgemm_linear.py ├── run.sh ├── test_kernel.py └── test_model_demo.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/README.md -------------------------------------------------------------------------------- /docs/figures/Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Accuracy.png -------------------------------------------------------------------------------- /docs/figures/Speedup_to_4bit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Speedup_to_4bit.png -------------------------------------------------------------------------------- /docs/figures/Speedup_to_8bit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/Speedup_to_8bit.png -------------------------------------------------------------------------------- /docs/figures/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/banner.png -------------------------------------------------------------------------------- /docs/figures/designs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/designs.png -------------------------------------------------------------------------------- /docs/figures/e2e_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/docs/figures/e2e_inference.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/examples/README.md -------------------------------------------------------------------------------- /lowbit_kernel/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/Makefile -------------------------------------------------------------------------------- /lowbit_kernel/all_benchmark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/all_benchmark -------------------------------------------------------------------------------- /lowbit_kernel/bgemm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/bgemm -------------------------------------------------------------------------------- /lowbit_kernel/csrc/benchmark.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/benchmark.cu -------------------------------------------------------------------------------- /lowbit_kernel/csrc/fp6_linear.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_linear.cu -------------------------------------------------------------------------------- /lowbit_kernel/csrc/fp6_linear.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_linear.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/fp6_llm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/fp6_llm.cu -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/configs.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/configs.h -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/kernel_matmul.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/kernel_matmul.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/kernel_reduction.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/kernel_reduction.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/ptx_cp.async.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/ptx_cp.async.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/ptx_mma.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/ptx_mma.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/utils_core.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_core.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/utils_gmem.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_gmem.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/include/utils_parallel_dequant.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/include/utils_parallel_dequant.cuh -------------------------------------------------------------------------------- /lowbit_kernel/csrc/kernel_test.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/kernel_test.h -------------------------------------------------------------------------------- /lowbit_kernel/csrc/pybind.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/pybind.cpp -------------------------------------------------------------------------------- /lowbit_kernel/csrc/utils/helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/helper.h -------------------------------------------------------------------------------- /lowbit_kernel/csrc/utils/weight_dequant.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_dequant.h -------------------------------------------------------------------------------- /lowbit_kernel/csrc/utils/weight_prepacking.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_prepacking.h -------------------------------------------------------------------------------- /lowbit_kernel/csrc/utils/weight_quant.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/csrc/utils/weight_quant.h -------------------------------------------------------------------------------- /lowbit_kernel/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/lowbit_kernel/run.sh -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/setup.py -------------------------------------------------------------------------------- /tests/cpp/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/Makefile -------------------------------------------------------------------------------- /tests/cpp/benchmark.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/benchmark.cu -------------------------------------------------------------------------------- /tests/cpp/bgemm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/bgemm -------------------------------------------------------------------------------- /tests/cpp/kernel_test.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/kernel_test.cu -------------------------------------------------------------------------------- /tests/cpp/kernel_test.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/kernel_test.h -------------------------------------------------------------------------------- /tests/cpp/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/cpp/run.sh -------------------------------------------------------------------------------- /tests/python/Test accuracy_x=Iterations_y=Accuracy [%].pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/Test accuracy_x=Iterations_y=Accuracy [%].pdf -------------------------------------------------------------------------------- /tests/python/Training loss_x=Iterations_y=Loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/Training loss_x=Iterations_y=Loss.pdf -------------------------------------------------------------------------------- /tests/python/__pycache__/bgemm_linear.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/__pycache__/bgemm_linear.cpython-38.pyc -------------------------------------------------------------------------------- /tests/python/bert/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/bert.py -------------------------------------------------------------------------------- /tests/python/bert/configuration_bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/configuration_bert.py -------------------------------------------------------------------------------- /tests/python/bert/configuration_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/configuration_utils.py -------------------------------------------------------------------------------- /tests/python/bert/file_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bert/file_utils.py -------------------------------------------------------------------------------- /tests/python/bgemm_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/bgemm_linear.py -------------------------------------------------------------------------------- /tests/python/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/run.sh -------------------------------------------------------------------------------- /tests/python/test_kernel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/test_kernel.py -------------------------------------------------------------------------------- /tests/python/test_model_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yifu-ding/BGEMM-CUDA/HEAD/tests/python/test_model_demo.py --------------------------------------------------------------------------------