├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── Makefile ├── README.md ├── benchs └── python │ ├── .gitignore │ ├── bench_flash_attn_f32.py │ └── context.py ├── include ├── common.hpp ├── config.hpp ├── cuda_utils.hpp ├── kernels │ ├── flash_attn │ │ └── flash_attn_f32.hpp │ ├── gemv.hpp │ ├── memory │ │ ├── tile.hpp │ │ └── vec.hpp │ ├── mod.hpp │ ├── online_softmax.hpp │ ├── reduce.hpp │ └── softmax.hpp ├── memory │ ├── mod.hpp │ ├── tile.hpp │ ├── types │ │ └── register.hpp │ └── vec.hpp └── warp │ ├── ldmatrix.hpp │ ├── mma.hpp │ ├── mod.hpp │ └── reduce.hpp ├── notes ├── TK │ └── memory │ │ ├── tile.md │ │ ├── tile │ │ └── ldmatrix_layout.png │ │ └── vec.md ├── flash_attn.md ├── memory │ ├── coalescing.md │ └── vec.md └── warp.md ├── pybindings ├── .gitignore └── __init__.py ├── scripts └── cmake │ └── generic.cmake ├── src ├── CMakeLists.txt ├── cuda_utils.cc ├── kernels │ ├── flash_attn │ │ └── flash_attn_f32.cu │ ├── gemv.cu │ ├── memory │ │ ├── tile.cu │ │ └── vec.cu │ ├── online_softmax.cu │ ├── reduce.cu │ └── softmax.cu └── torch_bind.cc └── tests ├── .gitignore ├── context.py ├── test_2d_tile_copy.py ├── test_flash_attn_f32.py ├── test_reduce.py ├── test_softmax.py └── test_vec_copy_f32.py /.clang-format: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/.clang-format -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/* 2 | build/* 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/README.md -------------------------------------------------------------------------------- /benchs/python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /benchs/python/bench_flash_attn_f32.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/benchs/python/bench_flash_attn_f32.py -------------------------------------------------------------------------------- /benchs/python/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/benchs/python/context.py -------------------------------------------------------------------------------- /include/common.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/common.hpp -------------------------------------------------------------------------------- /include/config.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/config.hpp -------------------------------------------------------------------------------- /include/cuda_utils.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/cuda_utils.hpp -------------------------------------------------------------------------------- /include/kernels/flash_attn/flash_attn_f32.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/flash_attn/flash_attn_f32.hpp -------------------------------------------------------------------------------- /include/kernels/gemv.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/gemv.hpp -------------------------------------------------------------------------------- /include/kernels/memory/tile.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/memory/tile.hpp -------------------------------------------------------------------------------- /include/kernels/memory/vec.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/memory/vec.hpp -------------------------------------------------------------------------------- /include/kernels/mod.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/mod.hpp -------------------------------------------------------------------------------- /include/kernels/online_softmax.hpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /include/kernels/reduce.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/reduce.hpp -------------------------------------------------------------------------------- /include/kernels/softmax.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/kernels/softmax.hpp -------------------------------------------------------------------------------- /include/memory/mod.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/memory/mod.hpp -------------------------------------------------------------------------------- /include/memory/tile.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/memory/tile.hpp -------------------------------------------------------------------------------- /include/memory/types/register.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/memory/types/register.hpp -------------------------------------------------------------------------------- /include/memory/vec.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/memory/vec.hpp -------------------------------------------------------------------------------- /include/warp/ldmatrix.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/warp/ldmatrix.hpp -------------------------------------------------------------------------------- /include/warp/mma.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/warp/mma.hpp -------------------------------------------------------------------------------- /include/warp/mod.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/warp/mod.hpp -------------------------------------------------------------------------------- /include/warp/reduce.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/include/warp/reduce.hpp -------------------------------------------------------------------------------- /notes/TK/memory/tile.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/TK/memory/tile.md -------------------------------------------------------------------------------- /notes/TK/memory/tile/ldmatrix_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/TK/memory/tile/ldmatrix_layout.png -------------------------------------------------------------------------------- /notes/TK/memory/vec.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/TK/memory/vec.md -------------------------------------------------------------------------------- /notes/flash_attn.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/flash_attn.md -------------------------------------------------------------------------------- /notes/memory/coalescing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/memory/coalescing.md -------------------------------------------------------------------------------- /notes/memory/vec.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/memory/vec.md -------------------------------------------------------------------------------- /notes/warp.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/notes/warp.md -------------------------------------------------------------------------------- /pybindings/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /pybindings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/pybindings/__init__.py -------------------------------------------------------------------------------- /scripts/cmake/generic.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/scripts/cmake/generic.cmake -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/CMakeLists.txt -------------------------------------------------------------------------------- /src/cuda_utils.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/cuda_utils.cc -------------------------------------------------------------------------------- /src/kernels/flash_attn/flash_attn_f32.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/flash_attn/flash_attn_f32.cu -------------------------------------------------------------------------------- /src/kernels/gemv.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/gemv.cu -------------------------------------------------------------------------------- /src/kernels/memory/tile.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/memory/tile.cu -------------------------------------------------------------------------------- /src/kernels/memory/vec.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/memory/vec.cu -------------------------------------------------------------------------------- /src/kernels/online_softmax.cu: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/kernels/reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/reduce.cu -------------------------------------------------------------------------------- /src/kernels/softmax.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/kernels/softmax.cu -------------------------------------------------------------------------------- /src/torch_bind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/src/torch_bind.cc -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/context.py -------------------------------------------------------------------------------- /tests/test_2d_tile_copy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/test_2d_tile_copy.py -------------------------------------------------------------------------------- /tests/test_flash_attn_f32.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/test_flash_attn_f32.py -------------------------------------------------------------------------------- /tests/test_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/test_reduce.py -------------------------------------------------------------------------------- /tests/test_softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/test_softmax.py -------------------------------------------------------------------------------- /tests/test_vec_copy_f32.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KuangjuX/CUDAKernels/HEAD/tests/test_vec_copy_f32.py --------------------------------------------------------------------------------