├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── chunk-attn.png ├── cmake ├── detect_python.cmake └── policy.cmake ├── cpp ├── CMakeLists.txt ├── chunk_attn │ ├── CMakeLists.txt │ ├── attention.cpp │ ├── attention.h │ ├── chunk.cpp │ ├── chunk.h │ ├── chunk_allocator.cpp │ ├── chunk_allocator.h │ ├── chunk_info.h │ ├── cuda_compat.h │ ├── kernel_cpu_mkl.cpp │ ├── kernel_cpu_mkl.h │ ├── kernel_cpu_tls.h │ ├── kernel_cuda.cu │ ├── kernel_cuda.h │ ├── layernorm_kernels.cu │ ├── layernorm_kernels.h │ ├── logging.cpp │ ├── logging.h │ ├── pos_encoding_kernels.cu │ ├── pos_encoding_kernels.h │ ├── python_exports.cpp │ ├── python_exports_ops.cpp │ ├── reduction_utils.cuh │ ├── small_vector.h │ ├── spin_lock.h │ ├── str_utils.cpp │ ├── str_utils.h │ ├── task_executor.cpp │ └── task_executor.h └── tests │ ├── CMakeLists.txt │ ├── test_cpu_kernel.cpp │ └── test_gpu_kernel.cpp ├── pyproject.toml ├── src └── chunk_attn │ ├── __init__.py │ ├── arithmetic │ ├── __init__.py │ ├── dtype.py │ ├── gpt.py │ ├── linear.py │ ├── llama.py │ ├── metrics.py │ ├── module.py │ └── tensor.py │ ├── logger.py │ ├── models │ ├── __init__.py │ ├── llama_hf │ │ ├── __init__.py │ │ ├── casual_attn.py │ │ ├── configuration_llama.py │ │ ├── layernorm.py │ │ ├── layernorm_vllm.py │ │ ├── modeling_llama.py │ │ ├── rotary_embedding.py │ │ └── rotary_embedding_vllm.py │ ├── model_host.py │ └── sequence.py │ └── nvtx.py ├── tests ├── benchmark_attn_pytorch.py ├── benchmark_attn_xformers.py ├── benchmark_chunk_attn.py ├── benchmark_flash_attn.py ├── benchmark_tgi.py ├── benchmark_vllm.py ├── my_llm_engine.py ├── test_arithmetic.py ├── test_arithmetic_llama.py ├── test_chunk_attn_cpu.py ├── test_chunk_attn_cuda.py ├── test_llama.py └── tgi_server_hook.py └── third_party ├── CMakeLists.txt ├── gtest.cmake ├── libtorch.cmake ├── nvtx.cmake ├── onnxruntime.cmake ├── pybind11.cmake └── spdlog.cmake /.clang-format: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/.clang-format -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/.gitignore -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/README.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/SECURITY.md -------------------------------------------------------------------------------- /chunk-attn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/chunk-attn.png -------------------------------------------------------------------------------- /cmake/detect_python.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cmake/detect_python.cmake -------------------------------------------------------------------------------- /cmake/policy.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cmake/policy.cmake -------------------------------------------------------------------------------- /cpp/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/CMakeLists.txt -------------------------------------------------------------------------------- /cpp/chunk_attn/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/CMakeLists.txt -------------------------------------------------------------------------------- /cpp/chunk_attn/attention.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/attention.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/attention.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/attention.h -------------------------------------------------------------------------------- /cpp/chunk_attn/chunk.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/chunk.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/chunk.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/chunk.h -------------------------------------------------------------------------------- /cpp/chunk_attn/chunk_allocator.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/chunk_allocator.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/chunk_allocator.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/chunk_allocator.h -------------------------------------------------------------------------------- /cpp/chunk_attn/chunk_info.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/chunk_info.h -------------------------------------------------------------------------------- /cpp/chunk_attn/cuda_compat.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/cuda_compat.h -------------------------------------------------------------------------------- /cpp/chunk_attn/kernel_cpu_mkl.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/kernel_cpu_mkl.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/kernel_cpu_mkl.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/kernel_cpu_mkl.h -------------------------------------------------------------------------------- /cpp/chunk_attn/kernel_cpu_tls.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/kernel_cpu_tls.h -------------------------------------------------------------------------------- /cpp/chunk_attn/kernel_cuda.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/kernel_cuda.cu -------------------------------------------------------------------------------- /cpp/chunk_attn/kernel_cuda.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/kernel_cuda.h -------------------------------------------------------------------------------- /cpp/chunk_attn/layernorm_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/layernorm_kernels.cu -------------------------------------------------------------------------------- /cpp/chunk_attn/layernorm_kernels.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/layernorm_kernels.h -------------------------------------------------------------------------------- /cpp/chunk_attn/logging.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/logging.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/logging.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/logging.h -------------------------------------------------------------------------------- /cpp/chunk_attn/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/pos_encoding_kernels.cu -------------------------------------------------------------------------------- /cpp/chunk_attn/pos_encoding_kernels.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/pos_encoding_kernels.h -------------------------------------------------------------------------------- /cpp/chunk_attn/python_exports.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/python_exports.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/python_exports_ops.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/python_exports_ops.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/reduction_utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/reduction_utils.cuh -------------------------------------------------------------------------------- /cpp/chunk_attn/small_vector.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/small_vector.h -------------------------------------------------------------------------------- /cpp/chunk_attn/spin_lock.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/spin_lock.h -------------------------------------------------------------------------------- /cpp/chunk_attn/str_utils.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/str_utils.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/str_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/str_utils.h -------------------------------------------------------------------------------- /cpp/chunk_attn/task_executor.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/task_executor.cpp -------------------------------------------------------------------------------- /cpp/chunk_attn/task_executor.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/chunk_attn/task_executor.h -------------------------------------------------------------------------------- /cpp/tests/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/tests/CMakeLists.txt -------------------------------------------------------------------------------- /cpp/tests/test_cpu_kernel.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/tests/test_cpu_kernel.cpp -------------------------------------------------------------------------------- /cpp/tests/test_gpu_kernel.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/cpp/tests/test_gpu_kernel.cpp -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/chunk_attn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/__init__.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/__init__.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/dtype.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/dtype.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/gpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/gpt.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/linear.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/llama.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/metrics.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/module.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/module.py -------------------------------------------------------------------------------- /src/chunk_attn/arithmetic/tensor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/arithmetic/tensor.py -------------------------------------------------------------------------------- /src/chunk_attn/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/logger.py -------------------------------------------------------------------------------- /src/chunk_attn/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/__init__.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/__init__.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/casual_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/casual_attn.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/configuration_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/configuration_llama.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/layernorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/layernorm.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/layernorm_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/layernorm_vllm.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/modeling_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/modeling_llama.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/rotary_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/rotary_embedding.py -------------------------------------------------------------------------------- /src/chunk_attn/models/llama_hf/rotary_embedding_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/llama_hf/rotary_embedding_vllm.py -------------------------------------------------------------------------------- /src/chunk_attn/models/model_host.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/model_host.py -------------------------------------------------------------------------------- /src/chunk_attn/models/sequence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/models/sequence.py -------------------------------------------------------------------------------- /src/chunk_attn/nvtx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/src/chunk_attn/nvtx.py -------------------------------------------------------------------------------- /tests/benchmark_attn_pytorch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_attn_pytorch.py -------------------------------------------------------------------------------- /tests/benchmark_attn_xformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_attn_xformers.py -------------------------------------------------------------------------------- /tests/benchmark_chunk_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_chunk_attn.py -------------------------------------------------------------------------------- /tests/benchmark_flash_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_flash_attn.py -------------------------------------------------------------------------------- /tests/benchmark_tgi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_tgi.py -------------------------------------------------------------------------------- /tests/benchmark_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/benchmark_vllm.py -------------------------------------------------------------------------------- /tests/my_llm_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/my_llm_engine.py -------------------------------------------------------------------------------- /tests/test_arithmetic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/test_arithmetic.py -------------------------------------------------------------------------------- /tests/test_arithmetic_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/test_arithmetic_llama.py -------------------------------------------------------------------------------- /tests/test_chunk_attn_cpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/test_chunk_attn_cpu.py -------------------------------------------------------------------------------- /tests/test_chunk_attn_cuda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/test_chunk_attn_cuda.py -------------------------------------------------------------------------------- /tests/test_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/test_llama.py -------------------------------------------------------------------------------- /tests/tgi_server_hook.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/tests/tgi_server_hook.py -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/CMakeLists.txt -------------------------------------------------------------------------------- /third_party/gtest.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/gtest.cmake -------------------------------------------------------------------------------- /third_party/libtorch.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/libtorch.cmake -------------------------------------------------------------------------------- /third_party/nvtx.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/nvtx.cmake -------------------------------------------------------------------------------- /third_party/onnxruntime.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/onnxruntime.cmake -------------------------------------------------------------------------------- /third_party/pybind11.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/pybind11.cmake -------------------------------------------------------------------------------- /third_party/spdlog.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/chunk-attention/HEAD/third_party/spdlog.cmake --------------------------------------------------------------------------------