├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── assets ├── figures │ ├── demo.gif │ ├── fig-examples.png │ ├── fig-kernel-bench.png │ ├── fig-teaser.png │ └── fig_e2e.png ├── quest_paper.pdf ├── quest_poster.pdf └── quest_slides.pdf ├── evaluation ├── LongBench │ ├── config │ │ ├── dataset2maxlen.json │ │ ├── dataset2prompt.json │ │ ├── model2maxlen.json │ │ └── model2path.json │ ├── eval.py │ ├── metrics.py │ └── pred.py ├── __init__.py ├── llama.py ├── mistral.py ├── passkey │ └── passkey.py ├── pg19 │ └── ppl_eval.py └── quest_attention.py ├── kernels ├── CMakeLists.txt ├── cmake │ ├── fetch_rapids.cmake │ └── get_raft.cmake ├── include │ ├── decode │ │ ├── decode_attn.cuh │ │ ├── decode_handler.cuh │ │ └── decode_page.cuh │ ├── prefill │ │ └── prefill.cuh │ └── topk │ │ └── decode_select_k.cuh └── src │ ├── bench │ ├── bench_batch_decode.cu │ ├── bench_decode_select_k.cu │ ├── bench_max_possible.cu │ ├── bench_page.cu │ └── bench_prefill.cu │ ├── include │ ├── cpu_reference.h │ └── cpu_utils.h │ └── test │ ├── test_batch_decode.cu │ ├── test_max_possible.cu │ ├── test_page.cu │ └── test_prefill.cu ├── pyproject.toml ├── quest ├── __init__.py ├── models │ ├── QuestAttention.py │ ├── __init__.py │ └── llama.py ├── ops │ ├── CMakeLists.txt │ ├── cmake │ │ ├── fetch_rapids.cmake │ │ └── get_raft.cmake │ ├── csrc │ │ ├── approx_attn.cu │ │ ├── batch_prefill.cu │ │ ├── bsk_ops.cu │ │ ├── bsk_ops.h │ │ ├── estimate.cu │ │ ├── page.cu │ │ ├── pytorch_extension_utils.h │ │ ├── rms_norm.cu │ │ └── topk.cu │ └── setup.sh ├── tests │ ├── test_approx_attention.py │ ├── test_decode_attention.py │ ├── test_estimate.py │ ├── test_prefill_attention.py │ ├── test_rope.py │ └── test_topk.py └── utils │ ├── __init__.py │ ├── controller.py │ ├── decode_wrapper.py │ ├── kv_cache.py │ └── utils.py └── scripts ├── bench_efficiency_e2e.sh ├── bench_kernels.sh ├── bench_textgen.py ├── example_demo.py ├── example_textgen.py ├── longbench.sh ├── passkey.sh ├── ppl_eval.sh └── profile_textgen.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/.gitmodules -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/README.md -------------------------------------------------------------------------------- /assets/figures/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/figures/demo.gif -------------------------------------------------------------------------------- /assets/figures/fig-examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/figures/fig-examples.png -------------------------------------------------------------------------------- /assets/figures/fig-kernel-bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/figures/fig-kernel-bench.png -------------------------------------------------------------------------------- /assets/figures/fig-teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/figures/fig-teaser.png -------------------------------------------------------------------------------- /assets/figures/fig_e2e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/figures/fig_e2e.png -------------------------------------------------------------------------------- /assets/quest_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/quest_paper.pdf -------------------------------------------------------------------------------- /assets/quest_poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/quest_poster.pdf -------------------------------------------------------------------------------- /assets/quest_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/assets/quest_slides.pdf -------------------------------------------------------------------------------- /evaluation/LongBench/config/dataset2maxlen.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/config/dataset2maxlen.json -------------------------------------------------------------------------------- /evaluation/LongBench/config/dataset2prompt.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/config/dataset2prompt.json -------------------------------------------------------------------------------- /evaluation/LongBench/config/model2maxlen.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/config/model2maxlen.json -------------------------------------------------------------------------------- /evaluation/LongBench/config/model2path.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/config/model2path.json -------------------------------------------------------------------------------- /evaluation/LongBench/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/eval.py -------------------------------------------------------------------------------- /evaluation/LongBench/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/metrics.py -------------------------------------------------------------------------------- /evaluation/LongBench/pred.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/LongBench/pred.py -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/llama.py -------------------------------------------------------------------------------- /evaluation/mistral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/mistral.py -------------------------------------------------------------------------------- /evaluation/passkey/passkey.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/passkey/passkey.py -------------------------------------------------------------------------------- /evaluation/pg19/ppl_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/pg19/ppl_eval.py -------------------------------------------------------------------------------- /evaluation/quest_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/evaluation/quest_attention.py -------------------------------------------------------------------------------- /kernels/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/CMakeLists.txt -------------------------------------------------------------------------------- /kernels/cmake/fetch_rapids.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/cmake/fetch_rapids.cmake -------------------------------------------------------------------------------- /kernels/cmake/get_raft.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/cmake/get_raft.cmake -------------------------------------------------------------------------------- /kernels/include/decode/decode_attn.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/include/decode/decode_attn.cuh -------------------------------------------------------------------------------- /kernels/include/decode/decode_handler.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/include/decode/decode_handler.cuh -------------------------------------------------------------------------------- /kernels/include/decode/decode_page.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/include/decode/decode_page.cuh -------------------------------------------------------------------------------- /kernels/include/prefill/prefill.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/include/prefill/prefill.cuh -------------------------------------------------------------------------------- /kernels/include/topk/decode_select_k.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/include/topk/decode_select_k.cuh -------------------------------------------------------------------------------- /kernels/src/bench/bench_batch_decode.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/bench/bench_batch_decode.cu -------------------------------------------------------------------------------- /kernels/src/bench/bench_decode_select_k.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/bench/bench_decode_select_k.cu -------------------------------------------------------------------------------- /kernels/src/bench/bench_max_possible.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/bench/bench_max_possible.cu -------------------------------------------------------------------------------- /kernels/src/bench/bench_page.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/bench/bench_page.cu -------------------------------------------------------------------------------- /kernels/src/bench/bench_prefill.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/bench/bench_prefill.cu -------------------------------------------------------------------------------- /kernels/src/include/cpu_reference.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/include/cpu_reference.h -------------------------------------------------------------------------------- /kernels/src/include/cpu_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/include/cpu_utils.h -------------------------------------------------------------------------------- /kernels/src/test/test_batch_decode.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/test/test_batch_decode.cu -------------------------------------------------------------------------------- /kernels/src/test/test_max_possible.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/test/test_max_possible.cu -------------------------------------------------------------------------------- /kernels/src/test/test_page.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/test/test_page.cu -------------------------------------------------------------------------------- /kernels/src/test/test_prefill.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/kernels/src/test/test_prefill.cu -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/pyproject.toml -------------------------------------------------------------------------------- /quest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/__init__.py -------------------------------------------------------------------------------- /quest/models/QuestAttention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/models/QuestAttention.py -------------------------------------------------------------------------------- /quest/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/models/__init__.py -------------------------------------------------------------------------------- /quest/models/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/models/llama.py -------------------------------------------------------------------------------- /quest/ops/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/CMakeLists.txt -------------------------------------------------------------------------------- /quest/ops/cmake/fetch_rapids.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/cmake/fetch_rapids.cmake -------------------------------------------------------------------------------- /quest/ops/cmake/get_raft.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/cmake/get_raft.cmake -------------------------------------------------------------------------------- /quest/ops/csrc/approx_attn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/approx_attn.cu -------------------------------------------------------------------------------- /quest/ops/csrc/batch_prefill.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/batch_prefill.cu -------------------------------------------------------------------------------- /quest/ops/csrc/bsk_ops.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/bsk_ops.cu -------------------------------------------------------------------------------- /quest/ops/csrc/bsk_ops.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/bsk_ops.h -------------------------------------------------------------------------------- /quest/ops/csrc/estimate.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/estimate.cu -------------------------------------------------------------------------------- /quest/ops/csrc/page.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/page.cu -------------------------------------------------------------------------------- /quest/ops/csrc/pytorch_extension_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/pytorch_extension_utils.h -------------------------------------------------------------------------------- /quest/ops/csrc/rms_norm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/rms_norm.cu -------------------------------------------------------------------------------- /quest/ops/csrc/topk.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/csrc/topk.cu -------------------------------------------------------------------------------- /quest/ops/setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/ops/setup.sh -------------------------------------------------------------------------------- /quest/tests/test_approx_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_approx_attention.py -------------------------------------------------------------------------------- /quest/tests/test_decode_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_decode_attention.py -------------------------------------------------------------------------------- /quest/tests/test_estimate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_estimate.py -------------------------------------------------------------------------------- /quest/tests/test_prefill_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_prefill_attention.py -------------------------------------------------------------------------------- /quest/tests/test_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_rope.py -------------------------------------------------------------------------------- /quest/tests/test_topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/tests/test_topk.py -------------------------------------------------------------------------------- /quest/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/utils/__init__.py -------------------------------------------------------------------------------- /quest/utils/controller.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/utils/controller.py -------------------------------------------------------------------------------- /quest/utils/decode_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/utils/decode_wrapper.py -------------------------------------------------------------------------------- /quest/utils/kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/utils/kv_cache.py -------------------------------------------------------------------------------- /quest/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/quest/utils/utils.py -------------------------------------------------------------------------------- /scripts/bench_efficiency_e2e.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/bench_efficiency_e2e.sh -------------------------------------------------------------------------------- /scripts/bench_kernels.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/bench_kernels.sh -------------------------------------------------------------------------------- /scripts/bench_textgen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/bench_textgen.py -------------------------------------------------------------------------------- /scripts/example_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/example_demo.py -------------------------------------------------------------------------------- /scripts/example_textgen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/example_textgen.py -------------------------------------------------------------------------------- /scripts/longbench.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/longbench.sh -------------------------------------------------------------------------------- /scripts/passkey.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/passkey.sh -------------------------------------------------------------------------------- /scripts/ppl_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/ppl_eval.sh -------------------------------------------------------------------------------- /scripts/profile_textgen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mit-han-lab/Quest/HEAD/scripts/profile_textgen.py --------------------------------------------------------------------------------