├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── benchmarks ├── hadamard_benchmark.py ├── qattention_benchmark.py └── qlinear_benchmark.py ├── e2e ├── __init__.py ├── benchmark.py ├── benchmark_layer.py ├── checkpoint_utils │ ├── __init__.py │ ├── data_utils.py │ ├── gptq_utils.py │ ├── quantize_llama_checkpoint.py │ └── rotation_utils.py └── quantized_llama │ ├── __init__.py │ └── modeling_llama.py ├── fake_quant ├── README.md ├── data_utils.py ├── eval_utils.py ├── gptq_utils.py ├── hadamard_utils.py ├── main.py ├── model_utils.py ├── monkeypatch.py ├── quant_utils.py ├── rotation_utils.py └── utils.py ├── img ├── carrot.png └── fig1.png ├── quarot ├── __init__.py ├── functional │ ├── __init__.py │ ├── hadamard.py │ └── quantization.py ├── kernels │ ├── bindings.cpp │ ├── flashinfer.cu │ ├── gemm.cu │ ├── include │ │ ├── common.h │ │ ├── flashinfer.h │ │ ├── flashinfer │ │ │ ├── cp_async.cuh │ │ │ ├── decode.cuh │ │ │ ├── layout.cuh │ │ │ ├── math.cuh │ │ │ ├── mma.cuh │ │ │ ├── page.cuh │ │ │ ├── permuted_smem.cuh │ │ │ ├── prefill.cuh │ │ │ ├── quantization.cuh │ │ │ ├── rope.cuh │ │ │ ├── state.cuh │ │ │ ├── utils.cuh │ │ │ └── vec_dtypes.cuh │ │ ├── gemm.h │ │ ├── int4.h │ │ ├── quant.h │ │ └── util.h │ └── quant.cu ├── nn │ ├── __init__.py │ ├── hadamard.py │ ├── linear.py │ ├── normalization.py │ └── quantization.py └── transformers │ ├── __init__.py │ └── kv_cache.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/.gitmodules -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/README.md -------------------------------------------------------------------------------- /benchmarks/hadamard_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/benchmarks/hadamard_benchmark.py -------------------------------------------------------------------------------- /benchmarks/qattention_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/benchmarks/qattention_benchmark.py -------------------------------------------------------------------------------- /benchmarks/qlinear_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/benchmarks/qlinear_benchmark.py -------------------------------------------------------------------------------- /e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/__init__.py -------------------------------------------------------------------------------- /e2e/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/benchmark.py -------------------------------------------------------------------------------- /e2e/benchmark_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/benchmark_layer.py -------------------------------------------------------------------------------- /e2e/checkpoint_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e2e/checkpoint_utils/data_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/checkpoint_utils/data_utils.py -------------------------------------------------------------------------------- /e2e/checkpoint_utils/gptq_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/checkpoint_utils/gptq_utils.py -------------------------------------------------------------------------------- /e2e/checkpoint_utils/quantize_llama_checkpoint.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/checkpoint_utils/quantize_llama_checkpoint.py -------------------------------------------------------------------------------- /e2e/checkpoint_utils/rotation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/checkpoint_utils/rotation_utils.py -------------------------------------------------------------------------------- /e2e/quantized_llama/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e2e/quantized_llama/modeling_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/e2e/quantized_llama/modeling_llama.py -------------------------------------------------------------------------------- /fake_quant/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/README.md -------------------------------------------------------------------------------- /fake_quant/data_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/data_utils.py -------------------------------------------------------------------------------- /fake_quant/eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/eval_utils.py -------------------------------------------------------------------------------- /fake_quant/gptq_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/gptq_utils.py -------------------------------------------------------------------------------- /fake_quant/hadamard_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/hadamard_utils.py -------------------------------------------------------------------------------- /fake_quant/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/main.py -------------------------------------------------------------------------------- /fake_quant/model_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/model_utils.py -------------------------------------------------------------------------------- /fake_quant/monkeypatch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/monkeypatch.py -------------------------------------------------------------------------------- /fake_quant/quant_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/quant_utils.py -------------------------------------------------------------------------------- /fake_quant/rotation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/rotation_utils.py -------------------------------------------------------------------------------- /fake_quant/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/fake_quant/utils.py -------------------------------------------------------------------------------- /img/carrot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/img/carrot.png -------------------------------------------------------------------------------- /img/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/img/fig1.png -------------------------------------------------------------------------------- /quarot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/__init__.py -------------------------------------------------------------------------------- /quarot/functional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/functional/__init__.py -------------------------------------------------------------------------------- /quarot/functional/hadamard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/functional/hadamard.py -------------------------------------------------------------------------------- /quarot/functional/quantization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/functional/quantization.py -------------------------------------------------------------------------------- /quarot/kernels/bindings.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/bindings.cpp -------------------------------------------------------------------------------- /quarot/kernels/flashinfer.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/flashinfer.cu -------------------------------------------------------------------------------- /quarot/kernels/gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/gemm.cu -------------------------------------------------------------------------------- /quarot/kernels/include/common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/common.h -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer.h -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/cp_async.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/cp_async.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/decode.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/decode.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/layout.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/layout.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/math.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/math.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/mma.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/mma.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/page.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/page.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/permuted_smem.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/permuted_smem.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/prefill.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/prefill.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/quantization.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/quantization.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/rope.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/rope.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/state.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/state.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/utils.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/flashinfer/vec_dtypes.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/flashinfer/vec_dtypes.cuh -------------------------------------------------------------------------------- /quarot/kernels/include/gemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/gemm.h -------------------------------------------------------------------------------- /quarot/kernels/include/int4.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/int4.h -------------------------------------------------------------------------------- /quarot/kernels/include/quant.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/quant.h -------------------------------------------------------------------------------- /quarot/kernels/include/util.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/include/util.h -------------------------------------------------------------------------------- /quarot/kernels/quant.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/kernels/quant.cu -------------------------------------------------------------------------------- /quarot/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/nn/__init__.py -------------------------------------------------------------------------------- /quarot/nn/hadamard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/nn/hadamard.py -------------------------------------------------------------------------------- /quarot/nn/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/nn/linear.py -------------------------------------------------------------------------------- /quarot/nn/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/nn/normalization.py -------------------------------------------------------------------------------- /quarot/nn/quantization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/nn/quantization.py -------------------------------------------------------------------------------- /quarot/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .kv_cache import MultiLayerPagedKVCache4Bit 2 | -------------------------------------------------------------------------------- /quarot/transformers/kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/quarot/transformers/kv_cache.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/QuaRot/HEAD/setup.py --------------------------------------------------------------------------------