├── LICENSE
├── README.md
├── attention-mechanism
│   ├── contextual-position-encoding--learning-to-count-what-s-important
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model
│   │   ├── README.md
│   │   ├── fig.2.jpg
│   │   ├── fig.3.jpg
│   │   └── table.1.jpg
│   ├── efficient-streaming-language-models-with-attention-sinks
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness
│   │   ├── README.md
│   │   ├── algo.1.jpg
│   │   └── fig.1.jpg
│   ├── flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning
│   │   ├── README.md
│   │   ├── algo.1.jpg
│   │   ├── fig.2.jpg
│   │   └── table.1.jpg
│   ├── gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.4.jpg
│   ├── lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── longformer--the-long-document-transformer
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── lora--low-rank-adaptation-of-large-language-models
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── mistral-7b
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.2.jpg
│   │   └── fig.3.jpg
│   ├── multi-matrix-factorization-attention
│   │   ├── README.md
│   │   ├── fig.2.jpg
│   │   └── table.1.jpg
│   ├── roformer--enhanced-transformer-with-rotary-position-embedding
│   │   ├── 3.2.2.jpg
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── tensor-product-attention-is-all-you-need
│   │   ├── README.md
│   │   └── fig.1.jpg
│   └── train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation
│       ├── README.md
│       └── fig.3.jpg
├── continuous-batching
│   ├── orca--a-distributed-serving-system-for-transformer-based-generative-models
│   │   ├── EuroSys18.fig.5.jpg
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.4.jpg
│   │   ├── fig.5.jpg
│   │   └── fig.8.jpg
│   └── sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.5.jpg
│       ├── fig.6.jpg
│       └── table.2.jpg
├── kv-cache
│   └── efficient-memory-management-for-large-language-model-serving-with-pagedattention
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.3.jpg
│       ├── fig.6.jpg
│       └── fig.7.jpg
├── more
│   ├── block-transformer--global-to-local-language-modeling-for-fast-inference
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── table.1.jpg
│   ├── lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.3.jpg
│   │   └── fig.4.jpg
│   └── learning-to--learn-at-test-time---rnns-with-expressive-hidden-states
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.2.jpg
│       ├── fig.3.jpg
│       └── fig.4.jpg
├── quantization
│   ├── awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration
│   │   ├── README.md
│   │   ├── err.jpg
│   │   └── fig.2.jpg
│   ├── llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale
│   │   ├── 3.2.jpg
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── onebit--towards-extremely-low-bit-large-language-models
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.4.jpg
│   │   └── fig.5.jpg
│   └── the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits
│       ├── README.md
│       └── fig.1.jpg
├── review
│   ├── a-survey-on-efficient-inference-for-large-language-models
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.14.jpg
│   │   ├── fig.17.jpg
│   │   ├── fig.2.jpg
│   │   ├── fig.4.jpg
│   │   ├── fig.7.jpg
│   │   ├── fig.8.jpg
│   │   ├── fig.9.jpg
│   │   ├── table.3.jpg
│   │   └── table.6.jpg
│   └── towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.2.jpg
│       ├── table.1.jpg
│       └── table.2.jpg
└── template
    └── README.md

/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/README.md
--------------------------------------------------------------------------------
/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/README.md
--------------------------------------------------------------------------------
/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/README.md
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.3.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/README.md
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/README.md
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/algo.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/algo.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/algo.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/algo.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/README.md
--------------------------------------------------------------------------------
/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.4.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/longformer--the-long-document-transformer/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/longformer--the-long-document-transformer/README.md
--------------------------------------------------------------------------------
/attention-mechanism/longformer--the-long-document-transformer/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/longformer--the-long-document-transformer/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/README.md
--------------------------------------------------------------------------------
/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/README.md
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.3.jpg
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/README.md
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/3.2.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/3.2.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/README.md
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/tensor-product-attention-is-all-you-need/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/tensor-product-attention-is-all-you-need/README.md
--------------------------------------------------------------------------------
/attention-mechanism/tensor-product-attention-is-all-you-need/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/tensor-product-attention-is-all-you-need/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/README.md
--------------------------------------------------------------------------------
/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/fig.3.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/EuroSys18.fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/EuroSys18.fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/README.md
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.1.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.4.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.8.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/README.md
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.1.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.6.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/table.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/table.2.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/README.md
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.1.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.3.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.6.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.7.jpg
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/README.md
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/fig.1.jpg
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/table.1.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/README.md
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.1.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.3.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.4.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/README.md
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.1.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.2.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.3.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.4.jpg
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/README.md
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/err.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/err.jpg
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/3.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/3.2.jpg
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/README.md
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/onebit--towards-extremely-low-bit-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/onebit--towards-extremely-low-bit-large-language-models/README.md
--------------------------------------------------------------------------------
/quantization/onebit--towards-extremely-low-bit-large-language-models/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/onebit--towards-extremely-low-bit-large-language-models/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/README.md
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.4.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.5.jpg
--------------------------------------------------------------------------------
/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/README.md
--------------------------------------------------------------------------------
/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/fig.1.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/README.md
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.14.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.17.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.2.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.4.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.7.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.8.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.9.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/table.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/table.3.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/table.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/table.6.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/README.md
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.1.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.2.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.1.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.2.jpg
--------------------------------------------------------------------------------
/template/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/template/README.md
--------------------------------------------------------------------------------
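Every entry above follows the same pattern: a repo-relative path maps to its raw file URL by prefixing https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD. A minimal Python sketch of that mapping, using only the standard library; the helper names raw_url and fetch are illustrative, not part of the repository:

    import urllib.request

    # Shared prefix of every raw link in the listing above.
    BASE = "https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD"

    def raw_url(repo_path: str) -> str:
        """Map a repo-relative path from the tree to its raw URL."""
        return f"{BASE}/{repo_path.lstrip('/')}"

    def fetch(repo_path: str) -> bytes:
        """Download one file's bytes (README markdown or a figure JPEG)."""
        with urllib.request.urlopen(raw_url(repo_path)) as resp:
            return resp.read()

    if __name__ == "__main__":
        # e.g. print the start of the note template
        print(fetch("/template/README.md").decode("utf-8")[:200])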