├── LICENSE
├── README.md
├── attention-mechanism
│   ├── contextual-position-encoding--learning-to-count-what-s-important
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model
│   │   ├── README.md
│   │   ├── fig.2.jpg
│   │   ├── fig.3.jpg
│   │   └── table.1.jpg
│   ├── efficient-streaming-language-models-with-attention-sinks
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness
│   │   ├── README.md
│   │   ├── algo.1.jpg
│   │   └── fig.1.jpg
│   ├── flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning
│   │   ├── README.md
│   │   ├── algo.1.jpg
│   │   ├── fig.2.jpg
│   │   └── table.1.jpg
│   ├── gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.4.jpg
│   ├── lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── longformer--the-long-document-transformer
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── lora--low-rank-adaptation-of-large-language-models
│   │   ├── README.md
│   │   └── fig.1.jpg
│   ├── mistral-7b
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.2.jpg
│   │   └── fig.3.jpg
│   ├── multi-matrix-factorization-attention
│   │   ├── README.md
│   │   ├── fig.2.jpg
│   │   └── table.1.jpg
│   ├── roformer--enhanced-transformer-with-rotary-position-embedding
│   │   ├── 3.2.2.jpg
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── fig.2.jpg
│   ├── tensor-product-attention-is-all-you-need
│   │   ├── README.md
│   │   └── fig.1.jpg
│   └── train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation
│       ├── README.md
│       └── fig.3.jpg
├── continuous-batching
│   ├── orca--a-distributed-serving-system-for-transformer-based-generative-models
│   │   ├── EuroSys18.fig.5.jpg
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.4.jpg
│   │   ├── fig.5.jpg
│   │   └── fig.8.jpg
│   └── sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.5.jpg
│       ├── fig.6.jpg
│       └── table.2.jpg
├── kv-cache
│   └── efficient-memory-management-for-large-language-model-serving-with-pagedattention
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.3.jpg
│       ├── fig.6.jpg
│       └── fig.7.jpg
├── more
│   ├── block-transformer--global-to-local-language-modeling-for-fast-inference
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   └── table.1.jpg
│   ├── lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.3.jpg
│   │   └── fig.4.jpg
│   └── learning-to--learn-at-test-time---rnns-with-expressive-hidden-states
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.2.jpg
│       ├── fig.3.jpg
│       └── fig.4.jpg
├── quantization
│   ├── awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration
│   │   ├── README.md
│   │   ├── err.jpg
│   │   └── fig.2.jpg
│   ├── llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale
│   │   ├── 3.2.jpg
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── onebit--towards-extremely-low-bit-large-language-models
│   │   ├── README.md
│   │   └── fig.2.jpg
│   ├── smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.4.jpg
│   │   └── fig.5.jpg
│   └── the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits
│       ├── README.md
│       └── fig.1.jpg
├── review
│   ├── a-survey-on-efficient-inference-for-large-language-models
│   │   ├── README.md
│   │   ├── fig.1.jpg
│   │   ├── fig.14.jpg
│   │   ├── fig.17.jpg
│   │   ├── fig.2.jpg
│   │   ├── fig.4.jpg
│   │   ├── fig.7.jpg
│   │   ├── fig.8.jpg
│   │   ├── fig.9.jpg
│   │   ├── table.3.jpg
│   │   └── table.6.jpg
│   └── towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems
│       ├── README.md
│       ├── fig.1.jpg
│       ├── fig.2.jpg
│       ├── table.1.jpg
│       └── table.2.jpg
└── template
    └── README.md

/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/README.md
--------------------------------------------------------------------------------
/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/README.md
--------------------------------------------------------------------------------
/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/contextual-position-encoding--learning-to-count-what-s-important/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/README.md
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/fig.3.jpg
--------------------------------------------------------------------------------
/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/deepseek-v2--a-strong--economical--and-efficient-mixture-of-experts-language-model/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/README.md
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/efficient-streaming-language-models-with-attention-sinks/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/README.md
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/algo.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/algo.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention--fast-and-memory-efficient-exact-attention-with-io-awareness/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/algo.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/algo.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/flashattention-2--faster-attention-with-better-parallelism-and-work-partitioning/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/README.md
--------------------------------------------------------------------------------
/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/gqa--training-generalized-multi-query-transformer-models-from-multi-head-checkpoints/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/hydralora--an-asymmetric-lora-architecture-for-efficient-fine-tuning/fig.4.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/README.md
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lisa--layerwise-importance-sampling-for-memory-efficient-large-language-model-fine-tuning/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/longformer--the-long-document-transformer/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/longformer--the-long-document-transformer/README.md
--------------------------------------------------------------------------------
/attention-mechanism/longformer--the-long-document-transformer/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/longformer--the-long-document-transformer/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/README.md
--------------------------------------------------------------------------------
/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/lora--low-rank-adaptation-of-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/README.md
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/mistral-7b/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/mistral-7b/fig.3.jpg
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/README.md
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/multi-matrix-factorization-attention/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/multi-matrix-factorization-attention/table.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/3.2.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/3.2.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/README.md
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/roformer--enhanced-transformer-with-rotary-position-embedding/fig.2.jpg
--------------------------------------------------------------------------------
/attention-mechanism/tensor-product-attention-is-all-you-need/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/tensor-product-attention-is-all-you-need/README.md
--------------------------------------------------------------------------------
/attention-mechanism/tensor-product-attention-is-all-you-need/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/tensor-product-attention-is-all-you-need/fig.1.jpg
--------------------------------------------------------------------------------
/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/README.md
--------------------------------------------------------------------------------
/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/attention-mechanism/train-short--test-long--attention-with-linear-biases-enables-input-length-extrapolation/fig.3.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/EuroSys18.fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/EuroSys18.fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/README.md
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.1.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.4.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/orca--a-distributed-serving-system-for-transformer-based-generative-models/fig.8.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/README.md
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.1.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.5.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/fig.6.jpg
--------------------------------------------------------------------------------
/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/table.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/continuous-batching/sarathi--efficient-llm-inference-by-piggybacking-decodes-with-chunked-prefills/table.2.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/README.md
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.1.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.3.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.6.jpg
--------------------------------------------------------------------------------
/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/kv-cache/efficient-memory-management-for-large-language-model-serving-with-pagedattention/fig.7.jpg
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/README.md
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/fig.1.jpg
--------------------------------------------------------------------------------
/more/block-transformer--global-to-local-language-modeling-for-fast-inference/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/block-transformer--global-to-local-language-modeling-for-fast-inference/table.1.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/README.md
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.1.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.3.jpg
--------------------------------------------------------------------------------
/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/lazyllm--dynamic-token-pruning-for-efficient-long-context-llm-inference/fig.4.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/README.md
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.1.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.2.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.3.jpg
--------------------------------------------------------------------------------
/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/more/learning-to--learn-at-test-time---rnns-with-expressive-hidden-states/fig.4.jpg
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/README.md
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/err.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/err.jpg
--------------------------------------------------------------------------------
/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/awq--activation-aware-weight-quantization-for-llm-compression-and-acceleration/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/3.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/3.2.jpg
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/README.md
--------------------------------------------------------------------------------
/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/llm.int8----8-bit-matrix-multiplication-for-transformers-at-scale/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/onebit--towards-extremely-low-bit-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/onebit--towards-extremely-low-bit-large-language-models/README.md
--------------------------------------------------------------------------------
/quantization/onebit--towards-extremely-low-bit-large-language-models/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/onebit--towards-extremely-low-bit-large-language-models/fig.2.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/README.md
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.4.jpg
--------------------------------------------------------------------------------
/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/smoothquant--accurate-and-efficient-post-training-quantization-for-large-language-models/fig.5.jpg
--------------------------------------------------------------------------------
/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/README.md
--------------------------------------------------------------------------------
/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/quantization/the-era-of-1-bit-llms--all-large-language-models-are-in-1.58-bits/fig.1.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/README.md
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.1.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.14.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.17.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.2.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.4.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.7.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.8.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/fig.9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/fig.9.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/table.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/table.3.jpg
--------------------------------------------------------------------------------
/review/a-survey-on-efficient-inference-for-large-language-models/table.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/a-survey-on-efficient-inference-for-large-language-models/table.6.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/README.md
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.1.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/fig.2.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.1.jpg
--------------------------------------------------------------------------------
/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/review/towards-efficient-generative-large-language-model-serving--a-survey-from-algorithms-to-systems/table.2.jpg
--------------------------------------------------------------------------------
/template/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD/template/README.md
--------------------------------------------------------------------------------
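Every entry above follows the same pattern: a repo-relative path maps to its raw file URL by prefixing https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD. A minimal Python sketch of that mapping, using only the standard library; the helper names raw_url and fetch are illustrative, not part of the repository:

    import urllib.request

    # Shared prefix of every raw link in the listing above.
    BASE = "https://raw.githubusercontent.com/shishishu/LLM-Inference-Acceleration/HEAD"

    def raw_url(repo_path: str) -> str:
        """Map a repo-relative path from the tree to its raw URL."""
        return f"{BASE}/{repo_path.lstrip('/')}"

    def fetch(repo_path: str) -> bytes:
        """Download one file's bytes (README markdown or a figure JPEG)."""
        with urllib.request.urlopen(raw_url(repo_path)) as resp:
            return resp.read()

    if __name__ == "__main__":
        # e.g. print the start of the note template
        print(fetch("/template/README.md").decode("utf-8")[:200])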