├── .gitignore ├── CITATION.cff ├── README.md ├── README.zh.md ├── apply_weight_convert.py ├── cli.py ├── cli_llava.py ├── docs ├── LlamaForCausalLM.md ├── LlavaForConditionalGeneration.md ├── LlavaNextForConditionalGeneration.md ├── Qwen2ForCausalLM.md ├── benchamrk_kernels.md ├── benchmark.md ├── benchmark_models.md ├── benchmark_models_history.md └── performance_optimization.md ├── examples ├── benchmark.py ├── evaluator │ ├── __init__.py │ └── eval.py ├── example_chat.py ├── example_eval_acc.py └── example_llava.py ├── generate.py ├── images ├── acc_test.jpg ├── anwser.png ├── benchamrk_result │ ├── fused-attention-batch4-head32-d64-fwd-causal=False.png │ ├── fused-attention-batch8-head64-d64-fwd-causal=False.png │ ├── layer-norm-forward.csv │ ├── layer-norm-forward.png │ ├── matmul-performance-fp16.csv │ ├── matmul-performance-fp16.png │ ├── matmul-performance-fp8.csv │ ├── matmul-performance-fp8.png │ ├── mlp-silu-performance.csv │ ├── mlp-silu-performance.png │ ├── mlp-silu-performance_ret.png │ ├── result.png │ ├── results.html │ ├── rms-norm-forward.csv │ ├── rms-norm-forward.png │ ├── skip_rmsnorm_benchmark.png │ ├── softmax-performance.csv │ ├── softmax-performance.png │ ├── te_benchmark.png │ └── token_embedding_benchmark.png ├── cli_stream.png ├── flashattention_nopad_benchamrk.png ├── flashattentionv2_nopad_benchamrk.png ├── flashattentionv2_nopad_benchamrk2.png ├── flashdecoding_benchamrk.png ├── generate.gif ├── generate_stream.png ├── llava_output.gif ├── llava_output1.gif ├── llava_output2.gif ├── llava_output3.gif ├── llava_test │ ├── WechatIMG205.jpg │ ├── dog.jpeg │ ├── dog2.png │ ├── extreme_ironing.jpg │ ├── graduate.png │ ├── kaali.jpg │ ├── markdown.png │ ├── mask.png │ ├── movie.jpeg │ ├── painting.png │ ├── panda.jpg │ ├── pexels-christian-heitz-285904-842711.jpg │ ├── pexels-francesco-ungaro-1525041.jpg │ ├── pexels-sanaan-3052361.jpg │ ├── superJumbo.png │ ├── taitan.jpg │ └── website.png ├── output.gif └── qwen2.5-3b-output.gif ├── lite_llama ├── __init__.py ├── executor │ ├── __init__.py │ ├── cuda_graph.py │ ├── executor_struct.py │ ├── mem_manager.py │ ├── model_executor.py │ └── req_tokens_manager.py ├── generate.py ├── generate_stream.py ├── generete_with_probs.py ├── inference.py ├── kernels │ ├── __init__.py │ ├── activations.py │ ├── flashattention.py │ ├── flashattention2_nopad.py │ ├── flashattentionv2.py │ ├── flashdecoding.py │ ├── others │ │ ├── activation_layers.py │ │ ├── context_flashattention_nopad.py │ │ ├── fused_linear.py │ │ ├── layernorm.py │ │ ├── rmsnorm_layer.py │ │ ├── rmsnorm_v1.py │ │ ├── rope_orig.py │ │ └── rotary_emb_v1.py │ ├── rope_emb.py │ ├── skip_rmsnorm.py │ ├── softmax_split.py │ ├── swiglu.py │ ├── update_kv_buffer.py │ ├── update_kv_index.py │ └── utils.py ├── llava_generate_stream.py ├── models │ ├── RotaryEmbedding.py │ ├── llama.py │ ├── llava.py │ ├── model_config.py │ ├── qwen2.py │ ├── qwen3.py │ └── utils.py └── utils │ ├── common.py │ ├── config_convert.py │ ├── constants.py │ ├── dummy_data.py │ ├── file_interface.py │ ├── image_process.py │ ├── logger.py │ └── prompt_templates.py ├── requirement.txt └── tests ├── __init__.py ├── kernels ├── fused_mlp_silu.py ├── kernels_benchmark.py ├── kernels_test.py ├── softmax_native.py ├── softmax_split.py ├── test_attention.py ├── test_available_blocks.py ├── test_cuda_graph.py ├── test_flashattentionv2.py ├── test_flashdecoding.py ├── test_flashdecoding_stage1.py ├── test_flashdecoding_stage2.py ├── test_mask.py ├── test_mem_manager.py ├── test_merge_input_ids_with_image_features.py └── test_rope_forward.py ├── models ├── test_LlamaConfig.py ├── test_LlamaForCausalLM.py ├── test_LlamaModel.py ├── test_LlavaConfig.py ├── test_LlavaForConditionalGeneration.py ├── test_LlavaLlama.py ├── test_Qwen2ForCausalLM.py ├── test_get_model_name.py ├── test_gpt2.py ├── test_qwen2.py └── test_transformers.py ├── others ├── test_convert.py ├── test_embedding_merge.py ├── test_image_process.py ├── test_image_token.py ├── test_load_weight.py ├── test_standard_mha.py └── test_temperature.py ├── test_torch_matmul.py └── test_torch_rope.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/.gitignore -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/CITATION.cff -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/README.md -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/README.zh.md -------------------------------------------------------------------------------- /apply_weight_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/apply_weight_convert.py -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/cli.py -------------------------------------------------------------------------------- /cli_llava.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/cli_llava.py -------------------------------------------------------------------------------- /docs/LlamaForCausalLM.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/LlamaForCausalLM.md -------------------------------------------------------------------------------- /docs/LlavaForConditionalGeneration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/LlavaForConditionalGeneration.md -------------------------------------------------------------------------------- /docs/LlavaNextForConditionalGeneration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/LlavaNextForConditionalGeneration.md -------------------------------------------------------------------------------- /docs/Qwen2ForCausalLM.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/Qwen2ForCausalLM.md -------------------------------------------------------------------------------- /docs/benchamrk_kernels.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/benchamrk_kernels.md -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/benchmark.md -------------------------------------------------------------------------------- /docs/benchmark_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/benchmark_models.md -------------------------------------------------------------------------------- /docs/benchmark_models_history.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/benchmark_models_history.md -------------------------------------------------------------------------------- /docs/performance_optimization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/docs/performance_optimization.md -------------------------------------------------------------------------------- /examples/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/examples/benchmark.py -------------------------------------------------------------------------------- /examples/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/evaluator/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/examples/evaluator/eval.py -------------------------------------------------------------------------------- /examples/example_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/examples/example_chat.py -------------------------------------------------------------------------------- /examples/example_eval_acc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/examples/example_eval_acc.py -------------------------------------------------------------------------------- /examples/example_llava.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/examples/example_llava.py -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/generate.py -------------------------------------------------------------------------------- /images/acc_test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/acc_test.jpg -------------------------------------------------------------------------------- /images/anwser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/anwser.png -------------------------------------------------------------------------------- /images/benchamrk_result/fused-attention-batch4-head32-d64-fwd-causal=False.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/fused-attention-batch4-head32-d64-fwd-causal=False.png -------------------------------------------------------------------------------- /images/benchamrk_result/fused-attention-batch8-head64-d64-fwd-causal=False.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/fused-attention-batch8-head64-d64-fwd-causal=False.png -------------------------------------------------------------------------------- /images/benchamrk_result/layer-norm-forward.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/layer-norm-forward.csv -------------------------------------------------------------------------------- /images/benchamrk_result/layer-norm-forward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/layer-norm-forward.png -------------------------------------------------------------------------------- /images/benchamrk_result/matmul-performance-fp16.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/matmul-performance-fp16.csv -------------------------------------------------------------------------------- /images/benchamrk_result/matmul-performance-fp16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/matmul-performance-fp16.png -------------------------------------------------------------------------------- /images/benchamrk_result/matmul-performance-fp8.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/matmul-performance-fp8.csv -------------------------------------------------------------------------------- /images/benchamrk_result/matmul-performance-fp8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/matmul-performance-fp8.png -------------------------------------------------------------------------------- /images/benchamrk_result/mlp-silu-performance.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/mlp-silu-performance.csv -------------------------------------------------------------------------------- /images/benchamrk_result/mlp-silu-performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/mlp-silu-performance.png -------------------------------------------------------------------------------- /images/benchamrk_result/mlp-silu-performance_ret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/mlp-silu-performance_ret.png -------------------------------------------------------------------------------- /images/benchamrk_result/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/result.png -------------------------------------------------------------------------------- /images/benchamrk_result/results.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /images/benchamrk_result/rms-norm-forward.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/rms-norm-forward.csv -------------------------------------------------------------------------------- /images/benchamrk_result/rms-norm-forward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/rms-norm-forward.png -------------------------------------------------------------------------------- /images/benchamrk_result/skip_rmsnorm_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/skip_rmsnorm_benchmark.png -------------------------------------------------------------------------------- /images/benchamrk_result/softmax-performance.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/softmax-performance.csv -------------------------------------------------------------------------------- /images/benchamrk_result/softmax-performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/softmax-performance.png -------------------------------------------------------------------------------- /images/benchamrk_result/te_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/te_benchmark.png -------------------------------------------------------------------------------- /images/benchamrk_result/token_embedding_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/benchamrk_result/token_embedding_benchmark.png -------------------------------------------------------------------------------- /images/cli_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/cli_stream.png -------------------------------------------------------------------------------- /images/flashattention_nopad_benchamrk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/flashattention_nopad_benchamrk.png -------------------------------------------------------------------------------- /images/flashattentionv2_nopad_benchamrk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/flashattentionv2_nopad_benchamrk.png -------------------------------------------------------------------------------- /images/flashattentionv2_nopad_benchamrk2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/flashattentionv2_nopad_benchamrk2.png -------------------------------------------------------------------------------- /images/flashdecoding_benchamrk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/flashdecoding_benchamrk.png -------------------------------------------------------------------------------- /images/generate.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/generate.gif -------------------------------------------------------------------------------- /images/generate_stream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/generate_stream.png -------------------------------------------------------------------------------- /images/llava_output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_output.gif -------------------------------------------------------------------------------- /images/llava_output1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_output1.gif -------------------------------------------------------------------------------- /images/llava_output2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_output2.gif -------------------------------------------------------------------------------- /images/llava_output3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_output3.gif -------------------------------------------------------------------------------- /images/llava_test/WechatIMG205.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/WechatIMG205.jpg -------------------------------------------------------------------------------- /images/llava_test/dog.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/dog.jpeg -------------------------------------------------------------------------------- /images/llava_test/dog2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/dog2.png -------------------------------------------------------------------------------- /images/llava_test/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/extreme_ironing.jpg -------------------------------------------------------------------------------- /images/llava_test/graduate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/graduate.png -------------------------------------------------------------------------------- /images/llava_test/kaali.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/kaali.jpg -------------------------------------------------------------------------------- /images/llava_test/markdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/markdown.png -------------------------------------------------------------------------------- /images/llava_test/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/mask.png -------------------------------------------------------------------------------- /images/llava_test/movie.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/movie.jpeg -------------------------------------------------------------------------------- /images/llava_test/painting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/painting.png -------------------------------------------------------------------------------- /images/llava_test/panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/panda.jpg -------------------------------------------------------------------------------- /images/llava_test/pexels-christian-heitz-285904-842711.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/pexels-christian-heitz-285904-842711.jpg -------------------------------------------------------------------------------- /images/llava_test/pexels-francesco-ungaro-1525041.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/pexels-francesco-ungaro-1525041.jpg -------------------------------------------------------------------------------- /images/llava_test/pexels-sanaan-3052361.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/pexels-sanaan-3052361.jpg -------------------------------------------------------------------------------- /images/llava_test/superJumbo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/superJumbo.png -------------------------------------------------------------------------------- /images/llava_test/taitan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/taitan.jpg -------------------------------------------------------------------------------- /images/llava_test/website.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/llava_test/website.png -------------------------------------------------------------------------------- /images/output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/output.gif -------------------------------------------------------------------------------- /images/qwen2.5-3b-output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/images/qwen2.5-3b-output.gif -------------------------------------------------------------------------------- /lite_llama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/__init__.py -------------------------------------------------------------------------------- /lite_llama/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lite_llama/executor/cuda_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/executor/cuda_graph.py -------------------------------------------------------------------------------- /lite_llama/executor/executor_struct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/executor/executor_struct.py -------------------------------------------------------------------------------- /lite_llama/executor/mem_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/executor/mem_manager.py -------------------------------------------------------------------------------- /lite_llama/executor/model_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/executor/model_executor.py -------------------------------------------------------------------------------- /lite_llama/executor/req_tokens_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/executor/req_tokens_manager.py -------------------------------------------------------------------------------- /lite_llama/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/generate.py -------------------------------------------------------------------------------- /lite_llama/generate_stream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/generate_stream.py -------------------------------------------------------------------------------- /lite_llama/generete_with_probs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/generete_with_probs.py -------------------------------------------------------------------------------- /lite_llama/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/inference.py -------------------------------------------------------------------------------- /lite_llama/kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/__init__.py -------------------------------------------------------------------------------- /lite_llama/kernels/activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/activations.py -------------------------------------------------------------------------------- /lite_llama/kernels/flashattention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/flashattention.py -------------------------------------------------------------------------------- /lite_llama/kernels/flashattention2_nopad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/flashattention2_nopad.py -------------------------------------------------------------------------------- /lite_llama/kernels/flashattentionv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/flashattentionv2.py -------------------------------------------------------------------------------- /lite_llama/kernels/flashdecoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/flashdecoding.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/activation_layers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/activation_layers.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/context_flashattention_nopad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/context_flashattention_nopad.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/fused_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/fused_linear.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/layernorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/layernorm.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/rmsnorm_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/rmsnorm_layer.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/rmsnorm_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/rmsnorm_v1.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/rope_orig.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/rope_orig.py -------------------------------------------------------------------------------- /lite_llama/kernels/others/rotary_emb_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/others/rotary_emb_v1.py -------------------------------------------------------------------------------- /lite_llama/kernels/rope_emb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/rope_emb.py -------------------------------------------------------------------------------- /lite_llama/kernels/skip_rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/skip_rmsnorm.py -------------------------------------------------------------------------------- /lite_llama/kernels/softmax_split.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/softmax_split.py -------------------------------------------------------------------------------- /lite_llama/kernels/swiglu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/swiglu.py -------------------------------------------------------------------------------- /lite_llama/kernels/update_kv_buffer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/update_kv_buffer.py -------------------------------------------------------------------------------- /lite_llama/kernels/update_kv_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/update_kv_index.py -------------------------------------------------------------------------------- /lite_llama/kernels/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/kernels/utils.py -------------------------------------------------------------------------------- /lite_llama/llava_generate_stream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/llava_generate_stream.py -------------------------------------------------------------------------------- /lite_llama/models/RotaryEmbedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/RotaryEmbedding.py -------------------------------------------------------------------------------- /lite_llama/models/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/llama.py -------------------------------------------------------------------------------- /lite_llama/models/llava.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/llava.py -------------------------------------------------------------------------------- /lite_llama/models/model_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/model_config.py -------------------------------------------------------------------------------- /lite_llama/models/qwen2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/qwen2.py -------------------------------------------------------------------------------- /lite_llama/models/qwen3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/qwen3.py -------------------------------------------------------------------------------- /lite_llama/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/models/utils.py -------------------------------------------------------------------------------- /lite_llama/utils/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/common.py -------------------------------------------------------------------------------- /lite_llama/utils/config_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/config_convert.py -------------------------------------------------------------------------------- /lite_llama/utils/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/constants.py -------------------------------------------------------------------------------- /lite_llama/utils/dummy_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/dummy_data.py -------------------------------------------------------------------------------- /lite_llama/utils/file_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/file_interface.py -------------------------------------------------------------------------------- /lite_llama/utils/image_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/image_process.py -------------------------------------------------------------------------------- /lite_llama/utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/logger.py -------------------------------------------------------------------------------- /lite_llama/utils/prompt_templates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/lite_llama/utils/prompt_templates.py -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/requirement.txt -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/kernels/fused_mlp_silu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/fused_mlp_silu.py -------------------------------------------------------------------------------- /tests/kernels/kernels_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/kernels_benchmark.py -------------------------------------------------------------------------------- /tests/kernels/kernels_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/kernels_test.py -------------------------------------------------------------------------------- /tests/kernels/softmax_native.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/softmax_native.py -------------------------------------------------------------------------------- /tests/kernels/softmax_split.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/softmax_split.py -------------------------------------------------------------------------------- /tests/kernels/test_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_attention.py -------------------------------------------------------------------------------- /tests/kernels/test_available_blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_available_blocks.py -------------------------------------------------------------------------------- /tests/kernels/test_cuda_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_cuda_graph.py -------------------------------------------------------------------------------- /tests/kernels/test_flashattentionv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_flashattentionv2.py -------------------------------------------------------------------------------- /tests/kernels/test_flashdecoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_flashdecoding.py -------------------------------------------------------------------------------- /tests/kernels/test_flashdecoding_stage1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_flashdecoding_stage1.py -------------------------------------------------------------------------------- /tests/kernels/test_flashdecoding_stage2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_flashdecoding_stage2.py -------------------------------------------------------------------------------- /tests/kernels/test_mask.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_mask.py -------------------------------------------------------------------------------- /tests/kernels/test_mem_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_mem_manager.py -------------------------------------------------------------------------------- /tests/kernels/test_merge_input_ids_with_image_features.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_merge_input_ids_with_image_features.py -------------------------------------------------------------------------------- /tests/kernels/test_rope_forward.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/kernels/test_rope_forward.py -------------------------------------------------------------------------------- /tests/models/test_LlamaConfig.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlamaConfig.py -------------------------------------------------------------------------------- /tests/models/test_LlamaForCausalLM.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlamaForCausalLM.py -------------------------------------------------------------------------------- /tests/models/test_LlamaModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlamaModel.py -------------------------------------------------------------------------------- /tests/models/test_LlavaConfig.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlavaConfig.py -------------------------------------------------------------------------------- /tests/models/test_LlavaForConditionalGeneration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlavaForConditionalGeneration.py -------------------------------------------------------------------------------- /tests/models/test_LlavaLlama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_LlavaLlama.py -------------------------------------------------------------------------------- /tests/models/test_Qwen2ForCausalLM.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_Qwen2ForCausalLM.py -------------------------------------------------------------------------------- /tests/models/test_get_model_name.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_get_model_name.py -------------------------------------------------------------------------------- /tests/models/test_gpt2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_gpt2.py -------------------------------------------------------------------------------- /tests/models/test_qwen2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_qwen2.py -------------------------------------------------------------------------------- /tests/models/test_transformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/models/test_transformers.py -------------------------------------------------------------------------------- /tests/others/test_convert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_convert.py -------------------------------------------------------------------------------- /tests/others/test_embedding_merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_embedding_merge.py -------------------------------------------------------------------------------- /tests/others/test_image_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_image_process.py -------------------------------------------------------------------------------- /tests/others/test_image_token.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_image_token.py -------------------------------------------------------------------------------- /tests/others/test_load_weight.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_load_weight.py -------------------------------------------------------------------------------- /tests/others/test_standard_mha.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_standard_mha.py -------------------------------------------------------------------------------- /tests/others/test_temperature.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/others/test_temperature.py -------------------------------------------------------------------------------- /tests/test_torch_matmul.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/test_torch_matmul.py -------------------------------------------------------------------------------- /tests/test_torch_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harleyszhang/lite_llama/HEAD/tests/test_torch_rope.py --------------------------------------------------------------------------------