├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── docker-publish.yml │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── att.gif ├── lightllm.drawio.png └── logo.png ├── benchmark.md ├── build_and_upload_docker.sh ├── demos ├── qa_server │ ├── __init__.py │ ├── chat_server.py │ ├── qabot.py │ └── templates │ │ └── chat.html └── readme.txt ├── docs ├── CN │ ├── .readthedocs.yaml │ ├── Makefile │ ├── README.md │ ├── make.bat │ ├── rebuild.sh │ ├── requirements-docs.txt │ └── source │ │ ├── _static │ │ └── openapi.json │ │ ├── assets │ │ ├── lightllm │ │ │ ├── ER1.png │ │ │ ├── ER2.png │ │ │ ├── ER3.png │ │ │ ├── ER4.png │ │ │ ├── HttpServer.png │ │ │ ├── Performance.png │ │ │ ├── Performance2.png │ │ │ ├── Router.png │ │ │ ├── Visual_Server.png │ │ │ ├── arch.png │ │ │ ├── backend.png │ │ │ └── token_attn.gif │ │ └── logos │ │ │ └── lightllm-logo.png │ │ ├── conf.py │ │ ├── dev │ │ ├── router.rst │ │ └── token_attention.rst │ │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ │ ├── index.rst │ │ ├── lightllm │ │ ├── lightllm_impl.rst │ │ └── lightllm_intro.rst │ │ ├── models │ │ ├── add_new_model.md │ │ ├── supported_models.rst │ │ └── test.rst │ │ ├── server │ │ ├── api_server_args_zh.rst │ │ └── benchmark.rst │ │ └── user │ │ ├── api_param.rst │ │ └── openapi_docs.rst └── EN │ ├── .readthedocs.yaml │ ├── Makefile │ ├── README.md │ ├── make.bat │ ├── rebuild.sh │ ├── requirements-docs.txt │ └── source │ ├── _static │ └── openapi.json │ ├── assets │ ├── lightllm │ │ ├── ER1.png │ │ ├── ER2.png │ │ ├── ER3.png │ │ ├── ER4.png │ │ ├── HttpServer.png │ │ ├── Performance.png │ │ ├── Performance2.png │ │ ├── Router.png │ │ ├── Visual_Server.png │ │ ├── arch.png │ │ ├── backend.png │ │ └── token_attn.gif │ └── logos │ │ └── lightllm-logo.png │ ├── conf.py │ ├── dev │ ├── performance.rst │ ├── router.rst │ └── token_attention.rst │ ├── getting_started │ ├── faq.rst │ ├── installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── lightllm │ ├── lightllm_impl.rst │ └── lightllm_intro.rst │ ├── models │ ├── add_new_model.md │ ├── supported_models.rst │ └── test.rst │ ├── server │ ├── api_server_args.rst │ └── benchmark.rst │ └── user │ ├── api_param.rst │ └── openapi_docs.rst ├── format.py ├── format_out ├── __init__.py ├── grammer │ ├── __init__.py │ ├── core.py │ ├── dpda.py │ ├── json.ebnf │ ├── test.sh │ ├── test0.py │ ├── test1.py │ ├── test2.py │ ├── test3.py │ ├── test4.py │ ├── test5.py │ └── test6.py └── impl.py ├── lightllm ├── __init__.py ├── common │ ├── __init__.py │ ├── all_kernel_configs │ │ ├── __init__.py │ │ ├── bmm_scaled_fp8 │ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── 
{B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ └── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ ├── fp8_block_mm │ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── 
{K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ └── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ ├── grouped_moe_gemm_kernel │ │ │ ├── {K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ │ ├── {K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json │ │ │ ├── {K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ │ ├── 
{K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json │ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json │ │ │ └── {K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ ├── mla_decode_attentnion │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H200.json │ │ │ └── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json │ │ ├── moe_silu_and_mul_kernel │ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ └── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ └── moe_sum_reduce_kernel │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ 
├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ └── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ ├── basemodel │ │ ├── __init__.py │ │ ├── basemodel.py │ │ ├── cuda_graph.py │ │ ├── infer_lock.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── base_layer_infer.py │ │ │ ├── cache_tensor_manager.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ ├── template │ │ │ │ ├── __init__.py │ │ │ │ ├── post_layer_infer_template.py │ │ │ │ ├── pre_layer_infer_template.py │ │ │ │ ├── transformer_layer_infer_cohere_template.py │ │ │ │ └── transformer_layer_infer_template.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── base_layer_weight.py │ │ │ ├── hf_load_utils.py │ │ │ ├── meta_weights │ │ │ │ ├── __init__.py │ │ │ │ ├── base_weight.py │ │ │ │ ├── fused_moe_weight_ep.py │ │ │ │ ├── fused_moe_weight_ep_redundancy.py │ │ │ │ ├── fused_moe_weight_tp.py │ │ │ │ ├── mm_weight │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colmm_weight.py │ │ │ │ │ ├── mm_weight.py │ │ │ │ │ └── rowmm_weight.py │ │ │ │ └── norm_weight.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── microbatch_overlap_objs.py │ │ ├── multimodal_tokenizer.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── add_in_place.py │ │ │ ├── apply_penalty.py │ │ │ ├── bmm_scaled_fp8.py │ │ │ ├── copy_kv_index_to_req.py │ │ │ ├── dequantize_gemm_int4.py │ │ │ ├── dequantize_gemm_int8.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── gen_decode_params.py │ │ │ ├── gen_prefill_params.py │ │ │ ├── multimodal_emb.py │ │ │ ├── quantize_gemm_int8.py │ │ │ ├── redundancy_topk_ids_repair.py │ │ │ └── sp_pad_copy.py │ ├── build_utils.py │ ├── cuda_wrapper.py │ ├── deepseek2_fp8kv_mem_manager.py │ ├── deepseek2_mem_manager.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── deepep_scatter_gather.py │ │ ├── grouped_fused_moe.py │ │ ├── grouped_fused_moe_ep.py │ │ ├── grouped_topk.py │ │ ├── moe_kernel_configs.py │ │ ├── moe_silu_and_mul.py │ │ ├── moe_silu_and_mul_config.py │ │ ├── moe_silu_and_mul_mix_quant_ep.py │ │ ├── moe_sum_recude_config.py │ │ ├── moe_sum_reduce.py │ │ ├── softmax_topk.py │ │ └── topk_select.py │ ├── infer_utils.py │ ├── int8kv_mem_manager.py │ ├── kernel_config.py │ ├── kv_trans_kernel │ │ ├── __init__.py │ │ ├── kv_trans.py │ │ └── kv_trans_v2.py │ ├── mem_manager.py │ ├── mem_utils.py │ ├── ppl_int4kv_mem_manager.py │ ├── ppl_int8kv_mem_manager.py │ ├── quantization │ │ ├── __init__.py │ │ ├── configs │ │ │ └── llamacls-mix-down.yaml │ │ ├── deepgemm_quant.py │ │ ├── quantize_method.py │ │ ├── registry.py │ │ ├── torchao_quant.py │ │ ├── triton_quant │ │ │ ├── __init__.py │ │ │ ├── fp8 │ │ │ │ ├── __init__.py │ │ │ │ ├── fp8act_quant_kernel.py │ │ │ │ └── fp8w8a8_block_gemm_kernel.py │ │ │ └── triton_quant.py │ │ └── w8a8_quant.py │ └── req_manager.py ├── distributed │ ├── __init__.py │ ├── communication_op.py │ ├── custom_all_gather.py │ ├── custom_all_reduce.py │ ├── pynccl.py │ └── pynccl_wrapper.py ├── models │ ├── __init__.py │ ├── bloom │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── 
post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── hf_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── layernorm.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ ├── token_attention_nopad_softmax.py │ │ │ └── token_flashattention_nopad.py │ ├── chatglm2 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ └── rotary_emb.py │ ├── cohere │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernels │ │ │ ├── __init__.py │ │ │ ├── layernorm.py │ │ │ └── rotary_emb.py │ ├── deepseek2 │ │ ├── __init__.py │ │ ├── flashattention_infer_struct.py │ │ ├── flashinfer_struct.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── context_flashattention_nopad_fp8.py │ │ │ ├── context_flashattention_nopad_with_v.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── destindex_copy_kv_fp8.py │ │ │ ├── gqa_flash_decoding.py │ │ │ ├── gqa_flash_decoding_config.py │ │ │ ├── gqa_flash_decoding_fp8.py │ │ │ ├── gqa_flash_decoding_stage1.py │ │ │ ├── gqa_flash_decoding_stage1_fp8.py │ │ │ ├── gqa_flash_decoding_stage2.py │ │ │ ├── repack_kv_index.py │ │ │ ├── repeat_rope.py │ │ │ ├── rotary_emb.py │ │ │ ├── sample_kv.py │ │ │ └── weight_dequant.py │ ├── gemma3 │ │ ├── __init__.py │ │ ├── gemma3_visual.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── gemma_2b │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ └── gelu_and_mul.py │ ├── internlm │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── internlm2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── internlm2_reward │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── post_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ └── model.py │ ├── internvl │ │ ├── __init__.py │ │ ├── img_process.py │ │ ├── internvl_visual.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ 
│ └── model.py │ ├── llama │ │ ├── __init__.py │ │ ├── flashattention_infer_struct.py │ │ ├── flashinfer_struct.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── ds_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ ├── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── embedding.py │ │ │ ├── flash_decoding.py │ │ │ ├── flash_decoding_stage1.py │ │ │ ├── flash_decoding_stage2.py │ │ │ ├── gqa_decode_flashattention_nopad.py │ │ │ ├── gqa_flash_decoding.py │ │ │ ├── gqa_flash_decoding_stage1.py │ │ │ ├── gqa_flash_decoding_stage2.py │ │ │ ├── gqa_flash_decoding_vsm.py │ │ │ ├── ppl_fp16_flash_decoding.py │ │ │ ├── ppl_int4kv_copy_kv.py │ │ │ ├── ppl_int4kv_flash_decoding.py │ │ │ ├── ppl_int8kv_flash_decoding.py │ │ │ ├── ppl_quant_copy_kv.py │ │ │ ├── rmsnorm.py │ │ │ ├── rotary_emb.py │ │ │ ├── silu_and_mul.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ ├── token_attention_nopad_softmax.py │ │ │ └── token_attention_softmax_and_reducev.py │ │ └── yarn_rotary_utils.py │ ├── llava │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ ├── llava_visual.py │ │ └── model.py │ ├── minicpm │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── mistral │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── init_att_sliding_window_info.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ └── token_attention_softmax_and_reducev.py │ ├── mixtral │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── _custom_ops.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── phi3 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── flash_decoding.py │ │ │ ├── flash_decoding_stage1.py │ │ │ ├── flash_decoding_stage2.py │ │ │ └── rotary_emb.py │ ├── qwen │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen2_5_vl │ │ ├── __init__.py │ │ └── qwen2_5_visual.py │ ├── qwen2_reward │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── post_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ └── model.py │ ├── qwen2_vl │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ 
├── model.py │ │ ├── qwen2_visual.py │ │ ├── triton_kernel │ │ │ ├── __init__.py │ │ │ └── mrope.py │ │ └── vision_process.py │ ├── qwen3 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen3_moe │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen_vl │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── pre_layer_infer.py │ │ ├── model.py │ │ └── qwen_visual.py │ ├── registry.py │ ├── stablelm │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── starcoder │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── starcoder2 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── tarsier2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ ├── model.py │ │ └── tarsier2_visual.py │ ├── vit │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── hf_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── flashattention_nopad.py │ │ │ ├── gelu_vit.py │ │ │ └── rms_norm_vit.py │ └── whisper │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── modeling_whisper.py │ │ └── whisper_audio.py ├── server │ ├── __init__.py │ ├── api_cli.py │ ├── api_http.py │ ├── api_lightllm.py │ ├── api_models.py │ ├── api_openai.py │ ├── api_server.py │ ├── api_start.py │ ├── api_tgi.py │ ├── audioserver │ │ ├── __init__.py │ │ ├── manager.py │ │ └── model_infer │ │ │ ├── __init__.py │ │ │ └── model_rpc.py │ ├── build_prompt.py │ ├── config_server │ │ ├── __init__.py │ │ ├── api_http.py │ │ └── nccl_tcp_store.py │ ├── core │ │ ├── __init__.py │ │ └── objs │ │ │ ├── __init__.py │ │ │ ├── atomic_array_lock.py │ │ │ ├── atomic_lock.py │ │ │ ├── io_objs │ │ │ ├── __init__.py │ │ │ └── group_req.py │ │ │ ├── out_token_circlequeue.py │ │ │ ├── py_sampling_params.py │ │ │ ├── req.py │ │ │ ├── rpc_shm.py │ │ │ ├── sampling_params.py │ │ │ ├── shm_array.py │ │ │ ├── shm_req_manager.py │ │ │ └── start_args_type.py │ ├── detokenization │ │ ├── __init__.py │ │ ├── decode.py │ │ ├── decode_mode_fix.py │ │ ├── decode_req.py │ │ └── manager.py │ ├── embed_cache │ │ ├── __init__.py │ │ ├── impl │ │ │ ├── __init__.py │ │ │ └── naive_memory_cache.py │ │ ├── interface.py │ │ ├── manager.py │ │ └── utils.py │ ├── function_call_parser.py │ ├── health_monitor │ │ ├── __init__.py │ │ └── manager.py │ ├── httpserver │ │ ├── __init__.py │ │ ├── async_queue.py │ │ ├── manager.py │ │ └── pd_loop.py │ ├── 
httpserver_for_pd_master │ │ ├── __init__.py │ │ ├── manager.py │ │ └── register_loop.py │ ├── metrics │ │ ├── __init__.py │ │ ├── manager.py │ │ └── metrics.py │ ├── multimodal_params.py │ ├── pd_io_struct.py │ ├── req_id_generator.py │ ├── router │ │ ├── __init__.py │ │ ├── batch.py │ │ ├── dynamic_prompt │ │ │ ├── __init__.py │ │ │ ├── radix_cache.py │ │ │ └── shared_arr.py │ │ ├── manager.py │ │ ├── model_infer │ │ │ ├── __init__.py │ │ │ ├── infer_batch.py │ │ │ ├── mode_backend │ │ │ │ ├── __init__.py │ │ │ │ ├── base_backend.py │ │ │ │ ├── chunked_prefill │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ ├── impl_for_first_token_constraint_mode.py │ │ │ │ │ ├── impl_for_outlines_constraint_mode.py │ │ │ │ │ ├── impl_for_token_healing.py │ │ │ │ │ └── impl_for_xgrammar_mode.py │ │ │ │ ├── continues_batch │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ ├── impl_for_return_all_prompt_logprobs.py │ │ │ │ │ ├── impl_for_reward_model.py │ │ │ │ │ └── pd_mode │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── decode_node_impl │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── decode_impl.py │ │ │ │ │ │ ├── decode_impl_for_dp.py │ │ │ │ │ │ ├── decode_infer_rpyc.py │ │ │ │ │ │ ├── decode_kv_move_manager.py │ │ │ │ │ │ ├── decode_task_cache.py │ │ │ │ │ │ ├── decode_trans_obj.py │ │ │ │ │ │ ├── decode_trans_process.py │ │ │ │ │ │ └── up_status.py │ │ │ │ │ │ ├── p2p_fix.py │ │ │ │ │ │ ├── prefill_node_impl │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── prefill_impl.py │ │ │ │ │ │ ├── prefill_impl_for_dp_chuncked.py │ │ │ │ │ │ ├── prefill_infer_rpyc.py │ │ │ │ │ │ ├── prefill_kv_move_manager.py │ │ │ │ │ │ ├── prefill_task_cache.py │ │ │ │ │ │ ├── prefill_trans_obj.py │ │ │ │ │ │ └── prefill_trans_process.py │ │ │ │ │ │ ├── task_queue.py │ │ │ │ │ │ └── utils.py │ │ │ │ ├── diverse_backend │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── impl.py │ │ │ │ ├── dp_backend │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ └── pre_process.py │ │ │ │ ├── generic_post_process.py │ │ │ │ ├── generic_pre_process.py │ │ │ │ └── redundancy_expert_manager.py │ │ │ └── model_rpc.py │ │ ├── pause_strategy.py │ │ ├── req_queue │ │ │ ├── __init__.py │ │ │ ├── base_queue.py │ │ │ ├── chunked_prefill │ │ │ │ ├── __init__.py │ │ │ │ ├── beam_impl.py │ │ │ │ ├── impl.py │ │ │ │ └── impl_for_pd_prefill.py │ │ │ ├── continues_batch │ │ │ │ ├── __init__.py │ │ │ │ ├── impl.py │ │ │ │ └── impl_for_pd_decode.py │ │ │ └── dp_base_queue.py │ │ ├── stats.py │ │ └── token_load.py │ ├── tokenizer.py │ └── visualserver │ │ ├── __init__.py │ │ ├── manager.py │ │ └── model_infer │ │ ├── __init__.py │ │ └── model_rpc.py └── utils │ ├── __init__.py │ ├── config_utils.py │ ├── custom_kernel_utis.py │ ├── device_utils.py │ ├── dist_utils.py │ ├── envs_utils.py │ ├── error_utils.py │ ├── graceful_utils.py │ ├── health_check.py │ ├── infer_utils.py │ ├── light_utils.py │ ├── log_utils.py │ ├── multimodal_utils.py │ ├── multinode_utils.py │ ├── net_utils.py │ ├── petrel_helper.py │ ├── process_check.py │ ├── profile_max_tokens.py │ ├── retry_utils.py │ ├── rpyc_fix_utils.py │ ├── sgl_utils.py │ ├── start_utils.py │ ├── statics_utils.py │ ├── time_utils.py │ ├── tuning_utils.py │ ├── vllm_utils.py │ └── watchdog_utils.py ├── requirements.txt ├── setup.py ├── test ├── benchmark_client.py ├── benchmark_mcq.py ├── benchmark_qps.py ├── benchmark_serving.py ├── deepseek.sh ├── format_out │ ├── gomoku_game.py │ ├── qabot.py │ ├── test_constraint_server.py │ ├── test_demo.py │ └── test_xgrammar_constraint.py ├── kernel │ ├── alignment │ │ 
└── llama_gqa_decode_vsm.py │ ├── deepseekv2_bmm_scaled_fp8_tuning.py │ ├── deepseekv3_fp8_block_gemm_tuning.py │ ├── fuse_moe_tuning_bf16.py │ ├── fuse_moe_tuning_fp8.py │ ├── moe_silu_and_mul_tuning_bf16.py │ ├── moe_sum_reduce_tuning_bf16.py │ └── tuning │ │ ├── deepseekv2_gqa_decode_tuning.py │ │ └── llama_gqa_decode_vsm_tuning.py ├── model │ ├── model_infer.py │ ├── model_infer_vit.py │ ├── test_model.py │ ├── test_script.sh │ └── test_settings │ │ ├── model_infer_batchs.py │ │ ├── process_utils.py │ │ └── test_settings.py ├── server │ ├── benchmark_prompt_cache.py │ ├── readme.md │ └── test_settings.py ├── test.jpg ├── test.sh ├── test_accuracy.py ├── test_constraint_server.py ├── test_function_call_api.py ├── test_multimodal_server.py ├── test_redundancy_expert_config.json └── test_server.py ├── tools ├── quick_launch_docker.py └── resolve_ptx_version └── unit_tests ├── common ├── basemodel │ └── triton_kernel │ │ ├── test_add_in_place.py │ │ ├── test_gen_decode_params.py │ │ ├── test_gen_prefill_params.py │ │ ├── test_redundancy_topk_ids_repair.py │ │ └── test_sp_pad_kernel.py ├── fused_moe │ ├── test_deepep.py │ ├── test_grouped_fused_moe.py │ ├── test_grouped_fused_moe_speed.py │ ├── test_grouped_topk.py │ ├── test_moe_silu_and_mul_mix_quant_ep.py │ └── test_softmax_topk.py └── kv_trans_kernel │ └── test_kv_trans_v2.py ├── models ├── deepseek2 │ ├── test_destindex_copy_kv.py │ ├── test_destindex_copy_kv_fp8.py │ ├── test_gqa_flash_decoding.py │ ├── test_gqa_flash_decoding_fp8.py │ ├── test_repack_kv_index.py │ └── test_rope_repeat.py ├── llama │ ├── test_context_flashattention_nopad.py │ └── test_token_attention_nopad.py └── qwen2_vl │ └── test_mrope.py ├── server ├── core │ └── objs │ │ ├── test_atomic_array_lock.py │ │ ├── test_atomic_lock.py │ │ ├── test_out_token_circlequeue.py │ │ ├── test_req.py │ │ ├── test_sampling_params.py │ │ ├── test_shm_array.py │ │ └── test_shm_req_manager.py └── router │ └── dynamic_prompt │ └── test_radix_cache.py └── utils └── test_custom_kernel_utils.py /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Pre-commit checks 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 # Fetch all history for all branches and tags 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.9' 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pre-commit 27 | pre-commit install-hooks 28 | 29 | - name: Run pre-commit on modified files 30 | run: | 31 | if [ -n "$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }})" ]; then 32 | pre-commit run --files $(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }}) 33 | else 34 | echo "No files to check" 35 | fi 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pyc 3 | build 4 | dist 5 | *.egg-info 6 | .idea 7 | .vscode 8 | tmp/ 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 
21.12b0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | args: [--line-length=120] 8 | additional_dependencies: ['click==8.0.4'] 9 | - repo: https://github.com/pycqa/flake8 10 | rev: 3.9.0 11 | hooks: 12 | - id: flake8 13 | additional_dependencies: [flake8-typing-imports==1.9.0] 14 | args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606'] -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing Guidelines 2 | 3 | ### Coding Style Guide 4 | 5 | In general, we adhere to the [Google Python style guide](https://google.github.io/styleguide/pyguide.html), and we recommend using `yapf` to format your code. 6 | 7 | In this project, we have adopted `pre-commit` to automatically check the code style. 8 | 9 | To begin with, install `pre-commit` as shown below. 10 | 11 | ```bash 12 | pip install pre-commit 13 | ``` 14 | 15 | Then, configure the pre-commit hook as shown below. 16 | 17 | ```bash 18 | pre-commit install 19 | ``` 20 | 21 | Then, when you commit your changes, your code will be checked automatically. 22 | -------------------------------------------------------------------------------- /assets/att.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/att.gif -------------------------------------------------------------------------------- /assets/lightllm.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/lightllm.drawio.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/logo.png -------------------------------------------------------------------------------- /benchmark.md: -------------------------------------------------------------------------------- 1 | #### lightllm 2 | 3 | #### Launch service 4 | 5 | ~~~shell 6 | python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto 7 | ~~~ 8 | 9 | #### Evaluation 10 | 11 | ~~~shell 12 | python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 13 | ~~~ 14 | 15 | #### vllm 16 | 17 | #### Launch service 18 | ~~~shell 19 | python -m vllm.entrypoints.api_server --model /path/llama-7b --swap-space 16 --disable-log-requests --port 9009 20 | ~~~ 21 | 22 | #### Evaluation 23 | 24 | ~~~shell 25 | python benchmark_serving_vllm.py --backend vllm --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 --host 127.0.0.1 --port 9009 26 | ~~~ -------------------------------------------------------------------------------- /build_and_upload_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build and push a Docker image to AWS ECR.
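# A minimal usage sketch for this script; the account ID and tag shown are hypothetical
# placeholders, not values taken from this repository:
#   ./build_and_upload_docker.sh 123456789012 v1.0.0
# The first argument is the AWS account ID and the second is the image tag; they feed the
# ACCOUNT=$1 and IMAGE_TAG=$2 assignments used by the ECR login, build, and push steps below.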
4 | 5 | set -eo pipefail 6 | 7 | if [ -z "$1" ]; then 8 | echo "Must supply AWS account ID" 9 | exit 1; 10 | fi 11 | 12 | if [ -z "$2" ]; then 13 | echo "Must supply the image tag" 14 | exit 1; 15 | fi 16 | 17 | IMAGE_TAG=$2 18 | ACCOUNT=$1 19 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com 20 | DOCKER_BUILDKIT=1 docker build -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG . 21 | docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG 22 | -------------------------------------------------------------------------------- /demos/qa_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/demos/qa_server/__init__.py -------------------------------------------------------------------------------- /demos/readme.txt: -------------------------------------------------------------------------------- 1 | A directory of application demos. -------------------------------------------------------------------------------- /docs/CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-20.04 6 | tools: 7 | python: "3.10" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/CN/source/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: docs/CN/requirements-docs.txt -------------------------------------------------------------------------------- /docs/CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/CN/README.md: -------------------------------------------------------------------------------- 1 | ## Build the docs 2 | 3 | ```bash 4 | # Install lightllm 5 | 6 | # git clone https://github.com/ModelTC/lightllm.git 7 | # cd lightllm 8 | pip install --no-deps . 9 | ``` 10 | 11 | ```bash 12 | # Install dependencies. 13 | 14 | # cd docs/CN 15 | pip install -r requirements-docs.txt 16 | 17 | # Build the docs. 18 | make clean 19 | make html 20 | ``` 21 | 22 | ## Open the docs with your browser 23 | 24 | ```bash 25 | python -m http.server -d build/html/ 26 | ``` 27 | 28 | Launch your browser and open localhost:8000.
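The build and serve steps above can also be chained into a single run; the sketch below mirrors what the `rebuild.sh` helper next to this README does (it additionally passes an explicit port, 8000 in this case):

```bash
# Rebuild the documentation from scratch and serve it locally on port 8000.
make clean
make html
python -m http.server -d build/html/ 8000
```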
29 | -------------------------------------------------------------------------------- /docs/CN/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/CN/rebuild.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | make html 3 | python -m http.server -d build/html/ 8000 -------------------------------------------------------------------------------- /docs/CN/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | sphinxcontrib.redoc 7 | sphinxcontrib.openapi 8 | 9 | # packages to install to build the documentation 10 | pydantic 11 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 12 | numpy -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER1.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER2.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER3.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER4.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/HttpServer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/HttpServer.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Performance2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance2.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Router.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Router.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Visual_Server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Visual_Server.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/arch.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/backend.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/token_attn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/token_attn.gif -------------------------------------------------------------------------------- /docs/CN/source/assets/logos/lightllm-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/logos/lightllm-logo.png -------------------------------------------------------------------------------- /docs/CN/source/server/api_server_args_zh.rst: -------------------------------------------------------------------------------- 1 | APIServer 参数详解 2 | ============================= 3 | 4 | 5 | 使用方法 6 | ++++++++++++ 7 | 8 | .. 
argparse:: 9 | :module: lightllm.server.api_cli 10 | :func: make_argument_parser 11 | :prog: python -m lightllm.server.api_server 12 | :nodefaultconst: 13 | -------------------------------------------------------------------------------- /docs/CN/source/server/benchmark.rst: -------------------------------------------------------------------------------- 1 | 服务性能评测 2 | ================== 3 | 4 | 部署完模型以后,对服务性能进行评测是非常重要的,通过服务性能的表现调整配置从而更好地利用显卡资源。 5 | 本文中,我们使用 LLaMA-7B 模型,在80G的A800显卡上,比较了lightllm 和 vLLM==0.1.2 的性能。 6 | 具体比较方式参考以下步骤: 7 | 8 | 1. 下载数据集 9 | ^^^^^^^^^^^^^^ 10 | 11 | .. code-block:: console 12 | 13 | $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 14 | 15 | 16 | 2. 开启模型服务 17 | ^^^^^^^^^^^^^^^^^^^ 18 | 19 | .. code-block:: console 20 | 21 | $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto 22 | 23 | 24 | 3. 性能评测 25 | ^^^^^^^^^^^^^^^^ 26 | 27 | .. code-block:: console 28 | 29 | $ cd test 30 | $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 31 | 32 | 33 | 输出: 34 | 35 | .. code-block:: console 36 | 37 | read data set finish 38 | total tokens: 494250 39 | Total time: 111.37 s 40 | Throughput: 8.98 requests/s 41 | Average latency: 43.52 s 42 | Average latency per token: 0.15 s 43 | Average latency per output token: 0.73 s -------------------------------------------------------------------------------- /docs/EN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-20.04 6 | tools: 7 | python: "3.10" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/EN/source/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: docs/EN/requirements-docs.txt -------------------------------------------------------------------------------- /docs/EN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/EN/README.md: -------------------------------------------------------------------------------- 1 | ## Build the docs 2 | 3 | ```bash 4 | # Install lightllm 5 | 6 | # git clone https://github.com/ModelTC/lightllm.git 7 | # cd lightllm 8 | pip install --no-deps . 9 | ``` 10 | 11 | ```bash 12 | # Install dependencies. 13 | 14 | # cd docs/EN 15 | pip install -r requirements-docs.txt 16 | 17 | # Build the docs. 
18 | make clean 19 | make html 20 | ``` 21 | 22 | ## Open the docs with your browser 23 | 24 | ```bash 25 | python -m http.server -d build/html/ 26 | ``` 27 | 28 | Launch your browser and open localhost:8000. 29 | -------------------------------------------------------------------------------- /docs/EN/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/EN/rebuild.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | make html 3 | python -m http.server -d build/html/ 5888 -------------------------------------------------------------------------------- /docs/EN/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | sphinxcontrib.redoc 7 | sphinxcontrib.openapi 8 | 9 | # packages to install to build the documentation 10 | pydantic 11 | -f https://download.pytorch.org/whl/cpu 12 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 13 | numpy -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER1.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER2.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER3.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER4.png 
-------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/HttpServer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/HttpServer.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Performance2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance2.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Router.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Router.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Visual_Server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Visual_Server.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/arch.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/backend.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/token_attn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/token_attn.gif -------------------------------------------------------------------------------- /docs/EN/source/assets/logos/lightllm-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/logos/lightllm-logo.png -------------------------------------------------------------------------------- /docs/EN/source/getting_started/faq.rst: -------------------------------------------------------------------------------- 1 | .. _faq: 2 | 3 | - The LLaMA tokenizer fails to load. 4 | - Consider resolving this by running the command: 5 | 6 | .. 
code-block:: shell 7 | 8 | pip install protobuf==3.20.0 9 | 10 | - ``error : PTX .version 7.4 does not support .target sm_89`` 11 | - Launch with: 12 | 13 | .. code-block:: shell 14 | 15 | bash tools/resolve_ptx_version python -m lightllm.server.api_server ... -------------------------------------------------------------------------------- /docs/EN/source/server/api_server_args.rst: -------------------------------------------------------------------------------- 1 | APIServer Args 2 | ============================= 3 | 4 | 5 | Usage 6 | ++++++++++++ 7 | 8 | .. argparse:: 9 | :module: lightllm.server.api_cli 10 | :func: make_argument_parser 11 | :prog: python -m lightllm.server.api_server 12 | :nodefaultconst: -------------------------------------------------------------------------------- /format.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | for filename in glob.glob('./**/*.py', recursive=True): 5 | print(filename) 6 | os.system(f"autopep8 --max-line-length 140 --in-place --aggressive --aggressive {filename}") 7 | -------------------------------------------------------------------------------- /format_out/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/format_out/__init__.py -------------------------------------------------------------------------------- /format_out/grammer/__init__.py: -------------------------------------------------------------------------------- 1 | # Restrictions on the grammar representation 2 | # 1. The start symbol must be S' 3 | # 2. "ε" expressions are not supported 4 | 5 | 6 | grammar = [ 7 | ("S'", ["S"]), 8 | ("S", ["A", "B"]), 9 | ("A", ["a", "A"]), 10 | ("A", ["ε"]), 11 | ("B", ["b", "B"]), 12 | ("B", ["ε"]), 13 | ] 14 | -------------------------------------------------------------------------------- /format_out/grammer/json.ebnf: -------------------------------------------------------------------------------- 1 | root ::= basic_array | basic_object 2 | basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object 3 | basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"? 4 | basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
5 | basic_string ::= (([\"] basic_string_1 [\"])) 6 | basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1 7 | escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] 8 | basic_boolean ::= "true" | "false" 9 | basic_null ::= "null" 10 | basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]" 11 | basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}" 12 | ws ::= [ \n\t]* -------------------------------------------------------------------------------- /format_out/grammer/test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | python test0.py 3 | python test1.py 4 | python test2.py 5 | python test3.py 6 | python test4.py 7 | python test5.py 8 | python test6.py 9 | -------------------------------------------------------------------------------- /lightllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/__init__.py -------------------------------------------------------------------------------- /lightllm/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/__init__.py -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/all_kernel_configs/__init__.py -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, 
"num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}} 
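Note: the per-shape JSON files under `all_kernel_configs` map a first-level size key and a second-level size key to pre-tuned Triton launch parameters (block sizes, group size, pipeline stages, warp count). The sketch below only illustrates how such a file could be read and the nearest configured size selected; the helper names and the interpretation of the keys as runtime sizes to match are assumptions for this note, not lightllm's actual loading code.

```python
# Illustrative sketch only: consuming one of the tuned-config JSON files above.
# The helper names and the meaning of the outer/inner keys are assumptions,
# not the repository's actual API.
import json


def load_kernel_config(path: str) -> dict:
    """Read a tuned-kernel JSON file into a nested dict of launch parameters."""
    with open(path, "r") as f:
        return json.load(f)


def pick_nearest(config: dict, size: int) -> dict:
    """Pick the entry whose integer key is closest to the requested size."""
    keys = sorted(int(k) for k in config)
    best = min(keys, key=lambda k: abs(k - size))
    return config[str(best)]


if __name__ == "__main__":
    # Hypothetical usage against one of the files listed above.
    cfg = load_kernel_config(
        "lightllm/common/all_kernel_configs/bmm_scaled_fp8/"
        "{B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json"
    )
    outer = pick_nearest(cfg, 512)      # first-level size key
    params = pick_nearest(outer, 128)   # second-level size key
    print(params)  # e.g. {"BLOCK_SIZE_M": 64, ..., "num_warps": 4}
```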
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: 
-------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, 
"num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, 
"BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, 
"BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} 2 | -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}} 2 | -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": 
{"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "2048": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "512": {"BLOCK_M": 4, "BLOCK_N": 64, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 16}, "4096": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, 
"2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 16}, "64": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "512": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 16}, "2048": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "8": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 2}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 256, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "128": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "512": {"BLOCK_M": 32, "BLOCK_N": 128, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 8, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 
256, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, 
"num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 2}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 5}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, 
"NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, 
"64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/basemodel/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_weights.base_layer_weight import BaseLayerWeight 2 | from .layer_weights.pre_and_post_layer_weight import PreAndPostLayerWeight 3 | from .layer_weights.transformer_layer_weight import TransformerLayerWeight 4 | from .layer_infer.base_layer_infer import BaseLayerInfer 5 | from .layer_infer.pre_layer_infer import PreLayerInfer 6 | from .layer_infer.post_layer_infer import PostLayerInfer 7 | from .layer_infer.transformer_layer_infer import TransformerLayerInfer 8 | from .layer_infer.template.transformer_layer_infer_template import TransformerLayerInferTpl 9 | from .layer_infer.template.pre_layer_infer_template import PreLayerInferTpl 10 | from 
.layer_infer.template.post_layer_infer_template import PostLayerInferTpl 11 | from .infer_struct import InferStateInfo 12 | from .basemodel import TpPartBaseModel 13 | 14 | 15 | __all__ = [ 16 | "BaseLayerWeight", 17 | "PreAndPostLayerWeight", 18 | "TransformerLayerWeight", 19 | "BaseLayerInfer", 20 | "PreLayerInfer", 21 | "PostLayerInfer", 22 | "TransformerLayerInfer", 23 | "TransformerLayerInferTpl", 24 | "InferStateInfo", 25 | "TpPartBaseModel", 26 | "PreLayerInferTpl", 27 | "PostLayerInferTpl", 28 | ] 29 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class PostLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, network_config, mode): 8 | super().__init__() 9 | self.network_config_ = network_config 10 | self.mode = mode 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/pre_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class PreLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, network_config, mode): 8 | super().__init__() 9 | self.network_config_ = network_config 10 | self.mode = mode 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/template/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/post_layer_infer_template.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..post_layer_infer import PostLayerInfer 3 | from typing import Tuple 4 | 5 | 6 | class PostLayerInferTpl(PostLayerInfer): 7 | """ """ 8 | 9 | def __init__(self, network_config, mode): 10 | super().__init__(network_config, mode) 11 | self.eps_ = 1e-5 12 | self.vocab_size_ = network_config["vocab_size"] 13 | self.embed_dim_ = network_config["n_embed"] 14 | return 15 | 16 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor: 17 | raise Exception("need to impl") 18 | 19 | def _slice_get_last_input(self, input, infer_state) -> Tuple[torch.Tensor, int]: 20 | raise Exception("need to impl") 21 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/pre_layer_infer_template.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..pre_layer_infer import PreLayerInfer 3 | 4 | 5 | class PreLayerInferTpl(PreLayerInfer): 6 | """ """ 7 | 8 | def __init__(self, network_config, mode): 9 | super().__init__(network_config, 
mode) 10 | self.eps_ = 1e-5 11 | self.vob_start_id_ = -1 12 | self.vob_end_id_ = -1 13 | return 14 | 15 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor: 16 | raise Exception("need to impl") 17 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class TransformerLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, layer_num, network_config, mode): 8 | super().__init__() 9 | self.layer_num_ = layer_num 10 | self.network_config_ = network_config 11 | self.mode = mode 12 | return 13 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/meta_weights/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_weight import BaseWeight 2 | from .mm_weight import ( 3 | MMWeightTpl, 4 | MultiMMWeightTpl, 5 | ROWMMWeight, 6 | COLMMWeight, 7 | MultiROWMMWeight, 8 | ROWBMMWeight, 9 | ) 10 | from .norm_weight import NormWeight, GEMMANormWeight, TpNormWeight 11 | from .fused_moe_weight_tp import FusedMoeWeightTP 12 | from .fused_moe_weight_ep import FusedMoeWeightEP 13 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py: -------------------------------------------------------------------------------- 1 | from .mm_weight import ( 2 | MMWeightTpl, 3 | MultiMMWeightTpl, 4 | ) 5 | from .rowmm_weight import ( 6 | ROWMMWeight, 7 | ROWBMMWeight, 8 | MultiROWMMWeight, 9 | W8A8B128ROWMMWeight, 10 | W8A8B128ROWBMMWeight, 11 | W8A8B128MultiROWMMWeight, 12 | ) 13 | from .colmm_weight import ( 14 | COLMMWeight, 15 | W8A8B128COLMMWeight, 16 | ) 17 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | from .base_layer_weight import BaseLayerWeight 2 | 3 | 4 | class PreAndPostLayerWeight(BaseLayerWeight): 5 | def __init__(self, data_type, network_config, mode): 6 | super().__init__() 7 | self.data_type_ = data_type 8 | self.network_config_ = network_config 9 | self.mode = mode 10 | self.init_static_params() 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/microbatch_overlap_objs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class DecodeMicroBatch: 7 | batch_size: int 8 | total_token_num: int 9 | max_len_in_batch: int 10 | input_ids: torch.Tensor 11 | mem_indexes: torch.Tensor 12 | b_req_idx: torch.Tensor 13 | b_seq_len: torch.Tensor 14 | 15 | 16 | @dataclass 17 | class PrefillMicroBatch: 18 | batch_size: int 19 | total_token_num: int 20 | max_len_in_batch: int 21 | input_ids: torch.Tensor 22 | 
mem_indexes: torch.Tensor 23 | b_req_idx: torch.Tensor 24 | b_seq_len: torch.Tensor 25 | b_ready_cache_len: torch.Tensor 26 | multimodal_params: list 27 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/add_in_place.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _add_in_place( 9 | input_ptr, 10 | other_ptr, 11 | n_elements, 12 | alpha, 13 | BLOCK_SIZE: tl.constexpr, 14 | ): 15 | pid = tl.program_id(axis=0) 16 | block_start = pid * BLOCK_SIZE 17 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < n_elements 19 | x = tl.load(input_ptr + offsets, mask=mask) 20 | y = tl.load(other_ptr + offsets, mask=mask) 21 | x = x + y * alpha 22 | tl.store(input_ptr + offsets, x, mask=mask) 23 | 24 | 25 | @torch.no_grad() 26 | def add_in_place(input: torch.Tensor, other: torch.Tensor, *, alpha=1): 27 | assert input.is_contiguous(), "input tensor must be contiguous" 28 | assert other.is_contiguous(), "other tensor must be contiguous" 29 | n_elements = input.numel() 30 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 31 | _add_in_place[grid]( 32 | input, 33 | other, 34 | n_elements, 35 | alpha, 36 | BLOCK_SIZE=1024, 37 | ) 38 | return input 39 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/copy_kv_index_to_req.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _fwd_kernel_copy_kv_index_to_req( 9 | req_to_token_indexs, b_req_idx, b_seq_len, memindex, 10 | stride_req_to_token_b, stride_req_to_token_s 11 | ): 12 | cur_index = tl.program_id(0) 13 | cur_req_idx = tl.load(b_req_idx + cur_index) 14 | cur_token_index = tl.load(memindex + cur_index) 15 | cur_seq_len = tl.load(b_seq_len + cur_index) 16 | dest_offset = req_to_token_indexs + cur_req_idx * stride_req_to_token_b + (cur_seq_len - 1) * stride_req_to_token_s 17 | tl.store(dest_offset, cur_token_index) 18 | return 19 | 20 | 21 | @torch.no_grad() 22 | def copy_kv_index_to_req(req_to_token_indexs, b_req_idx, b_seq_len, memindex): 23 | seq_len = b_seq_len.shape[0] 24 | assert b_seq_len.shape[0] == memindex.shape[0] and b_req_idx.shape[0] == b_seq_len.shape[0] 25 | grid = (seq_len,) 26 | num_warps = 1 27 | 28 | _fwd_kernel_copy_kv_index_to_req[grid]( 29 | req_to_token_indexs, b_req_idx, b_seq_len, memindex, 30 | req_to_token_indexs.stride(0), req_to_token_indexs.stride(1), 31 | num_warps=num_warps, 32 | num_stages=1, 33 | ) 34 | return 35 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/gen_decode_params.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | from .gen_prefill_params import gen_cumsum_pad0_tensor 5 | 6 | 7 | @torch.no_grad() 8 | def 
gen_decode_params(b_seq_len: torch.Tensor): 9 | b_kv_seq_len = b_seq_len 10 | position_ids = b_seq_len - 1 11 | b_q_seq_len = torch.ones_like(b_seq_len) 12 | b1_cu_q_seq_len, b1_cu_kv_seq_len = gen_cumsum_pad0_tensor(b_q_seq_len, b_kv_seq_len) 13 | max_q_seq_len = b_q_seq_len.max().item() 14 | max_kv_seq_len = b_kv_seq_len.max().item() 15 | return b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len, position_ids, max_q_seq_len, max_kv_seq_len 16 | -------------------------------------------------------------------------------- /lightllm/common/build_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def repair_config(config, same_names): 3 | find_value = None 4 | for name in same_names: 5 | if name in config and config[name] is not None: 6 | find_value = config[name] 7 | break 8 | for name in same_names: 9 | config[name] = find_value 10 | return -------------------------------------------------------------------------------- /lightllm/common/deepseek2_fp8kv_mem_manager.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .deepseek2_mem_manager import Deepseek2MemoryManager 3 | 4 | 5 | class Deepseek2FP8KVMemoryManager(Deepseek2MemoryManager): 6 | def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9): 7 | # The scale is appended to the end of kv_buffer, so head_dim is increased by 2, and dtype is unified to uint8 8 | super().__init__(size, torch.uint8, head_num, head_dim + 2, layer_num, always_copy, mem_fraction) 9 | -------------------------------------------------------------------------------- /lightllm/common/fused_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/fused_moe/__init__.py -------------------------------------------------------------------------------- /lightllm/common/infer_utils.py: -------------------------------------------------------------------------------- 1 | def init_req_to_token_indexes( 2 | req_to_token_indexs, b_req_idx, b_seq_len, b_ready_cache_len, max_len_in_batch, alloc_mem_index 3 | ): 4 | start_index = 0 5 | b_seq_len_numpy = b_seq_len.cpu().numpy() 6 | b_ready_cache_len_numpy = b_ready_cache_len.cpu().numpy() 7 | b_req_idx_numpy = b_req_idx.cpu().numpy() 8 | for i in range(len(b_seq_len)): 9 | cur_seq_len = b_seq_len_numpy[i] 10 | cur_ready_cache_len = b_ready_cache_len_numpy[i] 11 | req_to_token_indexs[b_req_idx_numpy[i], cur_ready_cache_len:cur_seq_len] = alloc_mem_index[ 12 | start_index : start_index + cur_seq_len - cur_ready_cache_len 13 | ] 14 | start_index += cur_seq_len - cur_ready_cache_len 15 | return 16 | -------------------------------------------------------------------------------- /lightllm/common/kv_trans_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/kv_trans_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/common/mem_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.common.mem_manager import MemoryManager 2 | from lightllm.common.int8kv_mem_manager import INT8KVMemoryManager 3 | from lightllm.common.ppl_int8kv_mem_manager import PPLINT8KVMemoryManager 4 | from lightllm.common.ppl_int4kv_mem_manager import 
PPLINT4KVMemoryManager 5 | from lightllm.utils.log_utils import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def select_mem_manager_class(mode): 11 | logger.info(f"mode setting params: {mode}") 12 | if "ppl_int8kv" in mode or "ppl_int8kv_flashdecoding" in mode: 13 | memory_manager_class = PPLINT8KVMemoryManager 14 | logger.info(f"Model kv cache using mode {mode}") 15 | elif "ppl_int4kv_flashdecoding" in mode: 16 | memory_manager_class = PPLINT4KVMemoryManager 17 | logger.info(f"Model kv cache using mode {mode}") 18 | elif "triton_int8kv" in mode: 19 | memory_manager_class = INT8KVMemoryManager 20 | logger.info("Model kv cache using mode triton int8kv") 21 | elif "triton_fp8kv" in mode: 22 | raise Exception("currently only for deepseek") 23 | else: 24 | memory_manager_class = MemoryManager 25 | logger.info("Model kv cache using mode normal") 26 | return memory_manager_class 27 | -------------------------------------------------------------------------------- /lightllm/common/quantization/configs/llamacls-mix-down.yaml: -------------------------------------------------------------------------------- 1 | quant_type: vllm-w8a8 2 | mix_bits: 3 | - name: "down_proj" 4 | quant_type: "none" 5 | layer_nums: [1, 2, 3] # Defaults to all layers, or you can specify a layer_num list. -------------------------------------------------------------------------------- /lightllm/common/quantization/quantize_method.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from abc import ABC, abstractmethod 3 | from lightllm.utils.dist_utils import get_current_device_id 4 | 5 | 6 | class QuantizationMethod(ABC): 7 | def __init__(self): 8 | super().__init__() 9 | self.device_id_ = get_current_device_id() 10 | self.weight_scale_suffix = None 11 | self.act_scale_suffix = None 12 | 13 | @abstractmethod 14 | def quantize(self, weights: torch.Tensor): 15 | pass 16 | 17 | @abstractmethod 18 | def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True): 19 | pass 20 | -------------------------------------------------------------------------------- /lightllm/common/quantization/registry.py: -------------------------------------------------------------------------------- 1 | class QuantMethodFactory: 2 | def __init__(self): 3 | self._quant_methods = {} 4 | 5 | def register(self, names): 6 | def decorator(cls): 7 | local_names = names 8 | if isinstance(local_names, str): 9 | local_names = [local_names] 10 | for n in local_names: 11 | self._quant_methods[n] = cls 12 | return cls 13 | 14 | return decorator 15 | 16 | def get(self, key, *args, **kwargs): 17 | if key == "none": 18 | return None 19 | quant_method_class = self._quant_methods.get(key) 20 | if not quant_method_class: 21 | raise ValueError(f"QuantMethod '{key}' not supported.") 22 | return quant_method_class() 23 | 24 | 25 | QUANTMETHODS = QuantMethodFactory() 26 | -------------------------------------------------------------------------------- /lightllm/common/quantization/triton_quant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/__init__.py -------------------------------------------------------------------------------- /lightllm/common/quantization/triton_quant/fp8/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/fp8/__init__.py -------------------------------------------------------------------------------- /lightllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | -------------------------------------------------------------------------------- /lightllm/models/bloom/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/__init__.py 
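The QuantMethodFactory in lightllm/common/quantization/registry.py above is a plain name-to-class registry: implementations attach themselves with the register decorator (a single name or a list of aliases) and callers obtain a fresh instance by name through get, with "none" special-cased to return None. A minimal usage sketch follows; the DemoQuantMethod class and the "demo" names are hypothetical and not part of the repository.

from lightllm.common.quantization.registry import QUANTMETHODS

# Hypothetical quant method registered under two alias names; real methods in the
# repo subclass QuantizationMethod and implement quantize()/apply().
@QUANTMETHODS.register(["demo-w8a8", "demo"])
class DemoQuantMethod:
    def quantize(self, weights):
        return weights  # placeholder: a real method would return quantized weights plus scales

    def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True):
        return input_tensor @ weight  # placeholder

assert isinstance(QUANTMETHODS.get("demo"), DemoQuantMethod)  # instantiated by name
assert QUANTMETHODS.get("none") is None  # "none" is special-cased to disable quantization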
-------------------------------------------------------------------------------- /lightllm/models/cohere/infer_struct.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 2 | 3 | 4 | class CohereInferStateInfo(LlamaInferStateInfo): 5 | def __init__(self): 6 | super().__init__() 7 | self._attn_out = None 8 | self._ffn_out = None 9 | -------------------------------------------------------------------------------- /lightllm/models/cohere/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/triton_kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/triton_kernels/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/infer_struct.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import torch.distributed as dist 5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 6 | 7 | 8 | class Deepseek2InferStateInfo(LlamaInferStateInfo): 9 | def __init__(self): 10 | super().__init__() 11 | self.kv_starts = None 12 | 13 | def init_some_extra_state(self, model, input_ids: torch.Tensor): 14 | super().init_some_extra_state(model, input_ids) 15 | if not self.is_prefill: 16 | self.kv_starts = self.b1_cu_kv_seq_len 17 | 18 | if self.is_prefill: 19 | self.b1_kv_start_loc = self.b1_cu_kv_seq_len 20 | self.max_value_in_b_seq_len = self.b_seq_len.max().item() 21 | return 22 | -------------------------------------------------------------------------------- /lightllm/models/deepseek2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/triton_kernel/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | # add key: language_model.xxx -> xxx 7 | # only change keys at PreAndPostLayerWeight load, TransformLayerWeight is correct now 8 | def rename_weight_keys(weights): 9 | prefix = "language_model." 
10 | keys = list(weights.keys()) 11 | for k in keys: 12 | if prefix in k: 13 | weights[k[len(prefix) :]] = weights[k] 14 | 15 | 16 | class Gemma3PreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 17 | def __init__(self, data_type, network_config, mode): 18 | network_config["tie_word_embeddings"] = True 19 | super().__init__(data_type, network_config, mode) 20 | return 21 | 22 | def load_hf_weights(self, weights): 23 | rename_weight_keys(weights) 24 | super().load_hf_weights(weights) 25 | return 26 | -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Gemma_2bPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.embed_tokens.weight" in weights: 17 | # print(weights['model.embed_tokens.weight'].shape) 18 | self.wte_weight_ = self._cuda(weights["model.embed_tokens.weight"][split_start:split_end, :]) 19 | self.lm_head_weight_ = self.wte_weight_ 20 | 21 | if "model.norm.weight" in weights: 22 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 23 | self.final_norm_weight_ = self.final_norm_weight_ + 1 24 | 25 | return 26 | -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/__init__.py
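The load_hf_weights methods in the gemma_2b weights above (and the internlm2 variants further down) all shard the vocabulary across tensor-parallel ranks with the same np.linspace pattern. A small worked sketch of those boundaries, using hypothetical sizes (vocab 32000, 4 ranks) rather than anything taken from the repo:

import numpy as np

vob_size, tp_world_size = 32000, 4
split_indexes = np.linspace(0, vob_size, tp_world_size + 1, dtype=np.int64)
# split_indexes -> [0, 8000, 16000, 24000, 32000]
for tp_rank in range(tp_world_size):
    split_start, split_end = split_indexes[tp_rank], split_indexes[tp_rank + 1]
    # rank 0 keeps embedding rows [0, 8000), rank 1 keeps [8000, 16000), and so on
    print(tp_rank, split_start, split_end)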
-------------------------------------------------------------------------------- /lightllm/models/internlm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import numpy as np 4 | 5 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight 6 | 7 | 8 | class InternlmTransformerLayerWeight(LlamaTransformerLayerWeight): 9 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 10 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 11 | return 12 | 13 | def _init_weight_names(self): 14 | super()._init_weight_names() 15 | self._q_bias_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.bias" 16 | self._k_bias_name = f"model.layers.{self.layer_num_}.self_attn.k_proj.bias" 17 | self._v_bias_name = f"model.layers.{self.layer_num_}.self_attn.v_proj.bias" 18 | self._o_bias_name = f"model.layers.{self.layer_num_}.self_attn.o_proj.bias" 19 | -------------------------------------------------------------------------------- /lightllm/models/internlm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.internlm.layer_weights.transformer_layer_weight import InternlmTransformerLayerWeight 6 | from lightllm.models.llama.model import LlamaTpPartModel 7 | 8 | 9 | @ModelRegistry("internlm") 10 | class InternlmTpPartModel(LlamaTpPartModel): 11 | # weight class 12 | transformer_weight_class = InternlmTransformerLayerWeight 13 | 14 | def __init__(self, kvargs): 15 | super().__init__(kvargs) 16 | -------------------------------------------------------------------------------- /lightllm/models/internlm2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Internlm2PreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, 
self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.tok_embeddings.weight" in weights: 17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :]) 18 | if "output.weight" in weights: 19 | self.lm_head_weight_ = self._cuda(weights["output.weight"][split_start:split_end, :]) 20 | if "model.norm.weight" in weights: 21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 22 | 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/internlm2/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | 5 | from lightllm.models.registry import ModelRegistry 6 | from lightllm.models.internlm2.layer_weights.transformer_layer_weight import Internlm2TransformerLayerWeight 7 | from lightllm.models.internlm2.layer_weights.pre_and_post_layer_weight import Internlm2PreAndPostLayerWeight 8 | from lightllm.models.internlm.model import InternlmTpPartModel 9 | 10 | 11 | @ModelRegistry("internlm2") 12 | class Internlm2TpPartModel(InternlmTpPartModel): 13 | # weight class 14 | pre_and_post_weight_class = Internlm2PreAndPostLayerWeight 15 | transformer_weight_class = Internlm2TransformerLayerWeight 16 | 17 | def __init__(self, kvargs): 18 | super().__init__(kvargs) 19 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import numpy as np 4 | 5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 6 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer 7 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 8 | from einops import rearrange 9 | 10 | 11 | class Internlm2RewardPostLayerInfer(LlamaPostLayerInfer): 12 | def token_forward(self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight): 13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state) 14 | 15 | input_embdings = None 16 | last_input = self._norm(last_input, infer_state, layer_weight) 17 | score = torch.mm(last_input, layer_weight.lm_head_weight_) 18 | 19 | return score 20 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_weights/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Internlm2RewardPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.tok_embeddings.weight" in weights: 17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :]) 18 | if "v_head.weight" in weights: 19 | self.lm_head_weight_ = self._cuda(weights["v_head.weight"]).transpose(0, 1) 20 | if "model.norm.weight" in weights: 21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 22 | 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry, is_reward_model 5 | from lightllm.models.internlm2_reward.layer_infer.post_layer_infer import Internlm2RewardPostLayerInfer 6 | from lightllm.models.internlm2_reward.layer_weights.pre_and_post_layer_weight import ( 7 | Internlm2RewardPreAndPostLayerWeight, 8 | ) 9 | from lightllm.models.internlm2.model import Internlm2TpPartModel 10 | 11 | 12 | @ModelRegistry("internlm2", condition=is_reward_model()) 13 | class Internlm2RewardTpPartModel(Internlm2TpPartModel): 14 | # weight class 15 | pre_and_post_weight_class = Internlm2RewardPreAndPostLayerWeight 16 | 17 | post_layer_infer_class = Internlm2RewardPostLayerInfer 18 | 19 | def __init__(self, kvargs): 20 | super().__init__(kvargs) 21 | -------------------------------------------------------------------------------- /lightllm/models/internvl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internvl/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/__init__.py -------------------------------------------------------------------------------- 
/lightllm/models/llama/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/yarn_rotary_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | # Inverse dim formula to find dim based on number of rotations 6 | def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): 7 | return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) 8 | 9 | 10 | # Find dim range bounds based on rotations 11 | def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): 12 | low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) 13 | high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) 14 | return max(low, 0), min(high, dim - 1) # Clamp values just in case 15 | 16 | 17 | def linear_ramp_mask(min, max, dim): 18 | if min == max: 19 | max += 0.001 # Prevent singularity 20 | 21 | linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) 22 | ramp_func = torch.clamp(linear_func, 0, 1) 23 | return ramp_func 24 | 25 | 26 | def get_mscale(scale=1): 27 | if scale <= 1: 28 | return 1.0 29 | return 0.1 * math.log(scale) + 1.0 30 | 31 | 32 | def get_deepseek_mscale(scale=1, mscale=1): 33 | if scale <= 1: 34 | return 1.0 35 | return 0.1 * mscale * math.log(scale) + 1.0 36 | -------------------------------------------------------------------------------- /lightllm/models/llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llava/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llava/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | # add key: language_model.xxx -> xxx 7 | # only change keys at 
PreAndPostLayerWeight load, TransformLayerWeight is correct now 8 | def rename_weight_keys(weights): 9 | prefix = "language_model." 10 | keys = list(weights.keys()) 11 | for k in keys: 12 | if prefix in k: 13 | weights[k[len(prefix) :]] = weights[k] 14 | 15 | 16 | class LlavaPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 17 | def __init__(self, data_type, network_config, mode): 18 | super().__init__(data_type, network_config, mode) 19 | return 20 | 21 | def load_hf_weights(self, weights): 22 | rename_weight_keys(weights) 23 | super().load_hf_weights(weights) 24 | return 25 | -------------------------------------------------------------------------------- /lightllm/models/minicpm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/__init__.py -------------------------------------------------------------------------------- /lightllm/models/minicpm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/minicpm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | import math 2 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight 3 | 4 | 5 | class MiniCPMTransformerLayerWeight(LlamaTransformerLayerWeight): 6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 8 | return 9 | 10 | def _parse_config(self): 11 | super()._parse_config() 12 | num_hidden_layers = self.network_config_["num_hidden_layers"] 13 | scale_depth = self.network_config_.get("scale_depth", math.sqrt(num_hidden_layers)) 14 | self.layer_scale = scale_depth / math.sqrt(num_hidden_layers) 15 | 16 | def load_hf_weights(self, weights): 17 | if self._o_weight_name in weights: 18 | weights[self._o_weight_name] *= self.layer_scale 19 | if self._down_weight_name in weights: 20 | weights[self._down_weight_name] *= self.layer_scale 21 | super().load_hf_weights(weights) 22 | -------------------------------------------------------------------------------- /lightllm/models/minicpm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.minicpm.layer_weights.transformer_layer_weight import MiniCPMTransformerLayerWeight 6 | from lightllm.models.minicpm.layer_weights.pre_and_post_layer_weight import MiniCPMPreAndPostLayerWeight 7 | from lightllm.models.llama.model import LlamaTpPartModel 8 | 9 | 10 | @ModelRegistry("minicpm") 11 | class MiniCPMTpPartModel(LlamaTpPartModel): 12 | # weight class 13 | transformer_weight_class = MiniCPMTransformerLayerWeight 14 | pre_and_post_weight_class = MiniCPMPreAndPostLayerWeight 15 | 16 | def __init__(self, kvargs): 17 | super().__init__(kvargs) 18 | -------------------------------------------------------------------------------- /lightllm/models/mistral/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer 2 | 3 | 4 | class MistralTransformerLayerInfer(LlamaTransformerLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, layer_num, network_config, mode=[]): 8 | super().__init__(layer_num, network_config, mode) 9 | self.head_dim_ = network_config.get("head_dim", self.head_dim_) 10 | return 11 | -------------------------------------------------------------------------------- /lightllm/models/mistral/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/triton_kernel/init_att_sliding_window_info.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _fwd_kernel_init_att_window_info( 9 | b_seq_len, 10 | b_att_seq_len, 11 | batch_size, 12 | sliding_window, 13 | BLOCK_SIZE: tl.constexpr, 14 | ): 15 | cur_index = tl.program_id(0) 16 | cur_start = cur_index * BLOCK_SIZE 17 | offsets = cur_start + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < batch_size 19 | 20 | cur_seq_len = tl.load(b_seq_len + offsets, mask=mask) 21 | b_att_seq_len_data = tl.minimum(cur_seq_len, sliding_window) 22 | 23 | tl.store(b_att_seq_len + offsets, b_att_seq_len_data, mask=mask) 24 | return 25 | 26 | 27 | @torch.no_grad() 28 | def init_att_window_info_fwd(batch_size, b_seq_len, b_att_seq_len, sliding_window): 29 | # shape constraints 30 | assert batch_size == b_seq_len.shape[0] == b_att_seq_len.shape[0] 31 | 32 | BLOCK_SIZE = 32 33 | num_warps = 1 34 | grid = (triton.cdiv(batch_size, BLOCK_SIZE),) 35 | 36 | _fwd_kernel_init_att_window_info[grid]( 37 | b_seq_len, 38 | b_att_seq_len, 39 | batch_size=batch_size, 40 | sliding_window=sliding_window, 41 | BLOCK_SIZE=BLOCK_SIZE, 42 | num_warps=num_warps, 43 | num_stages=1, 44 | ) 45 | return 46 | -------------------------------------------------------------------------------- /lightllm/models/mixtral/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mixtral/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_infer/__init__.py 
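The init_att_window_info_fwd kernel above simply clamps each request's sequence length to the sliding window, one batch slice per Triton program. A plain-PyTorch reference of the same computation, for illustration only (the batch values below are made up):

import torch

def init_att_window_info_ref(b_seq_len: torch.Tensor, sliding_window: int) -> torch.Tensor:
    # same result as the Triton kernel: b_att_seq_len[i] = min(b_seq_len[i], sliding_window)
    return torch.clamp(b_seq_len, max=sliding_window)

b_seq_len = torch.tensor([3, 4096, 9000])
print(init_att_window_info_ref(b_seq_len, sliding_window=4096))  # tensor([   3, 4096, 4096])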
-------------------------------------------------------------------------------- /lightllm/models/mixtral/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.phi3.layer_weights.transformer_layer_weight import Phi3TransformerLayerWeight 6 | from lightllm.models.phi3.layer_infer.transformer_layer_infer import Phi3TransformerLayerInfer 7 | from lightllm.models.llama.model import LlamaTpPartModel 8 | 9 | 10 | @ModelRegistry("phi3") 11 | class Phi3TpPartModel(LlamaTpPartModel): 12 | # weight class 13 | transformer_weight_class = Phi3TransformerLayerWeight 14 | 15 | transformer_layer_infer_class = Phi3TransformerLayerInfer 16 | 17 | def __init__(self, kvargs): 18 | super().__init__(kvargs) 19 | -------------------------------------------------------------------------------- /lightllm/models/phi3/triton_kernel/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_weights/__init__.py -------------------------------------------------------------------------------- 
/lightllm/models/qwen2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_5_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_5_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_infer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 4 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer 5 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight 6 | from einops import rearrange 7 | 8 | 9 | class Qwen2RewardPostLayerInfer(LlamaPostLayerInfer): 10 | def token_forward( 11 | self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: Qwen2RewardPreAndPostLayerWeight 12 | ): 13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state) 14 | 15 | input_embdings = None 16 | last_input = self._norm(last_input, infer_state, layer_weight) 17 | 18 | last_input = torch.addmm(layer_weight.score_up_bias, last_input, layer_weight.score_up_weight) 19 | last_input = torch.nn.functional.relu(last_input) 20 | score = torch.addmm(layer_weight.score_down_bias, last_input, layer_weight.score_down_weight) 21 | 22 | return score 23 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_weights/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/model.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.registry import ModelRegistry, is_reward_model 2 | from lightllm.models.qwen2_reward.layer_infer.post_layer_infer import Qwen2RewardPostLayerInfer 3 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight 4 | from lightllm.models.qwen2.model import Qwen2TpPartModel 5 | 6 | 7 | @ModelRegistry("qwen2", condition=is_reward_model()) 8 | class Qwen2RewardTpPartModel(Qwen2TpPartModel): 9 | 10 | pre_and_post_weight_class = 
Qwen2RewardPreAndPostLayerWeight 11 | post_layer_infer_class = Qwen2RewardPostLayerInfer 12 | 13 | def __init__(self, kvargs): 14 | super().__init__(kvargs) 15 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/infer_struct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 4 | from lightllm.common.basemodel.infer_struct import InferStateInfo 5 | 6 | 7 | class Qwen2VLInferStateInfo(LlamaInferStateInfo): 8 | def __init__(self): 9 | super().__init__() 10 | self.position_cos = None 11 | self.position_sin = None 12 | 13 | def init_some_extra_state(self, model, input_ids: torch.Tensor): 14 | InferStateInfo.init_some_extra_state(self, model, input_ids) 15 | if self.is_prefill: 16 | position_ids = self.position_ids 17 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1) 18 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1) 19 | position_ids = None 20 | else: 21 | position_ids = self.position_ids 22 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1) 23 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1) 24 | return 25 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from typing import final 3 | from lightllm.models.registry import ModelRegistry 4 | from lightllm.models.qwen3.layer_infer.transformer_layer_infer import Qwen3TransformerLayerInfer 5 | from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight 6 | from lightllm.models.qwen2.model import Qwen2TpPartModel 7 | from lightllm.utils.log_utils import init_logger 8 | 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | @ModelRegistry("qwen3") 14 | class Qwen3TpPartModel(Qwen2TpPartModel): 15 | # weight class 16 | transformer_weight_class = Qwen3TransformerLayerWeight 17 | 18 | # infer class 19 | transformer_layer_infer_class = Qwen3TransformerLayerInfer 20 | 21 | def __init__(self, kvargs): 22 | super().__init__(kvargs) 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import final 3 | from lightllm.models.registry import ModelRegistry 4 | from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer 5 | from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight 6 | from lightllm.models.qwen3.model import Qwen3TpPartModel 7 | from lightllm.utils.log_utils import init_logger 8 | 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | @ModelRegistry("qwen3_moe") 14 | class Qwen3MOEModel(Qwen3TpPartModel): 15 | # weight class 16 | transformer_weight_class = Qwen3MOETransformerLayerWeight 17 | 18 | # infer class 19 | transformer_layer_infer_class = Qwen3MOETransformerLayerInfer 20 | 21 | def __init__(self, kvargs): 22 | super().__init__(kvargs) 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/qwen_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen_vl/layer_infer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.qwen2.layer_weights.transformer_layer_weight import Qwen2TransformerLayerWeight 2 | from lightllm.common.basemodel.layer_weights.meta_weights import NormWeight 3 | 4 | 5 | class StablelmTransformerLayerWeight(Qwen2TransformerLayerWeight): 6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 8 | return 9 | 10 | def _init_weight_names(self): 11 | super()._init_weight_names() 12 | self._att_norm_bias_name = f"model.layers.{self.layer_num_}.input_layernorm.bias" 13 | self._ffn_norm_bias_name = f"model.layers.{self.layer_num_}.post_attention_layernorm.bias" 14 | -------------------------------------------------------------------------------- /lightllm/models/stablelm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.stablelm.layer_infer.transformer_layer_infer import StablelmTransformerLayerInfer 6 | from lightllm.models.bloom.layer_infer.post_layer_infer import BloomPostLayerInfer 7 | from lightllm.models.stablelm.layer_weights.pre_and_post_layer_weight import StableLMPreAndPostLayerWeight 8 | from lightllm.models.stablelm.layer_weights.transformer_layer_weight import StablelmTransformerLayerWeight 9 | from lightllm.models.llama.model import LlamaTpPartModel 10 | from lightllm.common.build_utils import repair_config 11 | 12 | 13 | @ModelRegistry("stablelm") 14 | class StablelmTpPartModel(LlamaTpPartModel): 15 | # weight class 16 | pre_and_post_weight_class = StableLMPreAndPostLayerWeight 17 | transformer_weight_class = StablelmTransformerLayerWeight 18 | 19 | # infer class 20 | transformer_layer_infer_class = StablelmTransformerLayerInfer 21 | post_layer_infer_class = BloomPostLayerInfer 22 | 23 | def __init__(self, kvargs): 24 | super().__init__(kvargs) 25 | 26 | def _init_config(self): 27 | super()._init_config() 28 | repair_config(self.config, same_names=["rms_norm_eps", "layer_norm_eps", 
"layer_norm_epsilon"]) 29 | return 30 | -------------------------------------------------------------------------------- /lightllm/models/starcoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.bloom.layer_infer.transformer_layer_infer import BloomTransformerLayerInfer 2 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer 3 | 4 | 5 | class StarcoderTransformerLayerInfer(BloomTransformerLayerInfer): 6 | """ """ 7 | 8 | def __init__(self, layer_num, network_config, mode=[]): 9 | super().__init__(layer_num, network_config, mode) 10 | self.tp_k_head_num_ = 1 11 | self.tp_v_head_num_ = 1 12 | self._bind_func() 13 | return 14 | 15 | def _bind_func(self): 16 | LlamaTransformerLayerInfer._bind_attention(self) 17 | return 18 | -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/tarsier2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/tarsier2/layer_weights/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/triton_kernel/gelu_vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager 5 | 6 | 7 | @triton.jit 8 | def gelu(x): 9 | x_fp32 = x.to(tl.float32) 10 | x_gelu = 0.5 * x_fp32 * (1 + tl.math.erf(x_fp32 * 0.7071067811)) 11 | return x_gelu 12 | 13 | 14 | @triton.jit 15 | def gelu_kernel(output_ptr, input_ptr, n_elements, BLOCK_SIZE: tl.constexpr): 16 | pid = tl.program_id(axis=0) 17 | block_start = pid * BLOCK_SIZE 18 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 19 | mask = offsets < n_elements 20 | input = tl.load(input_ptr + offsets, mask=mask) 21 | output = gelu(input) 22 | tl.store(output_ptr + offsets, output, mask=mask) 23 | 24 | 25 | def gelu_fwd(input, use_custom_tensor_mananger=False): 26 | if use_custom_tensor_mananger: 27 | shape = input.shape 28 | dtype = input.dtype 29 | device = input.device 30 | output = g_cache_manager.alloc_tensor(shape, dtype, device=device) 31 | else: 32 | output = torch.empty_like(input) 33 | assert input.is_contiguous(), "Input tensor must be contiguous" 34 | n_elements = input.numel() 35 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 36 | gelu_kernel[grid](output, input, n_elements, BLOCK_SIZE=1024) 37 | return output 38 | -------------------------------------------------------------------------------- /lightllm/models/whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/whisper/__init__.py -------------------------------------------------------------------------------- /lightllm/models/whisper/defaults.py: -------------------------------------------------------------------------------- 1 | MIN_AUDIO_LEN = 480 # minimum audio length 2 | -------------------------------------------------------------------------------- /lightllm/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .router.token_load import TokenLoad 2 |
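The gelu helper in gelu_vit.py above implements the exact (erf-based) GELU, 0.5 * x * (1 + erf(x / sqrt(2))); the constant 0.7071067811 is 1/sqrt(2). A quick sanity check against PyTorch's reference GELU, included only as an illustration and assuming a recent torch build:

import math
import torch

x = torch.randn(8, dtype=torch.float32)
reference = 0.5 * x * (1 + torch.erf(x / math.sqrt(2)))
assert torch.allclose(reference, torch.nn.functional.gelu(x), atol=1e-6)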
-------------------------------------------------------------------------------- /lightllm/server/api_server.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .api_cli import make_argument_parser 3 | 4 | if __name__ == "__main__": 5 | torch.multiprocessing.set_start_method("spawn") # spawn is required here; forking subprocesses does not work with these settings 6 | parser = make_argument_parser() 7 | args = parser.parse_args() 8 | from .api_start import pd_master_start, normal_or_p_d_start, config_server_start 9 | 10 | if args.run_mode == "pd_master": 11 | pd_master_start(args) 12 | elif args.run_mode == "config_server": 13 | config_server_start(args) 14 | else: 15 | normal_or_p_d_start(args) 16 | -------------------------------------------------------------------------------- /lightllm/server/audioserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/audioserver/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/server/config_server/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a configuration service designed to facilitate the 3 | registration and retrieval of information in a PD separation mode. It 4 | allows various nodes to register their own information and query global 5 | configuration details efficiently. 6 | 7 | Key Features: 8 | - Node registration: Enables nodes to register their specific information. 9 | - Global configuration query: Provides mechanisms for querying shared 10 | configuration data across the system. 11 | - Designed for distributed systems operating in PD separation mode.
12 | """ 13 | -------------------------------------------------------------------------------- /lightllm/server/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/core/__init__.py -------------------------------------------------------------------------------- /lightllm/server/core/objs/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampling_params import SamplingParams 2 | from .req import Req, FinishStatus 3 | from .shm_req_manager import ShmReqManager 4 | from .rpc_shm import RpcShmParams, RpcShmResults, ShmSyncStatusArray 5 | -------------------------------------------------------------------------------- /lightllm/server/core/objs/io_objs/__init__.py: -------------------------------------------------------------------------------- 1 | from .group_req import GroupReqIndexes, GroupReqObjs 2 | -------------------------------------------------------------------------------- /lightllm/server/core/objs/io_objs/group_req.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from lightllm.server.multimodal_params import MultimodalParams 3 | from typing import List 4 | from ..req import Req 5 | 6 | 7 | @dataclass 8 | class GroupReqIndexes: 9 | group_req_id: int 10 | multimodal_params: MultimodalParams 11 | shm_req_indexes: List[int] 12 | time_mark: float 13 | 14 | 15 | @dataclass 16 | class GroupReqObjs: 17 | group_req_id: int 18 | multimodal_params: MultimodalParams 19 | shm_req_objs: List[Req] 20 | time_mark: float 21 | 22 | def to_group_req_index(self): 23 | return GroupReqIndexes( 24 | group_req_id=self.group_req_id, 25 | multimodal_params=self.multimodal_params, 26 | shm_req_indexes=[req.index_in_shm_mem for req in self.shm_req_objs], 27 | time_mark=self.time_mark, 28 | ) 29 | -------------------------------------------------------------------------------- /lightllm/server/detokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/detokenization/__init__.py -------------------------------------------------------------------------------- /lightllm/server/detokenization/decode_mode_fix.py: -------------------------------------------------------------------------------- 1 | """ 2 | p d 分离模式下, 对于到达的请求,需要将输入的prompt_ids 中的最后一个id,提前处理,然后移入到outputs中 3 | 这是 p d 分离模式下,decode 节点的特殊处理点。 4 | """ 5 | from .decode_req import DecodeReq 6 | from .decode import decode_token 7 | 8 | from lightllm.utils.log_utils import init_logger 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def decode_mode_fix(req_out: DecodeReq, tokenizer, eos_id): 14 | new_token_id = req_out.prompt_ids[-1] 15 | decode_token(tokenizer, req_out, new_token_id, eos_id) 16 | return req_out 17 | -------------------------------------------------------------------------------- /lightllm/server/embed_cache/__init__.py: -------------------------------------------------------------------------------- 1 | from . import impl -------------------------------------------------------------------------------- /lightllm/server/embed_cache/impl/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import naive_memory_cache -------------------------------------------------------------------------------- /lightllm/server/embed_cache/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from io import BytesIO 4 | import multiprocessing.shared_memory as shm 5 | 6 | 7 | def tensor2bytes(t: torch.Tensor): 8 | # t = t.cpu().numpy().tobytes() 9 | # return t 10 | buf = BytesIO() 11 | torch.save(t.detach().cpu(), buf) 12 | buf.seek(0) 13 | return buf.read() 14 | 15 | 16 | def bytes2tensor(b): 17 | # return torch.from_numpy(np.frombuffer(b, dtype=np.float16)).cuda() 18 | return torch.load(BytesIO(b)) 19 | 20 | 21 | def create_shm(name, data): 22 | try: 23 | data_size = len(data) 24 | shared_memory = shm.SharedMemory(name=name, create=True, size=data_size) 25 | mem_view = shared_memory.buf 26 | mem_view[:data_size] = data 27 | except FileExistsError: 28 | print("Warning create shm {} failed because of FileExistsError!".format(name)) 29 | 30 | 31 | def read_shm(name): 32 | shared_memory = shm.SharedMemory(name=name) 33 | data = shared_memory.buf.tobytes() 34 | return data 35 | 36 | 37 | def free_shm(name): 38 | shared_memory = shm.SharedMemory(name=name) 39 | shared_memory.close() 40 | shared_memory.unlink() 41 | 42 | 43 | def get_shm_name_data(uid): 44 | return str(uid) + "-data" 45 | 46 | 47 | def get_shm_name_embed(uid): 48 | return str(uid) + "-embed" 49 | -------------------------------------------------------------------------------- /lightllm/server/health_monitor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/health_monitor/__init__.py -------------------------------------------------------------------------------- /lightllm/server/httpserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/httpserver/async_queue.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | class AsyncQueue: 5 | def __init__(self): 6 | self.datas = [] 7 | self.event = asyncio.Event() 8 | self.lock = asyncio.Lock() 9 | 10 | async def wait_to_ready(self): 11 | try: 12 | await asyncio.wait_for(self.event.wait(), timeout=3) 13 | except asyncio.TimeoutError: 14 | pass 15 | 16 | async def get_all_data(self): 17 | async with self.lock: 18 | self.event.clear() 19 | ans = self.datas 20 | self.datas = [] 21 | return ans 22 | 23 | async def put(self, obj): 24 | async with self.lock: 25 | self.datas.append(obj) 26 | self.event.set() 27 | return 28 | 29 | async def wait_to_get_all_data(self): 30 | await self.wait_to_ready() 31 | handle_list = await self.get_all_data() 32 | return handle_list 33 | -------------------------------------------------------------------------------- /lightllm/server/httpserver_for_pd_master/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver_for_pd_master/__init__.py -------------------------------------------------------------------------------- 
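The shared-memory helpers in lightllm/server/embed_cache/utils.py above move serialized tensors between processes by name. Below is a minimal round-trip sketch, assuming the package is importable as laid out here; the uid and tensor shape are made-up example values, not part of lightllm.

```python
# Round-trip sketch for the shared-memory helpers shown above
# (lightllm/server/embed_cache/utils.py). The uid and tensor are illustrative only.
import torch
from lightllm.server.embed_cache.utils import (
    tensor2bytes,
    bytes2tensor,
    create_shm,
    read_shm,
    free_shm,
    get_shm_name_data,
)

uid = 12345  # hypothetical multimodal item id
embed = torch.randn(16, 4096, dtype=torch.float16)

# Producer side: serialize the tensor and publish it under a deterministic shm name.
create_shm(get_shm_name_data(uid), tensor2bytes(embed))

# Consumer side (possibly another process): read the bytes back and rebuild the tensor.
restored = bytes2tensor(read_shm(get_shm_name_data(uid)))
assert torch.equal(embed, restored)

# Release the shared-memory segment once every consumer is done with it.
free_shm(get_shm_name_data(uid))
```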
/lightllm/server/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .manager import start_metric_manager 2 | -------------------------------------------------------------------------------- /lightllm/server/router/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/dynamic_prompt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/dynamic_prompt/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/__init__.py: -------------------------------------------------------------------------------- 1 | from .continues_batch.impl import ContinuesBatchBackend 2 | from .continues_batch.impl_for_return_all_prompt_logprobs import ReturnPromptLogProbBackend 3 | from .continues_batch.impl_for_reward_model import RewardModelBackend 4 | from .chunked_prefill.impl import ChunkedPrefillBackend 5 | from .diverse_backend.impl import DiversehBackend 6 | from .chunked_prefill.impl_for_token_healing import TokenHealingBackend 7 | from .chunked_prefill.impl_for_outlines_constraint_mode import OutlinesConstraintBackend 8 | from .chunked_prefill.impl_for_first_token_constraint_mode import FirstTokenConstraintBackend 9 | from .dp_backend.impl import DPChunkedPrefillBackend 10 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl import ChunckedPrefillForPrefillNode 11 | from .continues_batch.pd_mode.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode 12 | from .chunked_prefill.impl_for_xgrammar_mode import XgrammarBackend 13 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl_for_dp_chuncked import DPChunkedForPrefillNode 14 | from .continues_batch.pd_mode.decode_node_impl.decode_impl_for_dp import DPForDecodeNode 15 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py -------------------------------------------------------------------------------- 
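The AsyncQueue in lightllm/server/httpserver/async_queue.py above is a batching queue for coroutines: producers call put, and a consumer drains everything accumulated since its last wake-up with wait_to_get_all_data, which returns an empty list if nothing arrives within the 3-second wait. A small self-contained usage sketch follows; the token strings are made up.

```python
# Usage sketch for the AsyncQueue defined above; the token strings are placeholders.
import asyncio
from lightllm.server.httpserver.async_queue import AsyncQueue


async def producer(q: AsyncQueue):
    for i in range(5):
        await q.put(f"token-{i}")
        await asyncio.sleep(0.1)


async def consumer(q: AsyncQueue):
    collected = []
    while len(collected) < 5:
        # Returns everything queued so far, or an empty list after the 3 s wait times out.
        batch = await q.wait_to_get_all_data()
        collected.extend(batch)
    print(collected)


async def main():
    q = AsyncQueue()
    await asyncio.gather(producer(q), consumer(q))


asyncio.run(main())
```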
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/__init__.py: -------------------------------------------------------------------------------- 1 | from .decode_kv_move_manager import start_decode_kv_move_manager_process 2 | from .decode_trans_process import start_decode_trans_process 3 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_task_cache.py: -------------------------------------------------------------------------------- 1 | # This module declares global variables used by the inference process to cache the data of KV move tasks sent to other processes, 2 | # in order to reduce serialization overhead on some calls: such calls then only need to pass a request id instead of the full 3 | # payload, which speeds up rpyc calls. Only used in decode_impl.py and decode_infer_rpyc.py. 4 | from typing import Dict, List, Tuple 5 | from lightllm.server.pd_io_struct import KVMoveTask 6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode 7 | 8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, List[int]]] = {} 9 | 10 | g_success_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, float]] = {} # the third float is a timestamp used to check for expiration. 11 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/__init__.py: -------------------------------------------------------------------------------- 1 | from .prefill_trans_process import start_prefill_trans_process 2 | from .prefill_kv_move_manager import start_prefill_kv_move_manager_process 3 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_task_cache.py: -------------------------------------------------------------------------------- 1 | # This module declares a global variable used by the inference process to cache the data of KV move tasks sent to other processes, 2 | # in order to reduce serialization overhead on some calls: such calls then only need to pass a request id instead of the full 3 | # payload, which speeds up rpyc calls. Only used in prefill_impl.py and prefill_infer_rpyc.py. 4 | from typing import Dict, Tuple 5 | from lightllm.server.pd_io_struct import KVMoveTask 6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode 7 | 8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode]] = {} 9 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/utils.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import torch.multiprocessing as mp 3 | from queue import Empty 4 | 5 | 6 | def join_if_alive(thread: threading.Thread): 7 | if thread is not None and thread.is_alive(): 8 | try: 9 | thread.join() 10 | except Exception: 11 | pass 12 | return 13 | 14 | 15 | def clear_queue(queue: mp.Queue): 16 | while not queue.empty(): 17 | try: 18 | queue.get_nowait() 19 | except Empty: 20 | break 21 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/req_queue/chunked_prefill/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/chunked_prefill/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/req_queue/continues_batch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/continues_batch/__init__.py -------------------------------------------------------------------------------- /lightllm/server/visualserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/visualserver/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/utils/__init__.py -------------------------------------------------------------------------------- /lightllm/utils/error_utils.py: -------------------------------------------------------------------------------- 1 | class ServerBusyError(Exception): 2 | """Custom exception for server busy/overload situations""" 3 | 4 | def __init__(self, message="Server is busy, please try again later", status_code=503): 5 | """ 6 | Initialize the ServerBusyError 7 | 8 | Args: 9 | message (str): Error message to display 10 | status_code (int): HTTP status code (default 503 Service Unavailable) 11 | """ 12 | super().__init__(message) 13 | self.message = message 14 | self.status_code = status_code # HTTP 503 Service Unavailable 15 | 16 | def __str__(self): 17 | """String representation of the error""" 18 | return f"{self.message} (Status code: {self.status_code})" 19 | -------------------------------------------------------------------------------- /lightllm/utils/graceful_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = 
init_logger(__name__) 5 | 6 | 7 | def graceful_registry(sub_module_name): 8 | import signal 9 | 10 | # Child processes must not exit early on their own when they receive SIGTERM. 11 | def graceful_shutdown(signum, frame): 12 | logger.info(f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown...") 13 | if signum == signal.SIGTERM: 14 | # Do not exit here; the parent process decides when to exit. 15 | logger.info(f"{sub_module_name} received SIGTERM") 16 | 17 | signal.signal(signal.SIGTERM, graceful_shutdown) 18 | return 19 | -------------------------------------------------------------------------------- /lightllm/utils/light_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.utils.log_utils import init_logger 2 | 3 | logger = init_logger(__name__) 4 | try: 5 | # TODO: lightllm_kernel release 6 | import lightllm_kernel 7 | 8 | light_ops = getattr(lightllm_kernel, "ops", lightllm_kernel) 9 | HAS_LIGHTLLM_KERNEL = True 10 | except: 11 | light_ops = None 12 | HAS_LIGHTLLM_KERNEL = False 13 | logger.warning("lightllm_kernel is not installed, you can't use the api of it.") 14 | -------------------------------------------------------------------------------- /lightllm/utils/retry_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import functools 3 | from lightllm.utils.log_utils import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | def retry(max_attempts=3, wait_time=1): 9 | """ 10 | The decorated function must raise an exception itself when a call fails. (A usage sketch appears later in this document, after test_add_in_place.py.) 11 | :param max_attempts: maximum number of attempts 12 | :param wait_time: wait time between retries, in seconds 13 | """ 14 | 15 | def decorator(func): 16 | @functools.wraps(func) 17 | def wrapper(*args, **kwargs): 18 | attempts = 0 19 | while attempts < max_attempts: 20 | try: 21 | return func(*args, **kwargs) 22 | except Exception as e: 23 | attempts += 1 24 | logger.info(f"try {func.__name__} {attempts}/{max_attempts} failed: {str(e)}") 25 | if attempts < max_attempts: 26 | time.sleep(wait_time) 27 | raise Exception(f"{func.__name__} failed after {max_attempts} attempts") 28 | 29 | return wrapper 30 | 31 | return decorator 32 | -------------------------------------------------------------------------------- /lightllm/utils/sgl_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.utils.log_utils import init_logger 2 | 3 | logger = init_logger(__name__) 4 | try: 5 | import sgl_kernel 6 | 7 | sgl_ops = sgl_kernel 8 | sgl_allreduce_ops = sgl_ops.allreduce 9 | HAS_SGL_KERNEL = True 10 | except: 11 | sgl_ops = None 12 | sgl_allreduce_ops = None 13 | HAS_SGL_KERNEL = False 14 | logger.warning( 15 | "sgl_kernel is not installed, you can't use the api of it. \ 16 | You can solve it by running `pip install sgl_kernel`." 17 | ) 18 | 19 | try: 20 | from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache 21 | 22 | flash_attn_varlen_func = flash_attn_varlen_func 23 | flash_attn_with_kvcache = flash_attn_with_kvcache 24 | merge_state_v2 = sgl_ops.merge_state_v2 25 | except: 26 | flash_attn_varlen_func = None 27 | flash_attn_with_kvcache = None 28 | merge_state_v2 = None 29 | logger.warning( 30 | "sgl_kernel is not installed, or the installed version did not support fa3. \ 31 | Try to upgrade it."
32 | ) 33 | -------------------------------------------------------------------------------- /lightllm/utils/statics_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = init_logger(__name__) 5 | 6 | 7 | class MovingAverage: 8 | def __init__(self): 9 | self.total = 0.0 10 | self.count = 0 11 | self.last_time = time.time() 12 | 13 | def add(self, value): 14 | self.total += value 15 | self.count += 1 16 | 17 | def average(self): 18 | if self.count == 0: 19 | return 0.0 20 | return self.total / self.count 21 | 22 | def print_log(self, log_str): 23 | if time.time() - self.last_time >= 30: 24 | logger.info(f"{log_str}: {self.average()} ms") 25 | self.last_time = time.time() 26 | -------------------------------------------------------------------------------- /lightllm/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimeChecker: 5 | def __init__(self, threshold): 6 | self.threshold = threshold 7 | self.last_checked = time.time() 8 | 9 | def has_exceeded(self): 10 | current_time = time.time() 11 | if (current_time - self.last_checked) > self.threshold: 12 | self._reset() 13 | return True 14 | return False 15 | 16 | def _reset(self): 17 | self.last_checked = time.time() 18 | -------------------------------------------------------------------------------- /lightllm/utils/vllm_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = init_logger(__name__) 5 | try: 6 | if not torch.cuda.is_initialized(): 7 | torch.cuda.init() 8 | from vllm import _custom_ops as ops 9 | 10 | vllm_ops = ops 11 | HAS_VLLM = True 12 | cutlass_scaled_mm = torch.ops._C.cutlass_scaled_mm 13 | 14 | except: 15 | HAS_VLLM = False 16 | cutlass_scaled_mm = None 17 | vllm_ops = None 18 | logger.warning( 19 | "vllm is not installed, you can't use the api of it. \ 20 | You can solve it by running `pip install vllm`." 21 | ) 22 | -------------------------------------------------------------------------------- /lightllm/utils/watchdog_utils.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | from lightllm.utils.log_utils import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class Watchdog: 9 | def __init__(self, timeout): 10 | self.timeout = timeout 11 | self.last_heartbeat = time.time() 12 | self.running = True 13 | 14 | def start(self): 15 | self.thread = threading.Thread(target=self.run, daemon=True) 16 | self.thread.start() 17 | 18 | def run(self): 19 | while self.running: 20 | time.sleep(2) 21 | if time.time() - self.last_heartbeat > self.timeout: 22 | logger.error("Watchdog: Timeout! 
Task is not responding.") 23 | self.handle_timeout() 24 | 25 | def handle_timeout(self): 26 | logger.error("Watchdog: time out to exit") 27 | import sys 28 | 29 | sys.exit(-1) 30 | 31 | def stop(self): 32 | self.running = False 33 | self.thread.join() 34 | 35 | def heartbeat(self): 36 | self.last_heartbeat = time.time() 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | package_data = {"lightllm": ["common/all_kernel_configs/*/*.json"]} 4 | setup( 5 | name="lightllm", 6 | version="1.0.1", 7 | packages=find_packages(exclude=("build", "include", "test", "dist", "docs", "benchmarks", "lightllm.egg-info")), 8 | author="model toolchain", 9 | author_email="", 10 | description="lightllm for inference LLM", 11 | long_description="", 12 | long_description_content_type="text/markdown", 13 | url="", 14 | classifiers=[ 15 | "Programming Language :: Python :: 3", 16 | "Operating System :: Linux", 17 | ], 18 | python_requires=">=3.9.16", 19 | install_requires=[ 20 | "pyzmq", 21 | "uvloop", 22 | "transformers", 23 | "einops", 24 | "packaging", 25 | "rpyc", 26 | "ninja", 27 | "safetensors", 28 | "triton", 29 | ], 30 | package_data=package_data, 31 | ) 32 | -------------------------------------------------------------------------------- /test/model/test_settings/process_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import re 3 | 4 | 5 | def kill_gpu_processes(): 6 | try: 7 | output = subprocess.check_output(["nvidia-smi", "-q", "-x"]) 8 | output = output.decode("utf-8") 9 | 10 | # 使用正则表达式提取进程信息 11 | process_info = re.findall(r"(.*?)", output, re.DOTALL) 12 | 13 | if process_info: 14 | print("找到以下占用显卡的进程:") 15 | for info in process_info: 16 | pid = re.search(r"(.*?)", info).group(1) 17 | process_name = re.search(r"(.*?)", info).group(1) 18 | print("进程ID:", pid) 19 | print("进程名字:", process_name) 20 | 21 | for info in process_info: 22 | pid = re.search(r"(.*?)", info).group(1) 23 | subprocess.call(["sudo", "kill", "-9", pid]) 24 | print("进程ID", pid, "被终止") 25 | else: 26 | print("没有找到占用显卡的进程") 27 | 28 | except subprocess.CalledProcessError: 29 | print("无法执行nvidia-smi命令") 30 | 31 | 32 | if __name__ == "__main__": 33 | kill_gpu_processes() 34 | -------------------------------------------------------------------------------- /test/server/readme.md: -------------------------------------------------------------------------------- 1 | # prompt cache 测试: 2 | 3 | - benchmark_prompt_cache.py: 单次测试脚本。 4 | 5 | 例子: 6 | ```shell 7 | python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama --num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1 8 | ``` 9 | 10 | 使用方法详细说明: 11 | ```shell 12 | python benchmark_prompt_cache.py -h 13 | ``` 14 | 15 | - test_settings.py: 批量测试脚本,可测试多个配置并汇总为md 16 | -------------------------------------------------------------------------------- /test/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/test/test.jpg -------------------------------------------------------------------------------- /test/test_server.py: -------------------------------------------------------------------------------- 1 | import time 2 | import requests 3 | import 
json 4 | import threading 5 | 6 | 7 | class RequestThread(threading.Thread): 8 | def __init__(self, url, headers, data): 9 | threading.Thread.__init__(self) 10 | self.url = url 11 | self.headers = headers 12 | self.data = data 13 | 14 | def run(self): 15 | response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data)) 16 | if response.status_code == 200: 17 | print(response.json()) 18 | else: 19 | print("Error:", response.status_code, response.text) 20 | 21 | 22 | url = "http://localhost:8000/generate" 23 | headers = {"Content-Type": "application/json"} 24 | 25 | for i in range(1): 26 | data = { 27 | "inputs": "San Francisco is a", 28 | # 'temperature': 0.1, 29 | "parameters": { 30 | "do_sample": False, 31 | }, 32 | } 33 | thread = RequestThread(url, headers, data) 34 | thread.start() 35 | 36 | time.sleep(2) 37 | 38 | for i in range(20): 39 | data = { 40 | "inputs": "San Francisco is a", 41 | "parameters": { 42 | "do_sample": False, 43 | "ignore_eos": True, 44 | "max_new_tokens": 200, 45 | }, 46 | } 47 | thread = RequestThread(url, headers, data) 48 | thread.start() 49 | -------------------------------------------------------------------------------- /tools/resolve_ptx_version: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script lets old Triton versions generate PTX code up to version 7.8 3 | # See https://github.com/openai/triton/blob/8650b4d1cbc750d659156e2c17a058736614827b/lib/driver/llvm.cc#L149 4 | set -e 5 | 6 | mkdir -p $HOME/.triton/ 7 | 8 | [ $HOME/.triton/resolve_ptx_version.so -nt $0 ] || (echo ' 9 | #include <stdexcept> 10 | namespace triton { 11 | namespace driver { 12 | 13 | int vptx(int version) { 14 | // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes 15 | if (version >= 11080) return 78; 16 | if (version >= 11070) return 77; 17 | if (version >= 11060) return 76; 18 | if (version >= 11050) return 75; 19 | if (version >= 11040) return 74; 20 | throw std::runtime_error("Triton requires CUDA 11.4+"); 21 | } 22 | 23 | } 24 | }' \ 25 | | g++ -x c++ -fPIC -shared -o $HOME/.triton/resolve_ptx_version.so -) 26 | 27 | [ -z "$*" ] || env LD_PRELOAD=$LD_PRELOAD:$HOME/.triton/resolve_ptx_version.so "$@" -------------------------------------------------------------------------------- /unit_tests/common/basemodel/triton_kernel/test_add_in_place.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy 5 | from lightllm.common.basemodel.triton_kernel.add_in_place import add_in_place 6 | from lightllm.utils.log_utils import init_logger 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "dim1, dim2, alpha", 13 | [ 14 | (dim1, dim2, alpha) 15 | for dim1 in range(1, 1024, 100) 16 | for dim2 in range(1, 1024, 100) 17 | for alpha in [0.1, 0.3, 0.5, 0.7, 0.1] 18 | ], 19 | ) 20 | def test_add_in_place(dim1, dim2, alpha): 21 | input = torch.rand((dim1, dim2), device="cuda") 22 | other = torch.rand((dim1, dim2), device="cuda") 23 | 24 | output = input + other * alpha 25 | add_in_place(input, other, alpha=alpha) 26 | rlt = torch.allclose(input, output, atol=1e-5, rtol=0) 27 | assert rlt 28 | 29 | 30 | if __name__ == "__main__": 31 | pytest.main() 32 | --------------------------------------------------------------------------------
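The retry decorator in lightllm/utils/retry_utils.py above expects the wrapped function to raise on failure, retrying it up to max_attempts times and sleeping wait_time seconds between attempts. The sketch below is the usage example referenced in that docstring; fetch_config is a made-up function, not part of lightllm.

```python
# Illustrative use of the retry decorator shown earlier; fetch_config is hypothetical.
from lightllm.utils.retry_utils import retry


@retry(max_attempts=3, wait_time=1)
def fetch_config():
    # A real implementation would query a remote config service and raise on failure.
    raise RuntimeError("config service not reachable yet")


try:
    fetch_config()
except Exception as e:
    # After max_attempts failures the decorator raises a final Exception.
    print(e)
```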
/unit_tests/common/basemodel/triton_kernel/test_gen_decode_params.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import numpy as np 4 | from lightllm.utils.log_utils import init_logger 5 | from lightllm.common.basemodel.triton_kernel.gen_decode_params import gen_decode_params 6 | 7 | 8 | def test_gen_decode_params_basic(): 9 | b_seq_len = torch.ones((9,), dtype=torch.int64, device="cuda") * 8192 10 | ( 11 | b_q_seq_len, 12 | b1_cu_q_seq_len, 13 | b_kv_seq_len, 14 | b1_cu_kv_seq_len, 15 | position_ids, 16 | max_q_seq_len, 17 | max_kv_seq_len, 18 | ) = gen_decode_params(b_seq_len) 19 | 20 | true_b_q_seq_len = torch.ones_like(b_seq_len) 21 | 22 | 23 | assert max_q_seq_len == 1 24 | assert max_kv_seq_len == b_seq_len.max().item() 25 | assert torch.equal(b_q_seq_len, true_b_q_seq_len) 26 | assert torch.equal(b1_cu_q_seq_len, torch.nn.functional.pad(torch.cumsum(true_b_q_seq_len, dim=0), (1, 0), value=0)) 27 | assert torch.equal(b_kv_seq_len, b_seq_len) 28 | assert torch.equal(b1_cu_kv_seq_len, torch.nn.functional.pad(torch.cumsum(b_seq_len, dim=0), (1, 0), value=0)) 29 | assert torch.equal(position_ids, b_seq_len - 1) 30 | 31 | 32 | if __name__ == "__main__": 33 | pytest.main() 34 | -------------------------------------------------------------------------------- /unit_tests/common/basemodel/triton_kernel/test_sp_pad_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy 5 | from lightllm.utils.log_utils import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "token_num, hidden_dim, sp_world_size", 12 | [ 13 | (token_num, hidden_dim, sp_world_size) 14 | for token_num in range(3, 6) 15 | for hidden_dim in [257, 2048] 16 | for sp_world_size in range(2, 5) 17 | ], 18 | ) 19 | def test_sp_pad_copy(token_num, hidden_dim, sp_world_size): 20 | 21 | in_tensor = torch.randn((token_num, hidden_dim), dtype=torch.float16, device="cuda") 22 | out_tensors = [ 23 | sp_pad_copy(in_tensor=in_tensor, sp_rank_id=rank_id, sp_world_size=sp_world_size) 24 | for rank_id in range(sp_world_size) 25 | ] 26 | out_tensor = torch.cat(out_tensors, dim=0) 27 | assert torch.equal(in_tensor, out_tensor[0:token_num, :]) 28 | 29 | 30 | if __name__ == "__main__": 31 | pytest.main() 32 | -------------------------------------------------------------------------------- /unit_tests/models/deepseek2/test_rope_repeat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.models.deepseek2.triton_kernel.repeat_rope import repeat_rope 5 | 6 | 7 | def test_repeat_rope(): 8 | source = torch.randn((100, 1, 1077), device="cuda") 9 | dest = torch.randn((100, 7, 1077), device="cuda") 10 | 11 | repeat_rope(dest, source) 12 | assert torch.equal(dest[:, 0, :], source[:, 0, :]) 13 | assert torch.equal(dest[:, -1, :], source[:, 0, :]) 14 | 15 | source = torch.randn((100, 1, 128), device="cuda") 16 | dest = torch.randn((100, 64, 128), device="cuda") 17 | 18 | repeat_rope(dest, source) 19 | assert torch.equal(dest[:, 0, :], source[:, 0, :]) 20 | assert torch.equal(dest[:, -1, :], source[:, 0, :]) 21 | return 22 | 23 | 24 | if __name__ == "__main__": 25 | pytest.main() 26 |
-------------------------------------------------------------------------------- /unit_tests/utils/test_custom_kernel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.utils.custom_kernel_utis import torch_cat_3 5 | 6 | 7 | def test_torch_cat(): 8 | a = torch.tensor([[[1, 2], [3, 4]]], device="cuda") 9 | b = torch.tensor([[[5, 6], [7, 8]]], device="cuda") 10 | c = torch_cat_3([a, b], dim=0) 11 | assert torch.equal(torch.cat((a, b), dim=0), c) 12 | 13 | d = torch_cat_3([a, b], dim=1) 14 | assert torch.equal(torch.cat((a, b), dim=1), d) 15 | 16 | e = torch_cat_3([a, b], dim=-1) 17 | assert torch.equal(torch.cat((a, b), dim=-1), e) 18 | 19 | empty = torch.empty((0, 2), device="cuda") 20 | torch_cat_3([a, empty, b], dim=0) 21 | return 22 | 23 | 24 | if __name__ == "__main__": 25 | pytest.main() 26 | --------------------------------------------------------------------------------
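Finally, the Watchdog in lightllm/utils/watchdog_utils.py above force-exits the process when no heartbeat arrives within the configured timeout. Below is a minimal sketch of the intended heartbeat pattern; the work loop is a placeholder, not lightllm code.

```python
# Heartbeat pattern for the Watchdog shown earlier; the loop body is a stand-in for real work.
import time
from lightllm.utils.watchdog_utils import Watchdog

dog = Watchdog(timeout=10)  # exit the process if no heartbeat arrives for 10 seconds
dog.start()

for _ in range(5):
    time.sleep(1)  # stand-in for one unit of real work
    dog.heartbeat()  # tell the watchdog the task is still making progress

dog.stop()
```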