├── .github
├── ISSUE_TEMPLATE
│ └── bug_report.md
└── workflows
│ ├── docker-publish.yml
│ └── pre-commit.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── README.md
├── assets
├── att.gif
├── lightllm.drawio.png
└── logo.png
├── benchmark.md
├── build_and_upload_docker.sh
├── demos
├── qa_server
│ ├── __init__.py
│ ├── chat_server.py
│ ├── qabot.py
│ └── templates
│ │ └── chat.html
└── readme.txt
├── docs
├── CN
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── rebuild.sh
│ ├── requirements-docs.txt
│ └── source
│ │ ├── _static
│ │ └── openapi.json
│ │ ├── assets
│ │ ├── lightllm
│ │ │ ├── ER1.png
│ │ │ ├── ER2.png
│ │ │ ├── ER3.png
│ │ │ ├── ER4.png
│ │ │ ├── HttpServer.png
│ │ │ ├── Performance.png
│ │ │ ├── Performance2.png
│ │ │ ├── Router.png
│ │ │ ├── Visual_Server.png
│ │ │ ├── arch.png
│ │ │ ├── backend.png
│ │ │ └── token_attn.gif
│ │ └── logos
│ │ │ └── lightllm-logo.png
│ │ ├── conf.py
│ │ ├── dev
│ │ ├── router.rst
│ │ └── token_attention.rst
│ │ ├── getting_started
│ │ ├── installation.rst
│ │ └── quickstart.rst
│ │ ├── index.rst
│ │ ├── lightllm
│ │ ├── lightllm_impl.rst
│ │ └── lightllm_intro.rst
│ │ ├── models
│ │ ├── add_new_model.md
│ │ ├── supported_models.rst
│ │ └── test.rst
│ │ ├── server
│ │ ├── api_server_args_zh.rst
│ │ └── benchmark.rst
│ │ └── user
│ │ ├── api_param.rst
│ │ └── openapi_docs.rst
└── EN
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── README.md
│ ├── make.bat
│ ├── rebuild.sh
│ ├── requirements-docs.txt
│ └── source
│ ├── _static
│ └── openapi.json
│ ├── assets
│ ├── lightllm
│ │ ├── ER1.png
│ │ ├── ER2.png
│ │ ├── ER3.png
│ │ ├── ER4.png
│ │ ├── HttpServer.png
│ │ ├── Performance.png
│ │ ├── Performance2.png
│ │ ├── Router.png
│ │ ├── Visual_Server.png
│ │ ├── arch.png
│ │ ├── backend.png
│ │ └── token_attn.gif
│ └── logos
│ │ └── lightllm-logo.png
│ ├── conf.py
│ ├── dev
│ ├── performance.rst
│ ├── router.rst
│ └── token_attention.rst
│ ├── getting_started
│ ├── faq.rst
│ ├── installation.rst
│ └── quickstart.rst
│ ├── index.rst
│ ├── lightllm
│ ├── lightllm_impl.rst
│ └── lightllm_intro.rst
│ ├── models
│ ├── add_new_model.md
│ ├── supported_models.rst
│ └── test.rst
│ ├── server
│ ├── api_server_args.rst
│ └── benchmark.rst
│ └── user
│ ├── api_param.rst
│ └── openapi_docs.rst
├── format.py
├── format_out
├── __init__.py
├── grammer
│ ├── __init__.py
│ ├── core.py
│ ├── dpda.py
│ ├── json.ebnf
│ ├── test.sh
│ ├── test0.py
│ ├── test1.py
│ ├── test2.py
│ ├── test3.py
│ ├── test4.py
│ ├── test5.py
│ └── test6.py
└── impl.py
├── lightllm
├── __init__.py
├── common
│ ├── __init__.py
│ ├── all_kernel_configs
│ │ ├── __init__.py
│ │ ├── bmm_scaled_fp8
│ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json
│ │ │ └── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ ├── fp8_block_mm
│ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json
│ │ │ └── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ ├── grouped_moe_gemm_kernel
│ │ │ ├── {K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json
│ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json
│ │ │ ├── {K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json
│ │ │ ├── {K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json
│ │ │ ├── {K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json
│ │ │ ├── {K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json
│ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json
│ │ │ ├── {K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json
│ │ │ └── {K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json
│ │ ├── mla_decode_attentnion
│ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json
│ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H200.json
│ │ │ └── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json
│ │ ├── moe_silu_and_mul_kernel
│ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ │ ├── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json
│ │ │ └── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json
│ │ └── moe_sum_reduce_kernel
│ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json
│ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json
│ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json
│ │ │ └── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json
│ ├── basemodel
│ │ ├── __init__.py
│ │ ├── basemodel.py
│ │ ├── cuda_graph.py
│ │ ├── infer_lock.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── base_layer_infer.py
│ │ │ ├── cache_tensor_manager.py
│ │ │ ├── post_layer_infer.py
│ │ │ ├── pre_layer_infer.py
│ │ │ ├── template
│ │ │ │ ├── __init__.py
│ │ │ │ ├── post_layer_infer_template.py
│ │ │ │ ├── pre_layer_infer_template.py
│ │ │ │ ├── transformer_layer_infer_cohere_template.py
│ │ │ │ └── transformer_layer_infer_template.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── base_layer_weight.py
│ │ │ ├── hf_load_utils.py
│ │ │ ├── meta_weights
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_weight.py
│ │ │ │ ├── fused_moe_weight_ep.py
│ │ │ │ ├── fused_moe_weight_ep_redundancy.py
│ │ │ │ ├── fused_moe_weight_tp.py
│ │ │ │ ├── mm_weight
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── colmm_weight.py
│ │ │ │ │ ├── mm_weight.py
│ │ │ │ │ └── rowmm_weight.py
│ │ │ │ └── norm_weight.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── microbatch_overlap_objs.py
│ │ ├── multimodal_tokenizer.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── add_in_place.py
│ │ │ ├── apply_penalty.py
│ │ │ ├── bmm_scaled_fp8.py
│ │ │ ├── copy_kv_index_to_req.py
│ │ │ ├── dequantize_gemm_int4.py
│ │ │ ├── dequantize_gemm_int8.py
│ │ │ ├── destindex_copy_kv.py
│ │ │ ├── gen_decode_params.py
│ │ │ ├── gen_prefill_params.py
│ │ │ ├── multimodal_emb.py
│ │ │ ├── quantize_gemm_int8.py
│ │ │ ├── redundancy_topk_ids_repair.py
│ │ │ └── sp_pad_copy.py
│ ├── build_utils.py
│ ├── cuda_wrapper.py
│ ├── deepseek2_fp8kv_mem_manager.py
│ ├── deepseek2_mem_manager.py
│ ├── fused_moe
│ │ ├── __init__.py
│ │ ├── deepep_scatter_gather.py
│ │ ├── grouped_fused_moe.py
│ │ ├── grouped_fused_moe_ep.py
│ │ ├── grouped_topk.py
│ │ ├── moe_kernel_configs.py
│ │ ├── moe_silu_and_mul.py
│ │ ├── moe_silu_and_mul_config.py
│ │ ├── moe_silu_and_mul_mix_quant_ep.py
│ │ ├── moe_sum_recude_config.py
│ │ ├── moe_sum_reduce.py
│ │ ├── softmax_topk.py
│ │ └── topk_select.py
│ ├── infer_utils.py
│ ├── int8kv_mem_manager.py
│ ├── kernel_config.py
│ ├── kv_trans_kernel
│ │ ├── __init__.py
│ │ ├── kv_trans.py
│ │ └── kv_trans_v2.py
│ ├── mem_manager.py
│ ├── mem_utils.py
│ ├── ppl_int4kv_mem_manager.py
│ ├── ppl_int8kv_mem_manager.py
│ ├── quantization
│ │ ├── __init__.py
│ │ ├── configs
│ │ │ └── llamacls-mix-down.yaml
│ │ ├── deepgemm_quant.py
│ │ ├── quantize_method.py
│ │ ├── registry.py
│ │ ├── torchao_quant.py
│ │ ├── triton_quant
│ │ │ ├── __init__.py
│ │ │ ├── fp8
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fp8act_quant_kernel.py
│ │ │ │ └── fp8w8a8_block_gemm_kernel.py
│ │ │ └── triton_quant.py
│ │ └── w8a8_quant.py
│ └── req_manager.py
├── distributed
│ ├── __init__.py
│ ├── communication_op.py
│ ├── custom_all_gather.py
│ ├── custom_all_reduce.py
│ ├── pynccl.py
│ └── pynccl_wrapper.py
├── models
│ ├── __init__.py
│ ├── bloom
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── post_layer_infer.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── hf_load_utils.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── context_flashattention_nopad.py
│ │ │ ├── layernorm.py
│ │ │ ├── token_attention_nopad_att1.py
│ │ │ ├── token_attention_nopad_reduceV.py
│ │ │ ├── token_attention_nopad_softmax.py
│ │ │ └── token_flashattention_nopad.py
│ ├── chatglm2
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ └── rotary_emb.py
│ ├── cohere
│ │ ├── __init__.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── post_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernels
│ │ │ ├── __init__.py
│ │ │ ├── layernorm.py
│ │ │ └── rotary_emb.py
│ ├── deepseek2
│ │ ├── __init__.py
│ │ ├── flashattention_infer_struct.py
│ │ ├── flashinfer_struct.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── context_flashattention_nopad.py
│ │ │ ├── context_flashattention_nopad_fp8.py
│ │ │ ├── context_flashattention_nopad_with_v.py
│ │ │ ├── destindex_copy_kv.py
│ │ │ ├── destindex_copy_kv_fp8.py
│ │ │ ├── gqa_flash_decoding.py
│ │ │ ├── gqa_flash_decoding_config.py
│ │ │ ├── gqa_flash_decoding_fp8.py
│ │ │ ├── gqa_flash_decoding_stage1.py
│ │ │ ├── gqa_flash_decoding_stage1_fp8.py
│ │ │ ├── gqa_flash_decoding_stage2.py
│ │ │ ├── repack_kv_index.py
│ │ │ ├── repeat_rope.py
│ │ │ ├── rotary_emb.py
│ │ │ ├── sample_kv.py
│ │ │ └── weight_dequant.py
│ ├── gemma3
│ │ ├── __init__.py
│ │ ├── gemma3_visual.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── post_layer_infer.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── gemma_2b
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ └── gelu_and_mul.py
│ ├── internlm
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── internlm2
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── internlm2_reward
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── post_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── pre_and_post_layer_weight.py
│ │ └── model.py
│ ├── internvl
│ │ ├── __init__.py
│ │ ├── img_process.py
│ │ ├── internvl_visual.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── pre_and_post_layer_weight.py
│ │ └── model.py
│ ├── llama
│ │ ├── __init__.py
│ │ ├── flashattention_infer_struct.py
│ │ ├── flashinfer_struct.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── post_layer_infer.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── ds_load_utils.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ ├── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── context_flashattention_nopad.py
│ │ │ ├── embedding.py
│ │ │ ├── flash_decoding.py
│ │ │ ├── flash_decoding_stage1.py
│ │ │ ├── flash_decoding_stage2.py
│ │ │ ├── gqa_decode_flashattention_nopad.py
│ │ │ ├── gqa_flash_decoding.py
│ │ │ ├── gqa_flash_decoding_stage1.py
│ │ │ ├── gqa_flash_decoding_stage2.py
│ │ │ ├── gqa_flash_decoding_vsm.py
│ │ │ ├── ppl_fp16_flash_decoding.py
│ │ │ ├── ppl_int4kv_copy_kv.py
│ │ │ ├── ppl_int4kv_flash_decoding.py
│ │ │ ├── ppl_int8kv_flash_decoding.py
│ │ │ ├── ppl_quant_copy_kv.py
│ │ │ ├── rmsnorm.py
│ │ │ ├── rotary_emb.py
│ │ │ ├── silu_and_mul.py
│ │ │ ├── token_attention_nopad_att1.py
│ │ │ ├── token_attention_nopad_reduceV.py
│ │ │ ├── token_attention_nopad_softmax.py
│ │ │ └── token_attention_softmax_and_reducev.py
│ │ └── yarn_rotary_utils.py
│ ├── llava
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── pre_and_post_layer_weight.py
│ │ ├── llava_visual.py
│ │ └── model.py
│ ├── minicpm
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── mistral
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── context_flashattention_nopad.py
│ │ │ ├── init_att_sliding_window_info.py
│ │ │ ├── token_attention_nopad_att1.py
│ │ │ ├── token_attention_nopad_reduceV.py
│ │ │ └── token_attention_softmax_and_reducev.py
│ ├── mixtral
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── _custom_ops.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── phi3
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── context_flashattention_nopad.py
│ │ │ ├── destindex_copy_kv.py
│ │ │ ├── flash_decoding.py
│ │ │ ├── flash_decoding_stage1.py
│ │ │ ├── flash_decoding_stage2.py
│ │ │ └── rotary_emb.py
│ ├── qwen
│ │ ├── __init__.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── qwen2
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── qwen2_5_vl
│ │ ├── __init__.py
│ │ └── qwen2_5_visual.py
│ ├── qwen2_reward
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── post_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── pre_and_post_layer_weight.py
│ │ └── model.py
│ ├── qwen2_vl
│ │ ├── __init__.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── model.py
│ │ ├── qwen2_visual.py
│ │ ├── triton_kernel
│ │ │ ├── __init__.py
│ │ │ └── mrope.py
│ │ └── vision_process.py
│ ├── qwen3
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── qwen3_moe
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── qwen_vl
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── pre_layer_infer.py
│ │ ├── model.py
│ │ └── qwen_visual.py
│ ├── registry.py
│ ├── stablelm
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── starcoder
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── starcoder2
│ │ ├── __init__.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ └── model.py
│ ├── tarsier2
│ │ ├── __init__.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ └── pre_and_post_layer_weight.py
│ │ ├── model.py
│ │ └── tarsier2_visual.py
│ ├── vit
│ │ ├── __init__.py
│ │ ├── infer_struct.py
│ │ ├── layer_infer
│ │ │ ├── __init__.py
│ │ │ ├── post_layer_infer.py
│ │ │ ├── pre_layer_infer.py
│ │ │ └── transformer_layer_infer.py
│ │ ├── layer_weights
│ │ │ ├── __init__.py
│ │ │ ├── hf_load_utils.py
│ │ │ ├── pre_and_post_layer_weight.py
│ │ │ └── transformer_layer_weight.py
│ │ ├── model.py
│ │ └── triton_kernel
│ │ │ ├── __init__.py
│ │ │ ├── flashattention_nopad.py
│ │ │ ├── gelu_vit.py
│ │ │ └── rms_norm_vit.py
│ └── whisper
│ │ ├── __init__.py
│ │ ├── defaults.py
│ │ ├── modeling_whisper.py
│ │ └── whisper_audio.py
├── server
│ ├── __init__.py
│ ├── api_cli.py
│ ├── api_http.py
│ ├── api_lightllm.py
│ ├── api_models.py
│ ├── api_openai.py
│ ├── api_server.py
│ ├── api_start.py
│ ├── api_tgi.py
│ ├── audioserver
│ │ ├── __init__.py
│ │ ├── manager.py
│ │ └── model_infer
│ │ │ ├── __init__.py
│ │ │ └── model_rpc.py
│ ├── build_prompt.py
│ ├── config_server
│ │ ├── __init__.py
│ │ ├── api_http.py
│ │ └── nccl_tcp_store.py
│ ├── core
│ │ ├── __init__.py
│ │ └── objs
│ │ │ ├── __init__.py
│ │ │ ├── atomic_array_lock.py
│ │ │ ├── atomic_lock.py
│ │ │ ├── io_objs
│ │ │ ├── __init__.py
│ │ │ └── group_req.py
│ │ │ ├── out_token_circlequeue.py
│ │ │ ├── py_sampling_params.py
│ │ │ ├── req.py
│ │ │ ├── rpc_shm.py
│ │ │ ├── sampling_params.py
│ │ │ ├── shm_array.py
│ │ │ ├── shm_req_manager.py
│ │ │ └── start_args_type.py
│ ├── detokenization
│ │ ├── __init__.py
│ │ ├── decode.py
│ │ ├── decode_mode_fix.py
│ │ ├── decode_req.py
│ │ └── manager.py
│ ├── embed_cache
│ │ ├── __init__.py
│ │ ├── impl
│ │ │ ├── __init__.py
│ │ │ └── naive_memory_cache.py
│ │ ├── interface.py
│ │ ├── manager.py
│ │ └── utils.py
│ ├── function_call_parser.py
│ ├── health_monitor
│ │ ├── __init__.py
│ │ └── manager.py
│ ├── httpserver
│ │ ├── __init__.py
│ │ ├── async_queue.py
│ │ ├── manager.py
│ │ └── pd_loop.py
│ ├── httpserver_for_pd_master
│ │ ├── __init__.py
│ │ ├── manager.py
│ │ └── register_loop.py
│ ├── metrics
│ │ ├── __init__.py
│ │ ├── manager.py
│ │ └── metrics.py
│ ├── multimodal_params.py
│ ├── pd_io_struct.py
│ ├── req_id_generator.py
│ ├── router
│ │ ├── __init__.py
│ │ ├── batch.py
│ │ ├── dynamic_prompt
│ │ │ ├── __init__.py
│ │ │ ├── radix_cache.py
│ │ │ └── shared_arr.py
│ │ ├── manager.py
│ │ ├── model_infer
│ │ │ ├── __init__.py
│ │ │ ├── infer_batch.py
│ │ │ ├── mode_backend
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_backend.py
│ │ │ │ ├── chunked_prefill
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── impl.py
│ │ │ │ │ ├── impl_for_first_token_constraint_mode.py
│ │ │ │ │ ├── impl_for_outlines_constraint_mode.py
│ │ │ │ │ ├── impl_for_token_healing.py
│ │ │ │ │ └── impl_for_xgrammar_mode.py
│ │ │ │ ├── continues_batch
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── impl.py
│ │ │ │ │ ├── impl_for_return_all_prompt_logprobs.py
│ │ │ │ │ ├── impl_for_reward_model.py
│ │ │ │ │ └── pd_mode
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── decode_node_impl
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── decode_impl.py
│ │ │ │ │ │ ├── decode_impl_for_dp.py
│ │ │ │ │ │ ├── decode_infer_rpyc.py
│ │ │ │ │ │ ├── decode_kv_move_manager.py
│ │ │ │ │ │ ├── decode_task_cache.py
│ │ │ │ │ │ ├── decode_trans_obj.py
│ │ │ │ │ │ ├── decode_trans_process.py
│ │ │ │ │ │ └── up_status.py
│ │ │ │ │ │ ├── p2p_fix.py
│ │ │ │ │ │ ├── prefill_node_impl
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── prefill_impl.py
│ │ │ │ │ │ ├── prefill_impl_for_dp_chuncked.py
│ │ │ │ │ │ ├── prefill_infer_rpyc.py
│ │ │ │ │ │ ├── prefill_kv_move_manager.py
│ │ │ │ │ │ ├── prefill_task_cache.py
│ │ │ │ │ │ ├── prefill_trans_obj.py
│ │ │ │ │ │ └── prefill_trans_process.py
│ │ │ │ │ │ ├── task_queue.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ ├── diverse_backend
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── impl.py
│ │ │ │ ├── dp_backend
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── impl.py
│ │ │ │ │ └── pre_process.py
│ │ │ │ ├── generic_post_process.py
│ │ │ │ ├── generic_pre_process.py
│ │ │ │ └── redundancy_expert_manager.py
│ │ │ └── model_rpc.py
│ │ ├── pause_strategy.py
│ │ ├── req_queue
│ │ │ ├── __init__.py
│ │ │ ├── base_queue.py
│ │ │ ├── chunked_prefill
│ │ │ │ ├── __init__.py
│ │ │ │ ├── beam_impl.py
│ │ │ │ ├── impl.py
│ │ │ │ └── impl_for_pd_prefill.py
│ │ │ ├── continues_batch
│ │ │ │ ├── __init__.py
│ │ │ │ ├── impl.py
│ │ │ │ └── impl_for_pd_decode.py
│ │ │ └── dp_base_queue.py
│ │ ├── stats.py
│ │ └── token_load.py
│ ├── tokenizer.py
│ └── visualserver
│ │ ├── __init__.py
│ │ ├── manager.py
│ │ └── model_infer
│ │ ├── __init__.py
│ │ └── model_rpc.py
└── utils
│ ├── __init__.py
│ ├── config_utils.py
│ ├── custom_kernel_utis.py
│ ├── device_utils.py
│ ├── dist_utils.py
│ ├── envs_utils.py
│ ├── error_utils.py
│ ├── graceful_utils.py
│ ├── health_check.py
│ ├── infer_utils.py
│ ├── light_utils.py
│ ├── log_utils.py
│ ├── multimodal_utils.py
│ ├── multinode_utils.py
│ ├── net_utils.py
│ ├── petrel_helper.py
│ ├── process_check.py
│ ├── profile_max_tokens.py
│ ├── retry_utils.py
│ ├── rpyc_fix_utils.py
│ ├── sgl_utils.py
│ ├── start_utils.py
│ ├── statics_utils.py
│ ├── time_utils.py
│ ├── tuning_utils.py
│ ├── vllm_utils.py
│ └── watchdog_utils.py
├── requirements.txt
├── setup.py
├── test
├── benchmark_client.py
├── benchmark_mcq.py
├── benchmark_qps.py
├── benchmark_serving.py
├── deepseek.sh
├── format_out
│ ├── gomoku_game.py
│ ├── qabot.py
│ ├── test_constraint_server.py
│ ├── test_demo.py
│ └── test_xgrammar_constraint.py
├── kernel
│ ├── alignment
│ │ └── llama_gqa_decode_vsm.py
│ ├── deepseekv2_bmm_scaled_fp8_tuning.py
│ ├── deepseekv3_fp8_block_gemm_tuning.py
│ ├── fuse_moe_tuning_bf16.py
│ ├── fuse_moe_tuning_fp8.py
│ ├── moe_silu_and_mul_tuning_bf16.py
│ ├── moe_sum_reduce_tuning_bf16.py
│ └── tuning
│ │ ├── deepseekv2_gqa_decode_tuning.py
│ │ └── llama_gqa_decode_vsm_tuning.py
├── model
│ ├── model_infer.py
│ ├── model_infer_vit.py
│ ├── test_model.py
│ ├── test_script.sh
│ └── test_settings
│ │ ├── model_infer_batchs.py
│ │ ├── process_utils.py
│ │ └── test_settings.py
├── server
│ ├── benchmark_prompt_cache.py
│ ├── readme.md
│ └── test_settings.py
├── test.jpg
├── test.sh
├── test_accuracy.py
├── test_constraint_server.py
├── test_function_call_api.py
├── test_multimodal_server.py
├── test_redundancy_expert_config.json
└── test_server.py
├── tools
├── quick_launch_docker.py
└── resolve_ptx_version
└── unit_tests
├── common
├── basemodel
│ └── triton_kernel
│ │ ├── test_add_in_place.py
│ │ ├── test_gen_decode_params.py
│ │ ├── test_gen_prefill_params.py
│ │ ├── test_redundancy_topk_ids_repair.py
│ │ └── test_sp_pad_kernel.py
├── fused_moe
│ ├── test_deepep.py
│ ├── test_grouped_fused_moe.py
│ ├── test_grouped_fused_moe_speed.py
│ ├── test_grouped_topk.py
│ ├── test_moe_silu_and_mul_mix_quant_ep.py
│ └── test_softmax_topk.py
└── kv_trans_kernel
│ └── test_kv_trans_v2.py
├── models
├── deepseek2
│ ├── test_destindex_copy_kv.py
│ ├── test_destindex_copy_kv_fp8.py
│ ├── test_gqa_flash_decoding.py
│ ├── test_gqa_flash_decoding_fp8.py
│ ├── test_repack_kv_index.py
│ └── test_rope_repeat.py
├── llama
│ ├── test_context_flashattention_nopad.py
│ └── test_token_attention_nopad.py
└── qwen2_vl
│ └── test_mrope.py
├── server
├── core
│ └── objs
│ │ ├── test_atomic_array_lock.py
│ │ ├── test_atomic_lock.py
│ │ ├── test_out_token_circlequeue.py
│ │ ├── test_req.py
│ │ ├── test_sampling_params.py
│ │ ├── test_shm_array.py
│ │ └── test_shm_req_manager.py
└── router
│ └── dynamic_prompt
│ └── test_radix_cache.py
└── utils
└── test_custom_kernel_utils.py
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: Pre-commit checks
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | pre-commit:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - name: Checkout code
14 | uses: actions/checkout@v2
15 | with:
16 | fetch-depth: 0 # Fetch all history for all branches and tags
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@v2
20 | with:
21 | python-version: '3.9'
22 |
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install pre-commit
27 | pre-commit install-hooks
28 |
29 | - name: Run pre-commit on modified files
30 | run: |
31 | if [ -n "$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }})" ]; then
32 | pre-commit run --files $(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }})
33 | else
34 | echo "No files to check"
35 | fi
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | build
4 | dist
5 | *.egg-info
6 | .idea
7 | .vscode
8 | tmp/
9 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 21.12b0
4 | hooks:
5 | - id: black
6 | language_version: python3
7 | args: [--line-length=120]
8 | additional_dependencies: ['click==8.0.4']
9 | - repo: https://github.com/pycqa/flake8
10 | rev: 3.9.0
11 | hooks:
12 | - id: flake8
13 | additional_dependencies: [flake8-typing-imports==1.9.0]
14 | args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606']
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Contributing Guidelines
2 |
3 | ### Coding Style Guide
4 |
5 | In general, we adhere to the [Google Python style guide](https://google.github.io/styleguide/pyguide.html), and we recommend using `yapf` to format your code.
6 |
7 | In this project, we use `pre-commit` to automatically check the code style.
8 |
9 | To begin with, install `pre-commit`:
10 |
11 | ```bash
12 | pip install pre-commit
13 | ```
14 |
15 | Then, configure the pre-commit git hook:
16 |
17 | ```bash
18 | pre-commit install
19 | ```
20 |
21 | From then on, your code will be checked automatically whenever you commit a change.
22 |
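23 | You can also run the configured hooks manually at any time, for example before opening a pull request. A minimal sketch (the file path below is only a placeholder):
24 |
25 | ```bash
26 | # Run every configured hook against the whole repository.
27 | pre-commit run --all-files
28 |
29 | # Or check only specific files, e.g. the ones you changed.
30 | pre-commit run --files path/to/your_file.py
31 | ```
32 |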
--------------------------------------------------------------------------------
/assets/att.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/att.gif
--------------------------------------------------------------------------------
/assets/lightllm.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/lightllm.drawio.png
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/logo.png
--------------------------------------------------------------------------------
/benchmark.md:
--------------------------------------------------------------------------------
1 | #### lightllm
2 |
3 | #### Launch service
4 |
5 | ~~~shell
6 | python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto
7 | ~~~
8 |
9 | #### Evaluation
10 |
11 | ~~~shell
12 | python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200
13 | ~~~
14 |
15 | #### vllm
16 |
17 | #### Launch service
18 | ~~~shell
19 | python -m vllm.entrypoints.api_server --model /path/llama-7b --swap-space 16 --disable-log-requests --port 9009
20 | ~~~
21 |
22 | #### Evaluation
23 |
24 | ~~~shell
25 | python benchmark_serving_vllm.py --backend vllm --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 --host 127.0.0.1 --port 9009
26 | ~~~
--------------------------------------------------------------------------------
/build_and_upload_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Build and push docker image to AWS ECR.
4 |
5 | set -eo pipefail
6 |
7 | if [ -z "$1" ]; then
8 | echo "Must supply AWS account ID"
9 | exit 1;
10 | fi
11 |
12 | if [ -z "$2" ]; then
13 | echo "Must supply the image tag"
14 | exit 1;
15 | fi
16 |
17 | IMAGE_TAG=$2
18 | ACCOUNT=$1
19 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com
20 | DOCKER_BUILDKIT=1 docker build -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG .
21 | docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG
22 |
--------------------------------------------------------------------------------
/demos/qa_server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/demos/qa_server/__init__.py
--------------------------------------------------------------------------------
/demos/readme.txt:
--------------------------------------------------------------------------------
1 | Directory of some application demos.
--------------------------------------------------------------------------------
/docs/CN/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 | os: ubuntu-20.04
6 | tools:
7 | python: "3.10"
8 |
9 | formats:
10 | - epub
11 |
12 | sphinx:
13 | configuration: docs/CN/source/conf.py
14 |
15 | python:
16 | install:
17 | - requirements: docs/CN/requirements-docs.txt
--------------------------------------------------------------------------------
/docs/CN/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/CN/README.md:
--------------------------------------------------------------------------------
1 | ## Build the docs
2 |
3 | ```bash
4 | # Install lightllm
5 |
6 | # git clone https://github.com/ModelTC/lightllm.git
7 | # cd lightllm
8 | pip install --no-deps .
9 | ```
10 |
11 | ```bash
12 | # Install dependencies.
13 |
14 | # cd docs/CN
15 | pip install -r requirements-docs.txt
16 |
17 | # Build the docs.
18 | make clean
19 | make html
20 | ```
21 |
22 | ## Open the docs with your browser
23 |
24 | ```bash
25 | python -m http.server -d build/html/
26 | ```
27 |
28 | Launch your browser and open localhost:8000.
29 |
--------------------------------------------------------------------------------
/docs/CN/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/CN/rebuild.sh:
--------------------------------------------------------------------------------
1 | make clean
2 | make html
3 | python -m http.server -d build/html/ 8000
--------------------------------------------------------------------------------
/docs/CN/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | sphinx == 6.2.1
2 | sphinx-book-theme == 1.0.1
3 | sphinx-copybutton == 0.5.2
4 | myst-parser == 2.0.0
5 | sphinx-argparse
6 | sphinxcontrib.redoc
7 | sphinxcontrib.openapi
8 |
9 | # packages to install to build the documentation
10 | pydantic
11 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
12 | numpy
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/ER1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER1.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/ER2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER2.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/ER3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER3.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/ER4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER4.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/HttpServer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/HttpServer.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/Performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/Performance2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance2.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/Router.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Router.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/Visual_Server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Visual_Server.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/arch.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/backend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/backend.png
--------------------------------------------------------------------------------
/docs/CN/source/assets/lightllm/token_attn.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/token_attn.gif
--------------------------------------------------------------------------------
/docs/CN/source/assets/logos/lightllm-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/logos/lightllm-logo.png
--------------------------------------------------------------------------------
/docs/CN/source/server/api_server_args_zh.rst:
--------------------------------------------------------------------------------
1 | APIServer Argument Details
2 | =============================
3 |
4 |
5 | Usage
6 | ++++++++++++
7 |
8 | .. argparse::
9 | :module: lightllm.server.api_cli
10 | :func: make_argument_parser
11 | :prog: python -m lightllm.server.api_server
12 | :nodefaultconst:
13 |
--------------------------------------------------------------------------------
/docs/CN/source/server/benchmark.rst:
--------------------------------------------------------------------------------
1 | Service Performance Benchmarking
2 | ==================================
3 |
4 | After deploying a model, it is important to benchmark the service: the observed performance lets you tune the configuration and make better use of the GPU resources.
5 | In this article, we use the LLaMA-7B model on an 80G A800 GPU to compare the performance of lightllm and vLLM==0.1.2.
6 | The comparison follows the steps below:
7 |
8 | 1. Download the dataset
9 | ^^^^^^^^^^^^^^^^^^^^^^^^^
10 |
11 | .. code-block:: console
12 |
13 | $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
14 |
15 |
16 | 2. Start the model service
17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18 |
19 | .. code-block:: console
20 |
21 | $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto
22 |
23 |
24 | 3. Run the benchmark
25 | ^^^^^^^^^^^^^^^^^^^^^^
26 |
27 | .. code-block:: console
28 |
29 | $ cd test
30 | $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200
31 |
32 |
33 | Output:
34 |
35 | .. code-block:: console
36 |
37 | read data set finish
38 | total tokens: 494250
39 | Total time: 111.37 s
40 | Throughput: 8.98 requests/s
41 | Average latency: 43.52 s
42 | Average latency per token: 0.15 s
43 | Average latency per output token: 0.73 s
--------------------------------------------------------------------------------
/docs/EN/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 | os: ubuntu-20.04
6 | tools:
7 | python: "3.10"
8 |
9 | formats:
10 | - epub
11 |
12 | sphinx:
13 | configuration: docs/EN/source/conf.py
14 |
15 | python:
16 | install:
17 | - requirements: docs/EN/requirements-docs.txt
--------------------------------------------------------------------------------
/docs/EN/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/EN/README.md:
--------------------------------------------------------------------------------
1 | ## Build the docs
2 |
3 | ```bash
4 | # Install lightllm
5 |
6 | # git clone https://github.com/ModelTC/lightllm.git
7 | # cd lightllm
8 | pip install --no-deps .
9 | ```
10 |
11 | ```bash
12 | # Install dependencies.
13 |
14 | # cd docs/EN
15 | pip install -r requirements-docs.txt
16 |
17 | # Build the docs.
18 | make clean
19 | make html
20 | ```
21 |
22 | ## Open the docs with your browser
23 |
24 | ```bash
25 | python -m http.server -d build/html/
26 | ```
27 |
28 | Launch your browser and open localhost:8000.
29 |
--------------------------------------------------------------------------------
/docs/EN/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/EN/rebuild.sh:
--------------------------------------------------------------------------------
1 | make clean
2 | make html
3 | python -m http.server -d build/html/ 5888
--------------------------------------------------------------------------------
/docs/EN/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | sphinx == 6.2.1
2 | sphinx-book-theme == 1.0.1
3 | sphinx-copybutton == 0.5.2
4 | myst-parser == 2.0.0
5 | sphinx-argparse
6 | sphinxcontrib.redoc
7 | sphinxcontrib.openapi
8 |
9 | # packages to install to build the documentation
10 | pydantic
11 | -f https://download.pytorch.org/whl/cpu
12 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
13 | numpy
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/ER1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER1.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/ER2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER2.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/ER3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER3.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/ER4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER4.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/HttpServer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/HttpServer.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/Performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/Performance2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance2.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/Router.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Router.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/Visual_Server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Visual_Server.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/arch.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/backend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/backend.png
--------------------------------------------------------------------------------
/docs/EN/source/assets/lightllm/token_attn.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/token_attn.gif
--------------------------------------------------------------------------------
/docs/EN/source/assets/logos/lightllm-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/logos/lightllm-logo.png
--------------------------------------------------------------------------------
/docs/EN/source/getting_started/faq.rst:
--------------------------------------------------------------------------------
1 | .. _faq:
2 |
3 | - The LLaMA tokenizer fails to load.
4 | - Consider resolving this by running the command:
5 |
6 | .. code-block:: shell
7 |
8 | pip install protobuf==3.20.0
9 |
10 | - ``error : PTX .version 7.4 does not support .target sm_89``
11 | - Launch with:
12 |
13 | .. code-block:: shell
14 |
15 | bash tools/resolve_ptx_version python -m lightllm.server.api_server ...
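
Both FAQ entries are environment fixes. A tiny, purely illustrative check for the first one, confirming that the pinned protobuf build is the one Python actually imports:

import google.protobuf

# After `pip install protobuf==3.20.0` this should print 3.20.0.
print(google.protobuf.__version__)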
--------------------------------------------------------------------------------
/docs/EN/source/server/api_server_args.rst:
--------------------------------------------------------------------------------
1 | APIServer Args
2 | =============================
3 |
4 |
5 | Usage
6 | ++++++++++++
7 |
8 | .. argparse::
9 | :module: lightllm.server.api_cli
10 | :func: make_argument_parser
11 | :prog: python -m lightllm.server.api_server
12 | :nodefaultconst:
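
The directive above asks sphinx-argparse to import make_argument_parser from lightllm.server.api_cli and document the parser it returns. Roughly the same thing can be done by hand to preview the generated options (assuming lightllm is importable in the docs environment):

from lightllm.server.api_cli import make_argument_parser

# Render the same help text that the argparse directive turns into docs.
parser = make_argument_parser()
parser.print_help()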
--------------------------------------------------------------------------------
/format.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 |
4 | for filename in glob.glob('./**/*.py', recursive=True):
5 | print(filename)
6 | os.system(f"autopep8 --max-line-length 140 --in-place --aggressive --aggressive {filename}")
7 |
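
For comparison, the same loop written with subprocess.run, which skips the shell and therefore tolerates unusual file names; this is only an alternative sketch, not the script the repository ships:

import glob
import subprocess

for filename in glob.glob("./**/*.py", recursive=True):
    print(filename)
    subprocess.run(
        ["autopep8", "--max-line-length", "140", "--in-place",
         "--aggressive", "--aggressive", filename],
        check=True,
    )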
--------------------------------------------------------------------------------
/format_out/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/format_out/__init__.py
--------------------------------------------------------------------------------
/format_out/grammer/__init__.py:
--------------------------------------------------------------------------------
1 | # Restrictions on the grammar representation
2 | # 1. The start symbol must be S'
3 | # 2. The "ε" (epsilon) production is not supported
4 |
5 |
6 | grammar = [
7 | ("S'", ["S"]),
8 | ("S", ["A", "B"]),
9 | ("A", ["a", "A"]),
10 | ("A", ["ε"]),
11 | ("B", ["b", "B"]),
12 | ("B", ["ε"]),
13 | ]
14 |
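
Each entry in the grammar list is a (left-hand side, right-hand-side symbols) pair, with S' as the start symbol. A minimal, hypothetical helper that just prints the productions in the usual arrow notation:

def dump_productions(grammar):
    # Print every production as "LHS -> sym1 sym2 ...".
    for lhs, rhs in grammar:
        print(f"{lhs} -> {' '.join(rhs)}")

dump_productions(grammar)  # S' -> S, S -> A B, A -> a A, A -> ε, ...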
--------------------------------------------------------------------------------
/format_out/grammer/json.ebnf:
--------------------------------------------------------------------------------
1 | root ::= basic_array | basic_object
2 | basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
3 | basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
4 | basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
5 | basic_string ::= (([\"] basic_string_1 [\"]))
6 | basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
7 | escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]
8 | basic_boolean ::= "true" | "false"
9 | basic_null ::= "null"
10 | basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
11 | basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
12 | ws ::= [ \n\t]*
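
The basic_number rule above is an ordinary JSON number. As a rough illustration (not the project's parser), the same rule can be mirrored with a regular expression and tried against a few inputs:

import re

# Regex transliteration of:  ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
BASIC_NUMBER = re.compile(r'(0|-?[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?')

for text in ["0", "-12.5e3", "007", "1."]:
    print(text, bool(BASIC_NUMBER.fullmatch(text)))
# 0 and -12.5e3 match; 007 and 1. do not.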
--------------------------------------------------------------------------------
/format_out/grammer/test.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | python test0.py
3 | python test1.py
4 | python test2.py
5 | python test3.py
6 | python test4.py
7 | python test5.py
8 | python test6.py
9 |
--------------------------------------------------------------------------------
/lightllm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/all_kernel_configs/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
2 |
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}}
2 |
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "2048": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "512": {"BLOCK_M": 4, "BLOCK_N": 64, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 16}, "4096": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 16}, "64": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "512": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 16}, "2048": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "8": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 2}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 256, "num_warps": 2}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "128": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 2}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "512": {"BLOCK_M": 32, "BLOCK_N": 128, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 8, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 2}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 5}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}}
--------------------------------------------------------------------------------
/lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json:
--------------------------------------------------------------------------------
1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}}
--------------------------------------------------------------------------------
/lightllm/common/basemodel/__init__.py:
--------------------------------------------------------------------------------
1 | from .layer_weights.base_layer_weight import BaseLayerWeight
2 | from .layer_weights.pre_and_post_layer_weight import PreAndPostLayerWeight
3 | from .layer_weights.transformer_layer_weight import TransformerLayerWeight
4 | from .layer_infer.base_layer_infer import BaseLayerInfer
5 | from .layer_infer.pre_layer_infer import PreLayerInfer
6 | from .layer_infer.post_layer_infer import PostLayerInfer
7 | from .layer_infer.transformer_layer_infer import TransformerLayerInfer
8 | from .layer_infer.template.transformer_layer_infer_template import TransformerLayerInferTpl
9 | from .layer_infer.template.pre_layer_infer_template import PreLayerInferTpl
10 | from .layer_infer.template.post_layer_infer_template import PostLayerInferTpl
11 | from .infer_struct import InferStateInfo
12 | from .basemodel import TpPartBaseModel
13 |
14 |
15 | __all__ = [
16 | "BaseLayerWeight",
17 | "PreAndPostLayerWeight",
18 | "TransformerLayerWeight",
19 | "BaseLayerInfer",
20 | "PreLayerInfer",
21 | "PostLayerInfer",
22 | "TransformerLayerInfer",
23 | "TransformerLayerInferTpl",
24 | "InferStateInfo",
25 | "TpPartBaseModel",
26 | "PreLayerInferTpl",
27 | "PostLayerInferTpl",
28 | ]
29 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/post_layer_infer.py:
--------------------------------------------------------------------------------
1 | from .base_layer_infer import BaseLayerInfer
2 |
3 |
4 | class PostLayerInfer(BaseLayerInfer):
5 | """ """
6 |
7 | def __init__(self, network_config, mode):
8 | super().__init__()
9 | self.network_config_ = network_config
10 | self.mode = mode
11 | return
12 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/pre_layer_infer.py:
--------------------------------------------------------------------------------
1 | from .base_layer_infer import BaseLayerInfer
2 |
3 |
4 | class PreLayerInfer(BaseLayerInfer):
5 | """ """
6 |
7 | def __init__(self, network_config, mode):
8 | super().__init__()
9 | self.network_config_ = network_config
10 | self.mode = mode
11 | return
12 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/template/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/template/post_layer_infer_template.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from ..post_layer_infer import PostLayerInfer
3 | from typing import Tuple
4 |
5 |
6 | class PostLayerInferTpl(PostLayerInfer):
7 | """ """
8 |
9 | def __init__(self, network_config, mode):
10 | super().__init__(network_config, mode)
11 | self.eps_ = 1e-5
12 | self.vocab_size_ = network_config["vocab_size"]
13 | self.embed_dim_ = network_config["n_embed"]
14 | return
15 |
16 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor:
17 | raise NotImplementedError("subclasses must implement _norm")
18 |
19 | def _slice_get_last_input(self, input, infer_state) -> Tuple[torch.Tensor, int]:
20 | raise NotImplementedError("subclasses must implement _slice_get_last_input")
21 |
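
The template above deliberately leaves `_norm` and `_slice_get_last_input` unimplemented; model-specific subclasses fill them in. The sketch below is only illustrative: the RMSNorm formulation, the `final_norm_weight_` attribute, and the use of `infer_state.b_seq_len` are assumptions for the example, not the actual lightllm model implementations.

import torch
from typing import Tuple


class ExamplePostLayerInfer(PostLayerInferTpl):
    """Illustrative subclass: RMSNorm the hidden states, then keep each request's last token."""

    def _norm(self, input, infer_state, layer_weight) -> torch.Tensor:
        # Assumes layer_weight exposes a final-norm weight tensor (hypothetical attribute name).
        variance = input.pow(2).mean(-1, keepdim=True)
        return input * torch.rsqrt(variance + self.eps_) * layer_weight.final_norm_weight_

    def _slice_get_last_input(self, input, infer_state) -> Tuple[torch.Tensor, int]:
        # Assumes prefill tokens are packed back-to-back and b_seq_len holds per-request lengths.
        last_index = torch.cumsum(infer_state.b_seq_len, dim=0) - 1
        return input[last_index], last_index.shape[0]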
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/template/pre_layer_infer_template.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from ..pre_layer_infer import PreLayerInfer
3 |
4 |
5 | class PreLayerInferTpl(PreLayerInfer):
6 | """ """
7 |
8 | def __init__(self, network_config, mode):
9 | super().__init__(network_config, mode)
10 | self.eps_ = 1e-5
11 | self.vob_start_id_ = -1
12 | self.vob_end_id_ = -1
13 | return
14 |
15 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor:
16 | raise NotImplementedError("subclasses must implement _norm")
17 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_infer/transformer_layer_infer.py:
--------------------------------------------------------------------------------
1 | from .base_layer_infer import BaseLayerInfer
2 |
3 |
4 | class TransformerLayerInfer(BaseLayerInfer):
5 | """ """
6 |
7 | def __init__(self, layer_num, network_config, mode):
8 | super().__init__()
9 | self.layer_num_ = layer_num
10 | self.network_config_ = network_config
11 | self.mode = mode
12 | return
13 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_weight import BaseWeight
2 | from .mm_weight import (
3 | MMWeightTpl,
4 | MultiMMWeightTpl,
5 | ROWMMWeight,
6 | COLMMWeight,
7 | MultiROWMMWeight,
8 | ROWBMMWeight,
9 | )
10 | from .norm_weight import NormWeight, GEMMANormWeight, TpNormWeight
11 | from .fused_moe_weight_tp import FusedMoeWeightTP
12 | from .fused_moe_weight_ep import FusedMoeWeightEP
13 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py:
--------------------------------------------------------------------------------
1 | from .mm_weight import (
2 | MMWeightTpl,
3 | MultiMMWeightTpl,
4 | )
5 | from .rowmm_weight import (
6 | ROWMMWeight,
7 | ROWBMMWeight,
8 | MultiROWMMWeight,
9 | W8A8B128ROWMMWeight,
10 | W8A8B128ROWBMMWeight,
11 | W8A8B128MultiROWMMWeight,
12 | )
13 | from .colmm_weight import (
14 | COLMMWeight,
15 | W8A8B128COLMMWeight,
16 | )
17 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | from .base_layer_weight import BaseLayerWeight
2 |
3 |
4 | class PreAndPostLayerWeight(BaseLayerWeight):
5 | def __init__(self, data_type, network_config, mode):
6 | super().__init__()
7 | self.data_type_ = data_type
8 | self.network_config_ = network_config
9 | self.mode = mode
10 | self.init_static_params()
11 | return
12 |
--------------------------------------------------------------------------------
/lightllm/common/basemodel/microbatch_overlap_objs.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from dataclasses import dataclass
3 |
4 |
5 | @dataclass
6 | class DecodeMicroBatch:
7 | batch_size: int
8 | total_token_num: int
9 | max_len_in_batch: int
10 | input_ids: torch.Tensor
11 | mem_indexes: torch.Tensor
12 | b_req_idx: torch.Tensor
13 | b_seq_len: torch.Tensor
14 |
15 |
16 | @dataclass
17 | class PrefillMicroBatch:
18 | batch_size: int
19 | total_token_num: int
20 | max_len_in_batch: int
21 | input_ids: torch.Tensor
22 | mem_indexes: torch.Tensor
23 | b_req_idx: torch.Tensor
24 | b_seq_len: torch.Tensor
25 | b_ready_cache_len: torch.Tensor
26 | multimodal_params: list
27 |
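
Both dataclasses are plain containers passed along the micro-batch overlap path; they carry no logic of their own. A small construction example with dummy values (shapes and numbers are purely illustrative):

import torch

# Two requests decoding one token each; five tokens are already resident in the KV cache.
decode_mb = DecodeMicroBatch(
    batch_size=2,
    total_token_num=5,
    max_len_in_batch=3,
    input_ids=torch.tensor([101, 102]),   # one new token id per request
    mem_indexes=torch.tensor([7, 8]),     # KV-cache slots allocated for the new tokens
    b_req_idx=torch.tensor([0, 1]),       # request slots in the request manager
    b_seq_len=torch.tensor([3, 2]),       # per-request sequence length including the new token
)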
--------------------------------------------------------------------------------
/lightllm/common/basemodel/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/basemodel/triton_kernel/add_in_place.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import triton
4 | import triton.language as tl
5 |
6 |
7 | @triton.jit
8 | def _add_in_place(
9 | input_ptr,
10 | other_ptr,
11 | n_elements,
12 | alpha,
13 | BLOCK_SIZE: tl.constexpr,
14 | ):
15 | pid = tl.program_id(axis=0)
16 | block_start = pid * BLOCK_SIZE
17 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
18 | mask = offsets < n_elements
19 | x = tl.load(input_ptr + offsets, mask=mask)
20 | y = tl.load(other_ptr + offsets, mask=mask)
21 | x = x + y * alpha
22 | tl.store(input_ptr + offsets, x, mask=mask)
23 |
24 |
25 | @torch.no_grad()
26 | def add_in_place(input: torch.Tensor, other: torch.Tensor, *, alpha=1):
27 | assert input.is_contiguous(), "input tensor must be contiguous"
28 | assert other.is_contiguous(), "other tensor must be contiguous"
29 | n_elements = input.numel()
30 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
31 | _add_in_place[grid](
32 | input,
33 | other,
34 | n_elements,
35 | alpha,
36 | BLOCK_SIZE=1024,
37 | )
38 | return input
39 |
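
`add_in_place` mirrors `input.add_(other, alpha=...)` but performs the update in a single Triton kernel over the flattened, contiguous tensors. A quick check against the PyTorch reference (a CUDA device is required, since the Triton kernel runs on GPU):

import torch

x = torch.randn(4096, device="cuda")
y = torch.randn(4096, device="cuda")
ref = x.clone().add_(y, alpha=0.5)

add_in_place(x, y, alpha=0.5)  # updates x in place and also returns it
assert torch.allclose(x, ref)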
--------------------------------------------------------------------------------
/lightllm/common/basemodel/triton_kernel/copy_kv_index_to_req.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import triton
4 | import triton.language as tl
5 |
6 |
7 | @triton.jit
8 | def _fwd_kernel_copy_kv_index_to_req(
9 | req_to_token_indexs, b_req_idx, b_seq_len, memindex,
10 | stride_req_to_token_b, stride_req_to_token_s
11 | ):
12 | cur_index = tl.program_id(0)
13 | cur_req_idx = tl.load(b_req_idx + cur_index)
14 | cur_token_index = tl.load(memindex + cur_index)
15 | cur_seq_len = tl.load(b_seq_len + cur_index)
16 | dest_offset = req_to_token_indexs + cur_req_idx * stride_req_to_token_b + (cur_seq_len - 1) * stride_req_to_token_s
17 | tl.store(dest_offset, cur_token_index)
18 | return
19 |
20 |
21 | @torch.no_grad()
22 | def copy_kv_index_to_req(req_to_token_indexs, b_req_idx, b_seq_len, memindex):
23 | seq_len = b_seq_len.shape[0]
24 | assert b_seq_len.shape[0] == memindex.shape[0] and b_req_idx.shape[0] == b_seq_len.shape[0]
25 | grid = (seq_len,)
26 | num_warps = 1
27 |
28 | _fwd_kernel_copy_kv_index_to_req[grid](
29 | req_to_token_indexs, b_req_idx, b_seq_len, memindex,
30 | req_to_token_indexs.stride(0), req_to_token_indexs.stride(1),
31 | num_warps=num_warps,
32 | num_stages=1,
33 | )
34 | return
35 |
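
One kernel instance runs per request: it writes that request's newly allocated KV-cache slot, `memindex[i]`, into `req_to_token_indexs[b_req_idx[i], b_seq_len[i] - 1]`, i.e. the column of the request's latest token. A small GPU example:

import torch

req_to_token_indexs = torch.zeros((4, 8), dtype=torch.int64, device="cuda")
b_req_idx = torch.tensor([0, 2], device="cuda")   # two active requests
b_seq_len = torch.tensor([3, 5], device="cuda")   # their current sequence lengths
memindex = torch.tensor([11, 12], device="cuda")  # freshly allocated KV-cache slots

copy_kv_index_to_req(req_to_token_indexs, b_req_idx, b_seq_len, memindex)
# Now req_to_token_indexs[0, 2] == 11 and req_to_token_indexs[2, 4] == 12.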
--------------------------------------------------------------------------------
/lightllm/common/basemodel/triton_kernel/gen_decode_params.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 | from .gen_prefill_params import gen_cumsum_pad0_tensor
5 |
6 |
7 | @torch.no_grad()
8 | def gen_decode_params(b_seq_len: torch.Tensor):
9 | b_kv_seq_len = b_seq_len
10 | position_ids = b_seq_len - 1
11 | b_q_seq_len = torch.ones_like(b_seq_len)
12 | b1_cu_q_seq_len, b1_cu_kv_seq_len = gen_cumsum_pad0_tensor(b_q_seq_len, b_kv_seq_len)
13 | max_q_seq_len = b_q_seq_len.max().item()
14 | max_kv_seq_len = b_kv_seq_len.max().item()
15 | return b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len, position_ids, max_q_seq_len, max_kv_seq_len
16 |
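A short sketch of the decode-step parameters for a small batch (values invented; gen_cumsum_pad0_tensor presumably returns cumulative lengths with a leading zero):

import torch
from lightllm.common.basemodel.triton_kernel.gen_decode_params import gen_decode_params

b_seq_len = torch.tensor([5, 9, 3], dtype=torch.int32, device="cuda")
(b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len,
 position_ids, max_q_seq_len, max_kv_seq_len) = gen_decode_params(b_seq_len)

# During decode every request contributes exactly one query token, so
# b_q_seq_len is all ones and position_ids equals b_seq_len - 1.
assert max_q_seq_len == 1 and max_kv_seq_len == 9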
--------------------------------------------------------------------------------
/lightllm/common/build_utils.py:
--------------------------------------------------------------------------------
1 |
2 | def repair_config(config, same_names):
3 | find_value = None
4 | for name in same_names:
5 | if name in config and config[name] is not None:
6 | find_value = config[name]
7 | break
8 | for name in same_names:
9 | config[name] = find_value
10 | return
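A small example of how repair_config normalizes aliased config keys (values are illustrative):

from lightllm.common.build_utils import repair_config

config = {"rms_norm_eps": None, "layer_norm_eps": 1e-5}
repair_config(config, same_names=["rms_norm_eps", "layer_norm_eps", "layer_norm_epsilon"])
# The first non-None value found is copied to every alias:
# {"rms_norm_eps": 1e-05, "layer_norm_eps": 1e-05, "layer_norm_epsilon": 1e-05}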
--------------------------------------------------------------------------------
/lightllm/common/deepseek2_fp8kv_mem_manager.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .deepseek2_mem_manager import Deepseek2MemoryManager
3 |
4 |
5 | class Deepseek2FP8KVMemoryManager(Deepseek2MemoryManager):
6 | def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9):
7 |         # The quantization scale is appended to the end of kv_buffer, hence head_dim + 2; the dtype is unified to uint8.
8 | super().__init__(size, torch.uint8, head_num, head_dim + 2, layer_num, always_copy, mem_fraction)
9 |
--------------------------------------------------------------------------------
/lightllm/common/fused_moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/fused_moe/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/infer_utils.py:
--------------------------------------------------------------------------------
1 | def init_req_to_token_indexes(
2 | req_to_token_indexs, b_req_idx, b_seq_len, b_ready_cache_len, max_len_in_batch, alloc_mem_index
3 | ):
4 | start_index = 0
5 | b_seq_len_numpy = b_seq_len.cpu().numpy()
6 | b_ready_cache_len_numpy = b_ready_cache_len.cpu().numpy()
7 | b_req_idx_numpy = b_req_idx.cpu().numpy()
8 | for i in range(len(b_seq_len)):
9 | cur_seq_len = b_seq_len_numpy[i]
10 | cur_ready_cache_len = b_ready_cache_len_numpy[i]
11 | req_to_token_indexs[b_req_idx_numpy[i], cur_ready_cache_len:cur_seq_len] = alloc_mem_index[
12 | start_index : start_index + cur_seq_len - cur_ready_cache_len
13 | ]
14 | start_index += cur_seq_len - cur_ready_cache_len
15 | return
16 |
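A hedged sketch of filling the request-to-token table for a prefill batch (tensor sizes are invented; note that max_len_in_batch is accepted but not used in the body above):

import torch
from lightllm.common.infer_utils import init_req_to_token_indexes

req_to_token_indexs = torch.zeros((4, 32), dtype=torch.int32, device="cuda")
b_req_idx = torch.tensor([0, 2], dtype=torch.int32)
b_seq_len = torch.tensor([4, 3], dtype=torch.int32)
b_ready_cache_len = torch.tensor([1, 0], dtype=torch.int32)              # tokens already cached per request
alloc_mem_index = torch.arange(6, dtype=torch.int32, device="cuda")      # (4 - 1) + (3 - 0) new kv slots

init_req_to_token_indexes(
    req_to_token_indexs, b_req_idx, b_seq_len, b_ready_cache_len,
    max_len_in_batch=4, alloc_mem_index=alloc_mem_index,
)
# Request 0 gets slots 0..2 at positions 1..3; request 2 gets slots 3..5 at positions 0..2.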
--------------------------------------------------------------------------------
/lightllm/common/kv_trans_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/kv_trans_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/mem_utils.py:
--------------------------------------------------------------------------------
1 | from lightllm.common.mem_manager import MemoryManager
2 | from lightllm.common.int8kv_mem_manager import INT8KVMemoryManager
3 | from lightllm.common.ppl_int8kv_mem_manager import PPLINT8KVMemoryManager
4 | from lightllm.common.ppl_int4kv_mem_manager import PPLINT4KVMemoryManager
5 | from lightllm.utils.log_utils import init_logger
6 |
7 | logger = init_logger(__name__)
8 |
9 |
10 | def select_mem_manager_class(mode):
11 | logger.info(f"mode setting params: {mode}")
12 | if "ppl_int8kv" in mode or "ppl_int8kv_flashdecoding" in mode:
13 | memory_manager_class = PPLINT8KVMemoryManager
14 | logger.info(f"Model kv cache using mode {mode}")
15 | elif "ppl_int4kv_flashdecoding" in mode:
16 | memory_manager_class = PPLINT4KVMemoryManager
17 | logger.info(f"Model kv cache using mode {mode}")
18 | elif "triton_int8kv" in mode:
19 | memory_manager_class = INT8KVMemoryManager
20 | logger.info("Model kv cache using mode triton int8kv")
21 | elif "triton_fp8kv" in mode:
22 | raise Exception("currently only for deepseek")
23 | else:
24 | memory_manager_class = MemoryManager
25 | logger.info("Model kv cache using mode normal")
26 | return memory_manager_class
27 |
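A usage sketch for the selector above (mode here is an illustrative list; it presumably mirrors the server's --mode setting):

from lightllm.common.mem_utils import select_mem_manager_class

cls = select_mem_manager_class(mode=["triton_int8kv"])
print(cls.__name__)   # INT8KVMemoryManager

cls = select_mem_manager_class(mode=[])
print(cls.__name__)   # MemoryManager, the default kv cache manager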
--------------------------------------------------------------------------------
/lightllm/common/quantization/configs/llamacls-mix-down.yaml:
--------------------------------------------------------------------------------
1 | quant_type: vllm-w8a8
2 | mix_bits:
3 | - name: "down_proj"
4 | quant_type: "none"
5 |     layer_nums: [1, 2, 3] # If omitted, the rule applies to all layers; otherwise list the layer numbers it covers.
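The mixed-bit config above could be read roughly like this (a hedged sketch, not the loader the repository actually uses):

import yaml

with open("lightllm/common/quantization/configs/llamacls-mix-down.yaml") as f:
    cfg = yaml.safe_load(f)

default_quant = cfg["quant_type"]            # "vllm-w8a8" for layers not matched by a rule
for rule in cfg.get("mix_bits", []):
    # Here layers 1-3 keep "down_proj" unquantized (quant_type "none").
    print(rule["name"], rule["quant_type"], rule.get("layer_nums"))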
--------------------------------------------------------------------------------
/lightllm/common/quantization/quantize_method.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from abc import ABC, abstractmethod
3 | from lightllm.utils.dist_utils import get_current_device_id
4 |
5 |
6 | class QuantizationMethod(ABC):
7 | def __init__(self):
8 | super().__init__()
9 | self.device_id_ = get_current_device_id()
10 | self.weight_scale_suffix = None
11 | self.act_scale_suffix = None
12 |
13 | @abstractmethod
14 | def quantize(self, weights: torch.Tensor):
15 | pass
16 |
17 | @abstractmethod
18 | def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True):
19 | pass
20 |
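A minimal concrete subclass, only to illustrate the abstract interface (the class name and its no-op behaviour are invented; constructing it assumes the usual lightllm device environment):

import torch
from lightllm.common.quantization.quantize_method import QuantizationMethod

class NoopQuantizationMethod(QuantizationMethod):
    """Keeps weights in their original dtype; a template, not a real scheme."""

    def quantize(self, weights: torch.Tensor):
        # A real implementation would return quantized weights plus their scales.
        return weights

    def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True):
        # A plain matmul stands in for a fused quantized GEMM.
        result = torch.mm(input_tensor, weight)
        if bias is not None:
            result = result + bias
        return result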
--------------------------------------------------------------------------------
/lightllm/common/quantization/registry.py:
--------------------------------------------------------------------------------
1 | class QuantMethodFactory:
2 | def __init__(self):
3 | self._quant_methods = {}
4 |
5 | def register(self, names):
6 | def decorator(cls):
7 | local_names = names
8 | if isinstance(local_names, str):
9 | local_names = [local_names]
10 | for n in local_names:
11 | self._quant_methods[n] = cls
12 |             return cls
13 |
14 | return decorator
15 |
16 | def get(self, key, *args, **kwargs):
17 | if key == "none":
18 | return None
19 | quant_method_class = self._quant_methods.get(key)
20 | if not quant_method_class:
21 | raise ValueError(f"QuantMethod '{key}' not supported.")
22 | return quant_method_class()
23 |
24 |
25 | QUANTMETHODS = QuantMethodFactory()
26 |
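Registering and fetching a method through the factory could look like this (the "noop" name and class are purely illustrative):

from lightllm.common.quantization.registry import QUANTMETHODS

@QUANTMETHODS.register("noop")
class NoopQuantMethod:
    def quantize(self, weights):
        return weights

quant = QUANTMETHODS.get("noop")          # returns a fresh NoopQuantMethod instance
assert QUANTMETHODS.get("none") is None   # "none" is special-cased to disable quantization
# QUANTMETHODS.get("unknown") raises ValueError.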
--------------------------------------------------------------------------------
/lightllm/common/quantization/triton_quant/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/__init__.py
--------------------------------------------------------------------------------
/lightllm/common/quantization/triton_quant/fp8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/fp8/__init__.py
--------------------------------------------------------------------------------
/lightllm/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 |
--------------------------------------------------------------------------------
/lightllm/models/bloom/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/bloom/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/bloom/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/bloom/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/chatglm2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/chatglm2/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/chatglm2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/chatglm2/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/cohere/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/cohere/infer_struct.py:
--------------------------------------------------------------------------------
1 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo
2 |
3 |
4 | class CohereInferStateInfo(LlamaInferStateInfo):
5 | def __init__(self):
6 | super().__init__()
7 | self._attn_out = None
8 | self._ffn_out = None
9 |
--------------------------------------------------------------------------------
/lightllm/models/cohere/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/cohere/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/cohere/triton_kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/triton_kernels/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/deepseek2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/deepseek2/infer_struct.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | import torch.distributed as dist
5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo
6 |
7 |
8 | class Deepseek2InferStateInfo(LlamaInferStateInfo):
9 | def __init__(self):
10 | super().__init__()
11 | self.kv_starts = None
12 |
13 | def init_some_extra_state(self, model, input_ids: torch.Tensor):
14 | super().init_some_extra_state(model, input_ids)
15 | if not self.is_prefill:
16 | self.kv_starts = self.b1_cu_kv_seq_len
17 |
18 | if self.is_prefill:
19 | self.b1_kv_start_loc = self.b1_cu_kv_seq_len
20 | self.max_value_in_b_seq_len = self.b_seq_len.max().item()
21 | return
22 |
--------------------------------------------------------------------------------
/lightllm/models/deepseek2/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/deepseek2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/deepseek2/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma3/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma3/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
4 |
5 |
6 | # Add keys: language_model.xxx -> xxx
7 | # Only rename keys when loading PreAndPostLayerWeight; the TransformerLayerWeight keys are already correct.
8 | def rename_weight_keys(weights):
9 | prefix = "language_model."
10 | keys = list(weights.keys())
11 | for k in keys:
12 | if prefix in k:
13 | weights[k[len(prefix) :]] = weights[k]
14 |
15 |
16 | class Gemma3PreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
17 | def __init__(self, data_type, network_config, mode):
18 |         network_config["tie_word_embeddings"] = True
19 | super().__init__(data_type, network_config, mode)
20 | return
21 |
22 | def load_hf_weights(self, weights):
23 | rename_weight_keys(weights)
24 | super().load_hf_weights(weights)
25 | return
26 |
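A tiny illustration of the key renaming above (the dictionary contents are invented):

from lightllm.models.gemma3.layer_weights.pre_and_post_layer_weight import rename_weight_keys

weights = {"language_model.model.norm.weight": 1, "vision_tower.patch_embed.weight": 2}
rename_weight_keys(weights)
# The language-model entry becomes reachable without the prefix; the original key
# is kept and the vision key is left untouched.
assert "model.norm.weight" in weights and "language_model.model.norm.weight" in weights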
--------------------------------------------------------------------------------
/lightllm/models/gemma_2b/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma_2b/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma_2b/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
4 |
5 |
6 | class Gemma_2bPreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
7 | def __init__(self, data_type, network_config, mode):
8 | super().__init__(data_type, network_config, mode)
9 | return
10 |
11 | def load_hf_weights(self, weights):
12 | vob_size = self.network_config_["vocab_size"]
13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64)
14 | split_start = split_indexes[self.tp_rank_]
15 | split_end = split_indexes[self.tp_rank_ + 1]
16 | if "model.embed_tokens.weight" in weights:
17 | # print(weights['model.embed_tokens.weight'].shape)
18 | self.wte_weight_ = self._cuda(weights["model.embed_tokens.weight"][split_start:split_end, :])
19 | self.lm_head_weight_ = self.wte_weight_
20 |
21 | if "model.norm.weight" in weights:
22 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"])
23 | self.final_norm_weight_ = self.final_norm_weight_ + 1
24 |
25 | return
26 |
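The tensor-parallel vocabulary split used in load_hf_weights can be previewed numerically (world size and vocab size are arbitrary here):

import numpy as np

vob_size, tp_world_size = 256000, 4
split_indexes = np.linspace(0, vob_size, tp_world_size + 1, dtype=np.int64)
# array([0, 64000, 128000, 192000, 256000]); rank r loads rows split_indexes[r]:split_indexes[r + 1]
for tp_rank in range(tp_world_size):
    print(tp_rank, split_indexes[tp_rank], split_indexes[tp_rank + 1])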
--------------------------------------------------------------------------------
/lightllm/models/gemma_2b/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm/layer_weights/transformer_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import math
3 | import numpy as np
4 |
5 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
6 |
7 |
8 | class InternlmTransformerLayerWeight(LlamaTransformerLayerWeight):
9 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None):
10 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg)
11 | return
12 |
13 | def _init_weight_names(self):
14 | super()._init_weight_names()
15 | self._q_bias_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.bias"
16 | self._k_bias_name = f"model.layers.{self.layer_num_}.self_attn.k_proj.bias"
17 | self._v_bias_name = f"model.layers.{self.layer_num_}.self_attn.v_proj.bias"
18 | self._o_bias_name = f"model.layers.{self.layer_num_}.self_attn.o_proj.bias"
19 |
--------------------------------------------------------------------------------
/lightllm/models/internlm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | from lightllm.models.registry import ModelRegistry
5 | from lightllm.models.internlm.layer_weights.transformer_layer_weight import InternlmTransformerLayerWeight
6 | from lightllm.models.llama.model import LlamaTpPartModel
7 |
8 |
9 | @ModelRegistry("internlm")
10 | class InternlmTpPartModel(LlamaTpPartModel):
11 | # weight class
12 | transformer_weight_class = InternlmTransformerLayerWeight
13 |
14 | def __init__(self, kvargs):
15 | super().__init__(kvargs)
16 |
--------------------------------------------------------------------------------
/lightllm/models/internlm2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
4 |
5 |
6 | class Internlm2PreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
7 | def __init__(self, data_type, network_config, mode):
8 | super().__init__(data_type, network_config, mode)
9 | return
10 |
11 | def load_hf_weights(self, weights):
12 | vob_size = self.network_config_["vocab_size"]
13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64)
14 | split_start = split_indexes[self.tp_rank_]
15 | split_end = split_indexes[self.tp_rank_ + 1]
16 | if "model.tok_embeddings.weight" in weights:
17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :])
18 | if "output.weight" in weights:
19 | self.lm_head_weight_ = self._cuda(weights["output.weight"][split_start:split_end, :])
20 | if "model.norm.weight" in weights:
21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"])
22 |
23 | return
24 |
--------------------------------------------------------------------------------
/lightllm/models/internlm2/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 |
5 | from lightllm.models.registry import ModelRegistry
6 | from lightllm.models.internlm2.layer_weights.transformer_layer_weight import Internlm2TransformerLayerWeight
7 | from lightllm.models.internlm2.layer_weights.pre_and_post_layer_weight import Internlm2PreAndPostLayerWeight
8 | from lightllm.models.internlm.model import InternlmTpPartModel
9 |
10 |
11 | @ModelRegistry("internlm2")
12 | class Internlm2TpPartModel(InternlmTpPartModel):
13 | # weight class
14 | pre_and_post_weight_class = Internlm2PreAndPostLayerWeight
15 | transformer_weight_class = Internlm2TransformerLayerWeight
16 |
17 | def __init__(self, kvargs):
18 | super().__init__(kvargs)
19 |
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/layer_infer/post_layer_infer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | import numpy as np
4 |
5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo
6 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer
7 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
8 | from einops import rearrange
9 |
10 |
11 | class Internlm2RewardPostLayerInfer(LlamaPostLayerInfer):
12 | def token_forward(self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight):
13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state)
14 |
15 | input_embdings = None
16 | last_input = self._norm(last_input, infer_state, layer_weight)
17 | score = torch.mm(last_input, layer_weight.lm_head_weight_)
18 |
19 | return score
20 |
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
4 |
5 |
6 | class Internlm2RewardPreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
7 | def __init__(self, data_type, network_config, mode):
8 | super().__init__(data_type, network_config, mode)
9 | return
10 |
11 | def load_hf_weights(self, weights):
12 | vob_size = self.network_config_["vocab_size"]
13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64)
14 | split_start = split_indexes[self.tp_rank_]
15 | split_end = split_indexes[self.tp_rank_ + 1]
16 | if "model.tok_embeddings.weight" in weights:
17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :])
18 | if "v_head.weight" in weights:
19 | self.lm_head_weight_ = self._cuda(weights["v_head.weight"]).transpose(0, 1)
20 | if "model.norm.weight" in weights:
21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"])
22 |
23 | return
24 |
--------------------------------------------------------------------------------
/lightllm/models/internlm2_reward/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | from lightllm.models.registry import ModelRegistry, is_reward_model
5 | from lightllm.models.internlm2_reward.layer_infer.post_layer_infer import Internlm2RewardPostLayerInfer
6 | from lightllm.models.internlm2_reward.layer_weights.pre_and_post_layer_weight import (
7 | Internlm2RewardPreAndPostLayerWeight,
8 | )
9 | from lightllm.models.internlm2.model import Internlm2TpPartModel
10 |
11 |
12 | @ModelRegistry("internlm2", condition=is_reward_model())
13 | class Internlm2RewardTpPartModel(Internlm2TpPartModel):
14 | # weight class
15 | pre_and_post_weight_class = Internlm2RewardPreAndPostLayerWeight
16 |
17 | post_layer_infer_class = Internlm2RewardPostLayerInfer
18 |
19 | def __init__(self, kvargs):
20 | super().__init__(kvargs)
21 |
--------------------------------------------------------------------------------
/lightllm/models/internvl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/internvl/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llama/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llama/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llama/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llama/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llama/yarn_rotary_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 |
5 | # Inverse dim formula to find dim based on number of rotations
6 | def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
7 | return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
8 |
9 |
10 | # Find dim range bounds based on rotations
11 | def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048):
12 | low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
13 | high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
14 | return max(low, 0), min(high, dim - 1) # Clamp values just in case
15 |
16 |
17 | def linear_ramp_mask(min, max, dim):
18 | if min == max:
19 | max += 0.001 # Prevent singularity
20 |
21 | linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
22 | ramp_func = torch.clamp(linear_func, 0, 1)
23 | return ramp_func
24 |
25 |
26 | def get_mscale(scale=1):
27 | if scale <= 1:
28 | return 1.0
29 | return 0.1 * math.log(scale) + 1.0
30 |
31 |
32 | def get_deepseek_mscale(scale=1, mscale=1):
33 | if scale <= 1:
34 | return 1.0
35 | return 0.1 * mscale * math.log(scale) + 1.0
36 |
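A worked sketch of how these helpers are usually combined for YaRN-style frequency blending (the blending formula below is the common YaRN pattern, stated here as an assumption rather than a quote from this repository):

import torch
from lightllm.models.llama.yarn_rotary_utils import find_correction_range, linear_ramp_mask, get_mscale

dim, base, scale = 128, 10000.0, 4.0
pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
inv_freq_extrapolation = 1.0 / pos_freqs             # original RoPE frequencies
inv_freq_interpolation = 1.0 / (scale * pos_freqs)   # frequencies stretched for longer context

low, high = find_correction_range(32, 1, dim, base, max_position_embeddings=2048)
mask = 1.0 - linear_ramp_mask(low, high, dim // 2)   # 1 -> keep original, 0 -> interpolate
inv_freq = inv_freq_interpolation * (1 - mask) + inv_freq_extrapolation * mask

attn_scale = get_mscale(scale)                       # ~1.14 attention scaling for scale = 4
print(inv_freq.shape, attn_scale)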
--------------------------------------------------------------------------------
/lightllm/models/llava/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llava/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/llava/layer_weights/pre_and_post_layer_weight.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight
4 |
5 |
6 | # Add keys: language_model.xxx -> xxx
7 | # Only rename keys when loading PreAndPostLayerWeight; the TransformerLayerWeight keys are already correct.
8 | def rename_weight_keys(weights):
9 | prefix = "language_model."
10 | keys = list(weights.keys())
11 | for k in keys:
12 | if prefix in k:
13 | weights[k[len(prefix) :]] = weights[k]
14 |
15 |
16 | class LlavaPreAndPostLayerWeight(LlamaPreAndPostLayerWeight):
17 | def __init__(self, data_type, network_config, mode):
18 | super().__init__(data_type, network_config, mode)
19 | return
20 |
21 | def load_hf_weights(self, weights):
22 | rename_weight_keys(weights)
23 | super().load_hf_weights(weights)
24 | return
25 |
--------------------------------------------------------------------------------
/lightllm/models/minicpm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/minicpm/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/minicpm/layer_weights/transformer_layer_weight.py:
--------------------------------------------------------------------------------
1 | import math
2 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
3 |
4 |
5 | class MiniCPMTransformerLayerWeight(LlamaTransformerLayerWeight):
6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None):
7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg)
8 | return
9 |
10 | def _parse_config(self):
11 | super()._parse_config()
12 | num_hidden_layers = self.network_config_["num_hidden_layers"]
13 | scale_depth = self.network_config_.get("scale_depth", math.sqrt(num_hidden_layers))
14 | self.layer_scale = scale_depth / math.sqrt(num_hidden_layers)
15 |
16 | def load_hf_weights(self, weights):
17 | if self._o_weight_name in weights:
18 | weights[self._o_weight_name] *= self.layer_scale
19 | if self._down_weight_name in weights:
20 | weights[self._down_weight_name] *= self.layer_scale
21 | super().load_hf_weights(weights)
22 |
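For concreteness, the residual scaling computed in _parse_config (the scale_depth value below is illustrative; 1.4 is a value MiniCPM configs commonly use, and with the fallback scale_depth = sqrt(num_hidden_layers) the factor is exactly 1.0):

import math

num_hidden_layers, scale_depth = 40, 1.4
layer_scale = scale_depth / math.sqrt(num_hidden_layers)   # ~0.221
# o_proj and down_proj weights are multiplied by layer_scale before loading.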
--------------------------------------------------------------------------------
/lightllm/models/minicpm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | from lightllm.models.registry import ModelRegistry
5 | from lightllm.models.minicpm.layer_weights.transformer_layer_weight import MiniCPMTransformerLayerWeight
6 | from lightllm.models.minicpm.layer_weights.pre_and_post_layer_weight import MiniCPMPreAndPostLayerWeight
7 | from lightllm.models.llama.model import LlamaTpPartModel
8 |
9 |
10 | @ModelRegistry("minicpm")
11 | class MiniCPMTpPartModel(LlamaTpPartModel):
12 | # weight class
13 | transformer_weight_class = MiniCPMTransformerLayerWeight
14 | pre_and_post_weight_class = MiniCPMPreAndPostLayerWeight
15 |
16 | def __init__(self, kvargs):
17 | super().__init__(kvargs)
18 |
--------------------------------------------------------------------------------
/lightllm/models/mistral/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/mistral/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/mistral/layer_infer/transformer_layer_infer.py:
--------------------------------------------------------------------------------
1 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
2 |
3 |
4 | class MistralTransformerLayerInfer(LlamaTransformerLayerInfer):
5 | """ """
6 |
7 | def __init__(self, layer_num, network_config, mode=[]):
8 | super().__init__(layer_num, network_config, mode)
9 | self.head_dim_ = network_config.get("head_dim", self.head_dim_)
10 | return
11 |
--------------------------------------------------------------------------------
/lightllm/models/mistral/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/mistral/triton_kernel/init_att_sliding_window_info.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import triton
4 | import triton.language as tl
5 |
6 |
7 | @triton.jit
8 | def _fwd_kernel_init_att_window_info(
9 | b_seq_len,
10 | b_att_seq_len,
11 | batch_size,
12 | sliding_window,
13 | BLOCK_SIZE: tl.constexpr,
14 | ):
15 | cur_index = tl.program_id(0)
16 | cur_start = cur_index * BLOCK_SIZE
17 | offsets = cur_start + tl.arange(0, BLOCK_SIZE)
18 | mask = offsets < batch_size
19 |
20 | cur_seq_len = tl.load(b_seq_len + offsets, mask=mask)
21 | b_att_seq_len_data = tl.minimum(cur_seq_len, sliding_window)
22 |
23 | tl.store(b_att_seq_len + offsets, b_att_seq_len_data, mask=mask)
24 | return
25 |
26 |
27 | @torch.no_grad()
28 | def init_att_window_info_fwd(batch_size, b_seq_len, b_att_seq_len, sliding_window):
29 | # shape constraints
30 | assert batch_size == b_seq_len.shape[0] == b_att_seq_len.shape[0]
31 |
32 | BLOCK_SIZE = 32
33 | num_warps = 1
34 | grid = (triton.cdiv(batch_size, BLOCK_SIZE),)
35 |
36 | _fwd_kernel_init_att_window_info[grid](
37 | b_seq_len,
38 | b_att_seq_len,
39 | batch_size=batch_size,
40 | sliding_window=sliding_window,
41 | BLOCK_SIZE=BLOCK_SIZE,
42 | num_warps=num_warps,
43 | num_stages=1,
44 | )
45 | return
46 |
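An illustrative call of the sliding-window helper (batch values invented; a CUDA device is assumed):

import torch
from lightllm.models.mistral.triton_kernel.init_att_sliding_window_info import init_att_window_info_fwd

b_seq_len = torch.tensor([2, 4096, 9000], dtype=torch.int32, device="cuda")
b_att_seq_len = torch.empty_like(b_seq_len)
init_att_window_info_fwd(3, b_seq_len, b_att_seq_len, sliding_window=4096)
# b_att_seq_len == [2, 4096, 4096]: every entry is min(seq_len, sliding_window).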
--------------------------------------------------------------------------------
/lightllm/models/mixtral/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/mixtral/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/mixtral/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/phi3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/phi3/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/phi3/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/phi3/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | from lightllm.models.registry import ModelRegistry
5 | from lightllm.models.phi3.layer_weights.transformer_layer_weight import Phi3TransformerLayerWeight
6 | from lightllm.models.phi3.layer_infer.transformer_layer_infer import Phi3TransformerLayerInfer
7 | from lightllm.models.llama.model import LlamaTpPartModel
8 |
9 |
10 | @ModelRegistry("phi3")
11 | class Phi3TpPartModel(LlamaTpPartModel):
12 | # weight class
13 | transformer_weight_class = Phi3TransformerLayerWeight
14 |
15 | transformer_layer_infer_class = Phi3TransformerLayerInfer
16 |
17 | def __init__(self, kvargs):
18 | super().__init__(kvargs)
19 |
--------------------------------------------------------------------------------
/lightllm/models/phi3/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/lightllm/models/qwen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2_5_vl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_5_vl/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2_reward/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_reward/layer_infer/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_reward/layer_infer/post_layer_infer.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo
4 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer
5 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight
6 | from einops import rearrange
7 |
8 |
9 | class Qwen2RewardPostLayerInfer(LlamaPostLayerInfer):
10 | def token_forward(
11 | self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: Qwen2RewardPreAndPostLayerWeight
12 | ):
13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state)
14 |
15 | input_embdings = None
16 | last_input = self._norm(last_input, infer_state, layer_weight)
17 |
18 | last_input = torch.addmm(layer_weight.score_up_bias, last_input, layer_weight.score_up_weight)
19 | last_input = torch.nn.functional.relu(last_input)
20 | score = torch.addmm(layer_weight.score_down_bias, last_input, layer_weight.score_down_weight)
21 |
22 | return score
23 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_reward/layer_weights/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_reward/model.py:
--------------------------------------------------------------------------------
1 | from lightllm.models.registry import ModelRegistry, is_reward_model
2 | from lightllm.models.qwen2_reward.layer_infer.post_layer_infer import Qwen2RewardPostLayerInfer
3 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight
4 | from lightllm.models.qwen2.model import Qwen2TpPartModel
5 |
6 |
7 | @ModelRegistry("qwen2", condition=is_reward_model())
8 | class Qwen2RewardTpPartModel(Qwen2TpPartModel):
9 |
10 | pre_and_post_weight_class = Qwen2RewardPreAndPostLayerWeight
11 | post_layer_infer_class = Qwen2RewardPostLayerInfer
12 |
13 | def __init__(self, kvargs):
14 | super().__init__(kvargs)
15 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_vl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2_vl/infer_struct.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo
4 | from lightllm.common.basemodel.infer_struct import InferStateInfo
5 |
6 |
7 | class Qwen2VLInferStateInfo(LlamaInferStateInfo):
8 | def __init__(self):
9 | super().__init__()
10 | self.position_cos = None
11 | self.position_sin = None
12 |
13 | def init_some_extra_state(self, model, input_ids: torch.Tensor):
14 | InferStateInfo.init_some_extra_state(self, model, input_ids)
15 | if self.is_prefill:
16 | position_ids = self.position_ids
17 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
18 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
19 | position_ids = None
20 | else:
21 | position_ids = self.position_ids
22 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
23 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
24 | return
25 |
--------------------------------------------------------------------------------
/lightllm/models/qwen2_vl/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen2_vl/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import final
3 | from lightllm.models.registry import ModelRegistry
4 | from lightllm.models.qwen3.layer_infer.transformer_layer_infer import Qwen3TransformerLayerInfer
5 | from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight
6 | from lightllm.models.qwen2.model import Qwen2TpPartModel
7 | from lightllm.utils.log_utils import init_logger
8 |
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | @ModelRegistry("qwen3")
14 | class Qwen3TpPartModel(Qwen2TpPartModel):
15 | # weight class
16 | transformer_weight_class = Qwen3TransformerLayerWeight
17 |
18 | # infer class
19 | transformer_layer_infer_class = Qwen3TransformerLayerInfer
20 |
21 | def __init__(self, kvargs):
22 | super().__init__(kvargs)
23 | return
24 |
--------------------------------------------------------------------------------
/lightllm/models/qwen3_moe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3_moe/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3_moe/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen3_moe/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import final
3 | from lightllm.models.registry import ModelRegistry
4 | from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer
5 | from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
6 | from lightllm.models.qwen3.model import Qwen3TpPartModel
7 | from lightllm.utils.log_utils import init_logger
8 |
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | @ModelRegistry("qwen3_moe")
14 | class Qwen3MOEModel(Qwen3TpPartModel):
15 | # weight class
16 | transformer_weight_class = Qwen3MOETransformerLayerWeight
17 |
18 | # infer class
19 | transformer_layer_infer_class = Qwen3MOETransformerLayerInfer
20 |
21 | def __init__(self, kvargs):
22 | super().__init__(kvargs)
23 | return
24 |
--------------------------------------------------------------------------------
/lightllm/models/qwen_vl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/qwen_vl/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/stablelm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/stablelm/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/stablelm/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/stablelm/layer_weights/transformer_layer_weight.py:
--------------------------------------------------------------------------------
1 | from lightllm.models.qwen2.layer_weights.transformer_layer_weight import Qwen2TransformerLayerWeight
2 | from lightllm.common.basemodel.layer_weights.meta_weights import NormWeight
3 |
4 |
5 | class StablelmTransformerLayerWeight(Qwen2TransformerLayerWeight):
6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None):
7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg)
8 | return
9 |
10 | def _init_weight_names(self):
11 | super()._init_weight_names()
12 | self._att_norm_bias_name = f"model.layers.{self.layer_num_}.input_layernorm.bias"
13 | self._ffn_norm_bias_name = f"model.layers.{self.layer_num_}.post_attention_layernorm.bias"
14 |
--------------------------------------------------------------------------------
/lightllm/models/stablelm/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import torch
4 | from lightllm.models.registry import ModelRegistry
5 | from lightllm.models.stablelm.layer_infer.transformer_layer_infer import StablelmTransformerLayerInfer
6 | from lightllm.models.bloom.layer_infer.post_layer_infer import BloomPostLayerInfer
7 | from lightllm.models.stablelm.layer_weights.pre_and_post_layer_weight import StableLMPreAndPostLayerWeight
8 | from lightllm.models.stablelm.layer_weights.transformer_layer_weight import StablelmTransformerLayerWeight
9 | from lightllm.models.llama.model import LlamaTpPartModel
10 | from lightllm.common.build_utils import repair_config
11 |
12 |
13 | @ModelRegistry("stablelm")
14 | class StablelmTpPartModel(LlamaTpPartModel):
15 | # weight class
16 | pre_and_post_weight_class = StableLMPreAndPostLayerWeight
17 | transformer_weight_class = StablelmTransformerLayerWeight
18 |
19 | # infer class
20 | transformer_layer_infer_class = StablelmTransformerLayerInfer
21 | post_layer_infer_class = BloomPostLayerInfer
22 |
23 | def __init__(self, kvargs):
24 | super().__init__(kvargs)
25 |
26 | def _init_config(self):
27 | super()._init_config()
28 | repair_config(self.config, same_names=["rms_norm_eps", "layer_norm_eps", "layer_norm_epsilon"])
29 | return
30 |
--------------------------------------------------------------------------------
/lightllm/models/starcoder/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/starcoder/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/starcoder/layer_infer/transformer_layer_infer.py:
--------------------------------------------------------------------------------
1 | from lightllm.models.bloom.layer_infer.transformer_layer_infer import BloomTransformerLayerInfer
2 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
3 |
4 |
5 | class StarcoderTransformerLayerInfer(BloomTransformerLayerInfer):
6 | """ """
7 |
8 | def __init__(self, layer_num, network_config, mode=[]):
9 | super().__init__(layer_num, network_config, mode)
10 | self.tp_k_head_num_ = 1
11 | self.tp_v_head_num_ = 1
12 | self._bind_func()
13 | return
14 |
15 | def _bind_func(self):
16 | LlamaTransformerLayerInfer._bind_attention(self)
17 | return
18 |
--------------------------------------------------------------------------------
/lightllm/models/starcoder/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/starcoder2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/starcoder2/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/starcoder2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/tarsier2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/tarsier2/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/vit/layer_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/vit/layer_weights/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_weights/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/vit/triton_kernel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/triton_kernel/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/vit/triton_kernel/gelu_vit.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import triton
3 | import triton.language as tl
4 | from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
5 |
6 |
7 | @triton.jit
8 | def gelu(x):
9 | x_fp32 = x.to(tl.float32)
10 | x_gelu = 0.5 * x_fp32 * (1 + tl.math.erf(x_fp32 * 0.7071067811))
11 | return x_gelu
12 |
13 |
14 | @triton.jit
15 | def gelu_kernel(output_ptr, input_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
16 | pid = tl.program_id(axis=0)
17 | block_start = pid * BLOCK_SIZE
18 | offsets = block_start + tl.arange(0, BLOCK_SIZE)
19 | mask = offsets < n_elements
20 | input = tl.load(input_ptr + offsets, mask=mask)
21 | output = gelu(input)
22 | tl.store(output_ptr + offsets, output, mask=mask)
23 |
24 |
25 | def gelu_fwd(input, use_custom_tensor_mananger=False):
26 | if use_custom_tensor_mananger:
27 | shape = input.shape
28 | dtype = input.dtype
29 | device = input.device
30 | output = g_cache_manager.alloc_tensor(shape, dtype, device=device)
31 | else:
32 | output = torch.empty_like(input)
33 | assert input.is_contiguous(), "Input tensor must be contiguous"
34 | n_elements = input.numel()
35 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
36 | gelu_kernel[grid](output, input, n_elements, BLOCK_SIZE=1024)
37 | return output
38 |
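For reference, a minimal smoke test of `gelu_fwd` above. It assumes a CUDA device with Triton available and that the `lightllm` package imports cleanly outside a running server; the exact (erf-based) `torch.nn.functional.gelu` serves as the reference.

```python
import torch
from lightllm.models.vit.triton_kernel.gelu_vit import gelu_fwd

x = torch.randn(4, 1024, dtype=torch.float16, device="cuda")
y = gelu_fwd(x)  # default path: output allocated with torch.empty_like
ref = torch.nn.functional.gelu(x.float()).to(x.dtype)  # exact (erf) GELU reference
assert torch.allclose(y, ref, atol=1e-2, rtol=0)
```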
--------------------------------------------------------------------------------
/lightllm/models/whisper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/whisper/__init__.py
--------------------------------------------------------------------------------
/lightllm/models/whisper/defaults.py:
--------------------------------------------------------------------------------
 1 | MIN_AUDIO_LEN = 480  # minimum audio length
2 |
--------------------------------------------------------------------------------
/lightllm/server/__init__.py:
--------------------------------------------------------------------------------
1 | from .router.token_load import TokenLoad
2 |
--------------------------------------------------------------------------------
/lightllm/server/api_server.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .api_cli import make_argument_parser
3 |
4 | if __name__ == "__main__":
 5 |     torch.multiprocessing.set_start_method("spawn")  # "spawn" is required here; forking subprocesses is not safe in this setting
6 | parser = make_argument_parser()
7 | args = parser.parse_args()
8 | from .api_start import pd_master_start, normal_or_p_d_start, config_server_start
9 |
10 | if args.run_mode == "pd_master":
11 | pd_master_start(args)
12 | elif args.run_mode == "config_server":
13 | config_server_start(args)
14 | else:
15 | normal_or_p_d_start(args)
16 |
--------------------------------------------------------------------------------
/lightllm/server/audioserver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/audioserver/model_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/model_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/config_server/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This module implements a configuration service designed to facilitate the
3 | registration and retrieval of information in a PD separation mode. It
4 | allows various nodes to register their own information and query global
5 | configuration details efficiently.
6 |
7 | Key Features:
8 | - Node registration: Enables nodes to register their specific information.
9 | - Global configuration query: Provides mechanisms for querying shared
10 | configuration data across the system.
11 | - Designed for distributed systems operating in PD separation mode.
12 | """
13 |
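To illustrate the register/query idea described in the docstring, here is a toy, in-process sketch. The names `register_node` and `query_global_config` are hypothetical and are not part of lightllm's config_server API; the real service runs as a separate process.

```python
from typing import Dict

_nodes: Dict[str, dict] = {}              # node_id -> registered node info (hypothetical)
_global_config: dict = {"pd_mode": True}  # shared configuration visible to all nodes

def register_node(node_id: str, info: dict) -> None:
    _nodes[node_id] = info

def query_global_config() -> dict:
    return dict(_global_config)

register_node("prefill-0", {"host": "10.0.0.1", "port": 8019})
assert "pd_mode" in query_global_config()
```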
--------------------------------------------------------------------------------
/lightllm/server/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/core/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/core/objs/__init__.py:
--------------------------------------------------------------------------------
1 | from .sampling_params import SamplingParams
2 | from .req import Req, FinishStatus
3 | from .shm_req_manager import ShmReqManager
4 | from .rpc_shm import RpcShmParams, RpcShmResults, ShmSyncStatusArray
5 |
--------------------------------------------------------------------------------
/lightllm/server/core/objs/io_objs/__init__.py:
--------------------------------------------------------------------------------
1 | from .group_req import GroupReqIndexes, GroupReqObjs
2 |
--------------------------------------------------------------------------------
/lightllm/server/core/objs/io_objs/group_req.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from lightllm.server.multimodal_params import MultimodalParams
3 | from typing import List
4 | from ..req import Req
5 |
6 |
7 | @dataclass
8 | class GroupReqIndexes:
9 | group_req_id: int
10 | multimodal_params: MultimodalParams
11 | shm_req_indexes: List[int]
12 | time_mark: float
13 |
14 |
15 | @dataclass
16 | class GroupReqObjs:
17 | group_req_id: int
18 | multimodal_params: MultimodalParams
19 | shm_req_objs: List[Req]
20 | time_mark: float
21 |
22 | def to_group_req_index(self):
23 | return GroupReqIndexes(
24 | group_req_id=self.group_req_id,
25 | multimodal_params=self.multimodal_params,
26 | shm_req_indexes=[req.index_in_shm_mem for req in self.shm_req_objs],
27 | time_mark=self.time_mark,
28 | )
29 |
--------------------------------------------------------------------------------
/lightllm/server/detokenization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/detokenization/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/detokenization/decode_mode_fix.py:
--------------------------------------------------------------------------------
1 | """
 2 | In PD (prefill/decode) separation mode, the last id of an arriving request's prompt_ids must be
 3 | processed in advance and moved into the outputs. This is the special handling done on the decode node in PD separation mode.
4 | """
5 | from .decode_req import DecodeReq
6 | from .decode import decode_token
7 |
8 | from lightllm.utils.log_utils import init_logger
9 |
10 | logger = init_logger(__name__)
11 |
12 |
13 | def decode_mode_fix(req_out: DecodeReq, tokenizer, eos_id):
14 | new_token_id = req_out.prompt_ids[-1]
15 | decode_token(tokenizer, req_out, new_token_id, eos_id)
16 | return req_out
17 |
--------------------------------------------------------------------------------
/lightllm/server/embed_cache/__init__.py:
--------------------------------------------------------------------------------
1 | from . import impl
--------------------------------------------------------------------------------
/lightllm/server/embed_cache/impl/__init__.py:
--------------------------------------------------------------------------------
1 | from . import naive_memory_cache
--------------------------------------------------------------------------------
/lightllm/server/embed_cache/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from io import BytesIO
4 | import multiprocessing.shared_memory as shm
5 |
6 |
7 | def tensor2bytes(t: torch.Tensor):
8 | # t = t.cpu().numpy().tobytes()
9 | # return t
10 | buf = BytesIO()
11 | torch.save(t.detach().cpu(), buf)
12 | buf.seek(0)
13 | return buf.read()
14 |
15 |
16 | def bytes2tensor(b):
17 | # return torch.from_numpy(np.frombuffer(b, dtype=np.float16)).cuda()
18 | return torch.load(BytesIO(b))
19 |
20 |
21 | def create_shm(name, data):
22 | try:
23 | data_size = len(data)
24 | shared_memory = shm.SharedMemory(name=name, create=True, size=data_size)
25 | mem_view = shared_memory.buf
26 | mem_view[:data_size] = data
27 | except FileExistsError:
28 |         print("Warning: failed to create shm {} because it already exists (FileExistsError)!".format(name))
29 |
30 |
31 | def read_shm(name):
32 | shared_memory = shm.SharedMemory(name=name)
33 | data = shared_memory.buf.tobytes()
34 | return data
35 |
36 |
37 | def free_shm(name):
38 | shared_memory = shm.SharedMemory(name=name)
39 | shared_memory.close()
40 | shared_memory.unlink()
41 |
42 |
43 | def get_shm_name_data(uid):
44 | return str(uid) + "-data"
45 |
46 |
47 | def get_shm_name_embed(uid):
48 | return str(uid) + "-embed"
49 |
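A minimal round-trip sketch of the helpers above, assuming a Linux host (the shared-memory segment size then matches the payload exactly); the uid `12345` is arbitrary.

```python
import torch
from lightllm.server.embed_cache.utils import (
    tensor2bytes, bytes2tensor, create_shm, read_shm, free_shm, get_shm_name_embed,
)

emb = torch.randn(4, 8)              # tensor2bytes detaches and moves to CPU internally
payload = tensor2bytes(emb)
name = get_shm_name_embed(12345)     # "12345-embed"
create_shm(name, payload)            # write the bytes into a shared-memory segment
restored = bytes2tensor(read_shm(name))
assert torch.equal(emb, restored)
free_shm(name)                       # close and unlink the segment
```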
--------------------------------------------------------------------------------
/lightllm/server/health_monitor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/health_monitor/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/httpserver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/httpserver/async_queue.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 |
4 | class AsyncQueue:
5 | def __init__(self):
6 | self.datas = []
7 | self.event = asyncio.Event()
8 | self.lock = asyncio.Lock()
9 |
10 | async def wait_to_ready(self):
11 | try:
12 | await asyncio.wait_for(self.event.wait(), timeout=3)
13 | except asyncio.TimeoutError:
14 | pass
15 |
16 | async def get_all_data(self):
17 | async with self.lock:
18 | self.event.clear()
19 | ans = self.datas
20 | self.datas = []
21 | return ans
22 |
23 | async def put(self, obj):
24 | async with self.lock:
25 | self.datas.append(obj)
26 | self.event.set()
27 | return
28 |
29 | async def wait_to_get_all_data(self):
30 | await self.wait_to_ready()
31 | handle_list = await self.get_all_data()
32 | return handle_list
33 |
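A short usage sketch of `AsyncQueue`: producers `put` items, a consumer drains everything at once; on an empty queue the consumer returns an empty list after the internal ~3 s wait.

```python
import asyncio
from lightllm.server.httpserver.async_queue import AsyncQueue

async def main():
    q = AsyncQueue()
    await q.put("token-1")
    await q.put("token-2")
    print(await q.wait_to_get_all_data())  # ['token-1', 'token-2'] (event already set)
    print(await q.wait_to_get_all_data())  # [] after the ~3 s timeout

asyncio.run(main())
```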
--------------------------------------------------------------------------------
/lightllm/server/httpserver_for_pd_master/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver_for_pd_master/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .manager import start_metric_manager
2 |
--------------------------------------------------------------------------------
/lightllm/server/router/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/dynamic_prompt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/dynamic_prompt/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/__init__.py:
--------------------------------------------------------------------------------
1 | from .continues_batch.impl import ContinuesBatchBackend
2 | from .continues_batch.impl_for_return_all_prompt_logprobs import ReturnPromptLogProbBackend
3 | from .continues_batch.impl_for_reward_model import RewardModelBackend
4 | from .chunked_prefill.impl import ChunkedPrefillBackend
5 | from .diverse_backend.impl import DiversehBackend
6 | from .chunked_prefill.impl_for_token_healing import TokenHealingBackend
7 | from .chunked_prefill.impl_for_outlines_constraint_mode import OutlinesConstraintBackend
8 | from .chunked_prefill.impl_for_first_token_constraint_mode import FirstTokenConstraintBackend
9 | from .dp_backend.impl import DPChunkedPrefillBackend
10 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl import ChunckedPrefillForPrefillNode
11 | from .continues_batch.pd_mode.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode
12 | from .chunked_prefill.impl_for_xgrammar_mode import XgrammarBackend
13 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl_for_dp_chuncked import DPChunkedForPrefillNode
14 | from .continues_batch.pd_mode.decode_node_impl.decode_impl_for_dp import DPForDecodeNode
15 |
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/__init__.py:
--------------------------------------------------------------------------------
1 | from .decode_kv_move_manager import start_decode_kv_move_manager_process
2 | from .decode_trans_process import start_decode_trans_process
3 |
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_task_cache.py:
--------------------------------------------------------------------------------
 1 | # This module declares global variables used by the inference process to cache the data of KV move
 2 | # tasks sent to other processes. The goal is to reduce serialization overhead: some calls only need to
 3 | # pass a request id instead of the full data, which speeds up rpyc calls. Only used in decode_impl.py and decode_infer_rpyc.py.
4 | from typing import Dict, List, Tuple
5 | from lightllm.server.pd_io_struct import KVMoveTask
6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode
7 |
8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, List[int]]] = {}
9 |
10 | g_success_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, float]] = {}  # the third element (float) is a timestamp used to decide expiration.
11 |
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/__init__.py:
--------------------------------------------------------------------------------
1 | from .prefill_trans_process import start_prefill_trans_process
2 | from .prefill_kv_move_manager import start_prefill_kv_move_manager_process
3 |
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_task_cache.py:
--------------------------------------------------------------------------------
 1 | # This module declares global variables used by the inference process to cache the data of KV move
 2 | # tasks sent to other processes. The goal is to reduce serialization overhead: some calls only need to
 3 | # pass a request id instead of the full data, which speeds up rpyc calls. Only used in prefill_impl.py and prefill_infer_rpyc.py.
4 | from typing import Dict, Tuple
5 | from lightllm.server.pd_io_struct import KVMoveTask
6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode
7 |
8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode]] = {}
9 |
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/utils.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import torch.multiprocessing as mp
3 | from queue import Empty
4 |
5 |
6 | def join_if_alive(thread: threading.Thread):
7 | if thread is not None and thread.is_alive():
8 | try:
9 | thread.join()
10 | except Exception:
11 | pass
12 | return
13 |
14 |
15 | def clear_queue(queue: mp.Queue):
16 | while not queue.empty():
17 | try:
18 | queue.get_nowait()
19 | except Empty:
20 | break
21 |
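A small sketch of the two helpers above, assuming the surrounding backend packages import cleanly; the short sleep only gives the multiprocessing feeder thread time to flush before the queue is drained.

```python
import time
import threading
import torch.multiprocessing as mp
from lightllm.server.router.model_infer.mode_backend.continues_batch.pd_mode.utils import (
    join_if_alive,
    clear_queue,
)

t = threading.Thread(target=lambda: None)
t.start()
join_if_alive(t)   # joins a live thread; None or an already-finished thread is a no-op

q = mp.Queue()
q.put("task-1")
q.put("task-2")
time.sleep(0.1)    # let the queue's feeder thread flush the items
clear_queue(q)     # drains everything currently visible in the queue
```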
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/req_queue/chunked_prefill/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/chunked_prefill/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/router/req_queue/continues_batch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/continues_batch/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/visualserver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/__init__.py
--------------------------------------------------------------------------------
/lightllm/server/visualserver/model_infer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/model_infer/__init__.py
--------------------------------------------------------------------------------
/lightllm/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/utils/__init__.py
--------------------------------------------------------------------------------
/lightllm/utils/error_utils.py:
--------------------------------------------------------------------------------
1 | class ServerBusyError(Exception):
2 | """Custom exception for server busy/overload situations"""
3 |
4 | def __init__(self, message="Server is busy, please try again later", status_code=503):
5 | """
6 | Initialize the ServerBusyError
7 |
8 | Args:
9 | message (str): Error message to display
10 | status_code (int): HTTP status code (default 503 Service Unavailable)
11 | """
12 | super().__init__(message)
13 | self.message = message
14 | self.status_code = status_code # HTTP 503 Service Unavailable
15 |
16 | def __str__(self):
17 | """String representation of the error"""
18 | return f"{self.message} (Status code: {self.status_code})"
19 |
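A brief usage sketch of `ServerBusyError`, showing the default message and status code:

```python
from lightllm.utils.error_utils import ServerBusyError

try:
    raise ServerBusyError()
except ServerBusyError as e:
    print(e)              # Server is busy, please try again later (Status code: 503)
    print(e.status_code)  # 503
```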
--------------------------------------------------------------------------------
/lightllm/utils/graceful_utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from lightllm.utils.log_utils import init_logger
3 |
4 | logger = init_logger(__name__)
5 |
6 |
7 | def graceful_registry(sub_module_name):
8 | import signal
9 |
10 |     # When a child process receives SIGTERM, it must not exit early on its own.
11 | def graceful_shutdown(signum, frame):
12 | logger.info(f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown...")
13 | if signum == signal.SIGTERM:
14 |             # Do not exit here; the parent process decides when to exit.
15 |             logger.info(f"{sub_module_name} received SIGTERM")
16 |
17 | signal.signal(signal.SIGTERM, graceful_shutdown)
18 | return
19 |
--------------------------------------------------------------------------------
/lightllm/utils/light_utils.py:
--------------------------------------------------------------------------------
1 | from lightllm.utils.log_utils import init_logger
2 |
3 | logger = init_logger(__name__)
4 | try:
5 | # TODO: lightllm_kernel release
6 | import lightllm_kernel
7 |
8 | light_ops = getattr(lightllm_kernel, "ops", lightllm_kernel)
9 | HAS_LIGHTLLM_KERNEL = True
10 | except:
11 | light_ops = None
12 | HAS_LIGHTLLM_KERNEL = False
13 |     logger.warning("lightllm_kernel is not installed, so its APIs are unavailable.")
14 |
--------------------------------------------------------------------------------
/lightllm/utils/retry_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import functools
3 | from lightllm.utils.log_utils import init_logger
4 |
5 | logger = init_logger(__name__)
6 |
7 |
8 | def retry(max_attempts=3, wait_time=1):
9 | """
10 |     The decorated function must raise an exception itself when a call fails.
11 |     :param max_attempts: maximum number of attempts
12 |     :param wait_time: wait time between retries (seconds)
13 | """
14 |
15 | def decorator(func):
16 | @functools.wraps(func)
17 | def wrapper(*args, **kwargs):
18 | attempts = 0
19 | while attempts < max_attempts:
20 | try:
21 | return func(*args, **kwargs)
22 | except Exception as e:
23 | attempts += 1
24 |                 logger.info(f"{func.__name__} attempt {attempts}/{max_attempts} failed: {str(e)}")
25 | if attempts < max_attempts:
26 | time.sleep(wait_time)
27 |             raise Exception(f"{func.__name__} failed after {max_attempts} attempts")
28 |
29 | return wrapper
30 |
31 | return decorator
32 |
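A usage sketch of the `retry` decorator above; the `flaky` function and its counter are illustrative only.

```python
from lightllm.utils.retry_utils import retry

calls = {"n": 0}

@retry(max_attempts=3, wait_time=0.1)
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

print(flaky())  # succeeds on the third attempt; would raise after 3 consecutive failures
```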
--------------------------------------------------------------------------------
/lightllm/utils/sgl_utils.py:
--------------------------------------------------------------------------------
1 | from lightllm.utils.log_utils import init_logger
2 |
3 | logger = init_logger(__name__)
4 | try:
5 | import sgl_kernel
6 |
7 | sgl_ops = sgl_kernel
8 | sgl_allreduce_ops = sgl_ops.allreduce
9 | HAS_SGL_KERNEL = True
10 | except:
11 | sgl_ops = None
12 | sgl_allreduce_ops = None
13 | HAS_SGL_KERNEL = False
14 | logger.warning(
15 |         "sgl_kernel is not installed, so its APIs are unavailable. \
16 |         You can fix this by running `pip install sgl_kernel`."
17 | )
18 |
19 | try:
20 | from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
21 |
22 | flash_attn_varlen_func = flash_attn_varlen_func
23 | flash_attn_with_kvcache = flash_attn_with_kvcache
24 | merge_state_v2 = sgl_ops.merge_state_v2
25 | except:
26 | flash_attn_varlen_func = None
27 | flash_attn_with_kvcache = None
28 | merge_state_v2 = None
29 | logger.warning(
30 |         "sgl_kernel is not installed, or the installed version does not support fa3. \
31 |         Try upgrading it."
32 | )
33 |
--------------------------------------------------------------------------------
/lightllm/utils/statics_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from lightllm.utils.log_utils import init_logger
3 |
4 | logger = init_logger(__name__)
5 |
6 |
7 | class MovingAverage:
8 | def __init__(self):
9 | self.total = 0.0
10 | self.count = 0
11 | self.last_time = time.time()
12 |
13 | def add(self, value):
14 | self.total += value
15 | self.count += 1
16 |
17 | def average(self):
18 | if self.count == 0:
19 | return 0.0
20 | return self.total / self.count
21 |
22 | def print_log(self, log_str):
23 | if time.time() - self.last_time >= 30:
24 | logger.info(f"{log_str}: {self.average()} ms")
25 | self.last_time = time.time()
26 |
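A usage sketch of `MovingAverage`; the sample values are arbitrary.

```python
from lightllm.utils.statics_utils import MovingAverage

avg = MovingAverage()
for cost_ms in (12.0, 8.0, 10.0):
    avg.add(cost_ms)
print(avg.average())          # 10.0
avg.print_log("decode cost")  # logs at most once every 30 seconds
```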
--------------------------------------------------------------------------------
/lightllm/utils/time_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 |
4 | class TimeChecker:
5 | def __init__(self, threshold):
6 | self.threshold = threshold
7 | self.last_checked = time.time()
8 |
9 | def has_exceeded(self):
10 | current_time = time.time()
11 | if (current_time - self.last_checked) > self.threshold:
12 | self._reset()
13 | return True
14 | return False
15 |
16 | def _reset(self):
17 | self.last_checked = time.time()
18 |
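A usage sketch of `TimeChecker` with a 1-second threshold:

```python
import time
from lightllm.utils.time_utils import TimeChecker

checker = TimeChecker(threshold=1.0)  # seconds
print(checker.has_exceeded())         # False: just created
time.sleep(1.1)
print(checker.has_exceeded())         # True, and the internal timer resets
```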
--------------------------------------------------------------------------------
/lightllm/utils/vllm_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from lightllm.utils.log_utils import init_logger
3 |
4 | logger = init_logger(__name__)
5 | try:
6 | if not torch.cuda.is_initialized():
7 | torch.cuda.init()
8 | from vllm import _custom_ops as ops
9 |
10 | vllm_ops = ops
11 | HAS_VLLM = True
12 | cutlass_scaled_mm = torch.ops._C.cutlass_scaled_mm
13 |
14 | except:
15 | HAS_VLLM = False
16 | cutlass_scaled_mm = None
17 | vllm_ops = None
18 | logger.warning(
19 |         "vllm is not installed, so its APIs are unavailable. \
20 |         You can fix this by running `pip install vllm`."
21 | )
22 |
--------------------------------------------------------------------------------
/lightllm/utils/watchdog_utils.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import time
3 | from lightllm.utils.log_utils import init_logger
4 |
5 | logger = init_logger(__name__)
6 |
7 |
8 | class Watchdog:
9 | def __init__(self, timeout):
10 | self.timeout = timeout
11 | self.last_heartbeat = time.time()
12 | self.running = True
13 |
14 | def start(self):
15 | self.thread = threading.Thread(target=self.run, daemon=True)
16 | self.thread.start()
17 |
18 | def run(self):
19 | while self.running:
20 | time.sleep(2)
21 | if time.time() - self.last_heartbeat > self.timeout:
22 | logger.error("Watchdog: Timeout! Task is not responding.")
23 | self.handle_timeout()
24 |
25 | def handle_timeout(self):
26 |         logger.error("Watchdog: timed out, exiting")
27 | import sys
28 |
29 | sys.exit(-1)
30 |
31 | def stop(self):
32 | self.running = False
33 | self.thread.join()
34 |
35 | def heartbeat(self):
36 | self.last_heartbeat = time.time()
37 |
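A usage sketch of `Watchdog`; the 10-second timeout and the heartbeat loop are illustrative.

```python
import time
from lightllm.utils.watchdog_utils import Watchdog

dog = Watchdog(timeout=10)  # seconds without a heartbeat before the watchdog exits the process
dog.start()
for _ in range(3):
    time.sleep(1)
    dog.heartbeat()         # call periodically from the monitored loop
dog.stop()
```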
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | package_data = {"lightllm": ["common/all_kernel_configs/*/*.json"]}
4 | setup(
5 | name="lightllm",
6 | version="1.0.1",
7 | packages=find_packages(exclude=("build", "include", "test", "dist", "docs", "benchmarks", "lightllm.egg-info")),
8 | author="model toolchain",
9 | author_email="",
10 |     description="lightllm for LLM inference",
11 | long_description="",
12 | long_description_content_type="text/markdown",
13 | url="",
14 | classifiers=[
15 | "Programming Language :: Python :: 3",
16 |         "Operating System :: POSIX :: Linux",
17 | ],
18 | python_requires=">=3.9.16",
19 | install_requires=[
20 | "pyzmq",
21 | "uvloop",
22 | "transformers",
23 | "einops",
24 | "packaging",
25 | "rpyc",
26 | "ninja",
27 | "safetensors",
28 | "triton",
29 | ],
30 | package_data=package_data,
31 | )
32 |
--------------------------------------------------------------------------------
/test/model/test_settings/process_utils.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import re
3 |
4 |
5 | def kill_gpu_processes():
6 | try:
7 | output = subprocess.check_output(["nvidia-smi", "-q", "-x"])
8 | output = output.decode("utf-8")
9 |
10 |         # extract process info using regular expressions
11 |         process_info = re.findall(r"<process_info>(.*?)</process_info>", output, re.DOTALL)
12 |
13 |         if process_info:
14 |             print("Found the following processes using the GPU:")
15 |             for info in process_info:
16 |                 pid = re.search(r"<pid>(.*?)</pid>", info).group(1)
17 |                 process_name = re.search(r"<process_name>(.*?)</process_name>", info).group(1)
18 |                 print("process id:", pid)
19 |                 print("process name:", process_name)
20 |
21 |             for info in process_info:
22 |                 pid = re.search(r"<pid>(.*?)</pid>", info).group(1)
23 |                 subprocess.call(["sudo", "kill", "-9", pid])
24 |                 print("process id", pid, "was killed")
25 |         else:
26 |             print("No processes using the GPU were found")
27 |
28 |     except subprocess.CalledProcessError:
29 |         print("Failed to run the nvidia-smi command")
30 |
31 |
32 | if __name__ == "__main__":
33 | kill_gpu_processes()
34 |
--------------------------------------------------------------------------------
/test/server/readme.md:
--------------------------------------------------------------------------------
 1 | # prompt cache test:
2 |
 3 | - benchmark_prompt_cache.py: single-run test script.
4 |
 5 | Example:
6 | ```shell
7 | python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama --num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1
8 | ```
9 |
10 | Detailed usage:
11 | ```shell
12 | python benchmark_prompt_cache.py -h
13 | ```
14 |
15 | - test_settings.py: batch test script; runs multiple configurations and summarizes the results as markdown.
16 |
--------------------------------------------------------------------------------
/test/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/test/test.jpg
--------------------------------------------------------------------------------
/test/test_server.py:
--------------------------------------------------------------------------------
1 | import time
2 | import requests
3 | import json
4 | import threading
5 |
6 |
7 | class RequestThread(threading.Thread):
8 | def __init__(self, url, headers, data):
9 | threading.Thread.__init__(self)
10 | self.url = url
11 | self.headers = headers
12 | self.data = data
13 |
14 | def run(self):
15 | response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data))
16 | if response.status_code == 200:
17 | print(response.json())
18 | else:
19 | print("Error:", response.status_code, response.text)
20 |
21 |
22 | url = "http://localhost:8000/generate"
23 | headers = {"Content-Type": "application/json"}
24 |
25 | for i in range(1):
26 | data = {
27 | "inputs": "San Francisco is a",
28 | # 'temperature': 0.1,
29 | "parameters": {
30 | "do_sample": False,
31 | },
32 | }
33 | thread = RequestThread(url, headers, data)
34 | thread.start()
35 |
36 | time.sleep(2)
37 |
38 | for i in range(20):
39 | data = {
40 | "inputs": "San Francisco is a",
41 | "parameters": {
42 | "do_sample": False,
43 | "ignore_eos": True,
44 | "max_new_tokens": 200,
45 | },
46 | }
47 | thread = RequestThread(url, headers, data)
48 | thread.start()
49 |
--------------------------------------------------------------------------------
/tools/resolve_ptx_version:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script is used to make old version triton work on generating ptx code up to version 7.8
3 | # See https://github.com/openai/triton/blob/8650b4d1cbc750d659156e2c17a058736614827b/lib/driver/llvm.cc#L149
4 | set -e
5 |
6 | mkdir -p $HOME/.triton/
7 |
8 | [ $HOME/.triton/resolve_ptx_version.so -nt $0 ] || (echo '
 9 | #include <stdexcept>
10 | namespace triton {
11 | namespace driver {
12 |
13 | int vptx(int version) {
14 | // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
15 | if (version >= 11080) return 78;
16 | if (version >= 11070) return 77;
17 | if (version >= 11060) return 76;
18 | if (version >= 11050) return 75;
19 | if (version >= 11040) return 74;
20 | throw std::runtime_error("Triton requires CUDA 11.4+");
21 | }
22 |
23 | }
24 | }' \
25 | | g++ -x c++ -fPIC -shared -o $HOME/.triton/resolve_ptx_version.so -)
26 |
27 | [ -z "$*" ] || env LD_PRELOAD=$LD_PRELOAD:$HOME/.triton/resolve_ptx_version.so "$@"
--------------------------------------------------------------------------------
/unit_tests/common/basemodel/triton_kernel/test_add_in_place.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import pytest
4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy
5 | from lightllm.common.basemodel.triton_kernel.add_in_place import add_in_place
6 | from lightllm.utils.log_utils import init_logger
7 |
8 | logger = init_logger(__name__)
9 |
10 |
11 | @pytest.mark.parametrize(
12 | "dim1, dim2, alpha",
13 | [
14 | (dim1, dim2, alpha)
15 | for dim1 in range(1, 1024, 100)
16 | for dim2 in range(1, 1024, 100)
17 |         for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]
18 | ],
19 | )
20 | def test_add_in_place(dim1, dim2, alpha):
21 | input = torch.rand((dim1, dim2), device="cuda")
22 | other = torch.rand((dim1, dim2), device="cuda")
23 |
24 | output = input + other * alpha
25 | add_in_place(input, other, alpha=alpha)
26 | rlt = torch.allclose(input, output, atol=1e-5, rtol=0)
27 | assert rlt
28 |
29 |
30 | if __name__ == "__main__":
31 | pytest.main()
32 |
--------------------------------------------------------------------------------
/unit_tests/common/basemodel/triton_kernel/test_gen_decode_params.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytest
3 | import numpy as np
4 | from lightllm.utils.log_utils import init_logger
5 | from lightllm.common.basemodel.triton_kernel.gen_decode_params import gen_decode_params
6 |
7 |
8 | def test_gen_decode_params_basic():
9 | b_seq_len = torch.ones((9,), dtype=torch.int64, device="cuda") * 8192
10 | (
11 | b_q_seq_len,
12 | b1_cu_q_seq_len,
13 | b_kv_seq_len,
14 | b1_cu_kv_seq_len,
15 | position_ids,
16 | max_q_seq_len,
17 | max_kv_seq_len,
18 | ) = gen_decode_params(b_seq_len)
19 |
20 | true_b_q_seq_len = torch.ones_like(b_seq_len)
21 |
22 |
23 | assert max_q_seq_len == 1
24 | assert max_kv_seq_len == b_seq_len.max().item()
25 | assert torch.equal(b_q_seq_len, true_b_q_seq_len)
26 | assert torch.equal(b1_cu_q_seq_len, torch.nn.functional.pad(torch.cumsum(true_b_q_seq_len, dim=0), (1, 0), value=0))
27 | assert torch.equal(b_kv_seq_len, b_seq_len)
28 | assert torch.equal(b1_cu_kv_seq_len, torch.nn.functional.pad(torch.cumsum(b_seq_len, dim=0), (1, 0), value=0))
29 | assert torch.equal(position_ids, b_seq_len - 1)
30 |
31 |
32 | if __name__ == "__main__":
33 | pytest.main()
34 |
--------------------------------------------------------------------------------
/unit_tests/common/basemodel/triton_kernel/test_sp_pad_kernel.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import pytest
4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy
5 | from lightllm.utils.log_utils import init_logger
6 |
7 | logger = init_logger(__name__)
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "token_num, hidden_dim, sp_world_size",
12 | [
13 | (token_num, hidden_dim, sp_world_size)
14 | for token_num in range(3, 6)
15 | for hidden_dim in [257, 2048]
16 | for sp_world_size in range(2, 5)
17 | ],
18 | )
19 | def test_sp_pad_copy(token_num, hidden_dim, sp_world_size):
20 |
21 | in_tensor = torch.randn((token_num, hidden_dim), dtype=torch.float16, device="cuda")
22 | out_tensors = [
23 | sp_pad_copy(in_tensor=in_tensor, sp_rank_id=rank_id, sp_world_size=sp_world_size)
24 | for rank_id in range(sp_world_size)
25 | ]
26 | out_tensor = torch.cat(out_tensors, dim=0)
27 | assert torch.equal(in_tensor, out_tensor[0:token_num, :])
28 |
29 |
30 | if __name__ == "__main__":
31 | pytest.main()
32 |
--------------------------------------------------------------------------------
/unit_tests/models/deepseek2/test_rope_repeat.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import pytest
4 | from lightllm.models.deepseek2.triton_kernel.repeat_rope import repeat_rope
5 |
6 |
7 | def test_torch_cat():
8 | source = torch.randn((100, 1, 1077), device="cuda")
9 | dest = torch.randn((100, 7, 1077), device="cuda")
10 |
11 | repeat_rope(dest, source)
12 |     assert torch.equal(dest[:, 0, :], source[:, 0, :])
13 |     assert torch.equal(dest[:, -1, :], source[:, 0, :])
14 |
15 | source = torch.randn((100, 1, 128), device="cuda")
16 | dest = torch.randn((100, 64, 128), device="cuda")
17 |
18 | repeat_rope(dest, source)
19 |     assert torch.equal(dest[:, 0, :], source[:, 0, :])
20 |     assert torch.equal(dest[:, -1, :], source[:, 0, :])
21 | return
22 |
23 |
24 | if __name__ == "__main__":
25 | pytest.main()
26 |
--------------------------------------------------------------------------------
/unit_tests/utils/test_custom_kernel_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import time
3 | import pytest
4 | from lightllm.utils.custom_kernel_utis import torch_cat_3
5 |
6 |
7 | def test_torch_cat():
8 | a = torch.tensor([[[1, 2], [3, 4]]], device="cuda")
9 | b = torch.tensor([[[5, 6], [7, 8]]], device="cuda")
10 | c = torch_cat_3([a, b], dim=0)
11 |     assert torch.equal(torch.cat((a, b), dim=0), c)
12 |
13 | d = torch_cat_3([a, b], dim=1)
14 |     assert torch.equal(torch.cat((a, b), dim=1), d)
15 |
16 | e = torch_cat_3([a, b], dim=-1)
17 |     assert torch.equal(torch.cat((a, b), dim=-1), e)
18 |
19 | empty = torch.empty((0, 2), device="cuda")
20 | torch_cat_3([a, empty, b], dim=0)
21 | return
22 |
23 |
24 | if __name__ == "__main__":
25 | pytest.main()
26 |
--------------------------------------------------------------------------------