├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── docker-publish.yml │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── att.gif ├── lightllm.drawio.png └── logo.png ├── benchmark.md ├── build_and_upload_docker.sh ├── demos ├── qa_server │ ├── __init__.py │ ├── chat_server.py │ ├── qabot.py │ └── templates │ │ └── chat.html └── readme.txt ├── docs ├── CN │ ├── .readthedocs.yaml │ ├── Makefile │ ├── README.md │ ├── make.bat │ ├── rebuild.sh │ ├── requirements-docs.txt │ └── source │ │ ├── _static │ │ └── openapi.json │ │ ├── assets │ │ ├── lightllm │ │ │ ├── ER1.png │ │ │ ├── ER2.png │ │ │ ├── ER3.png │ │ │ ├── ER4.png │ │ │ ├── HttpServer.png │ │ │ ├── Performance.png │ │ │ ├── Performance2.png │ │ │ ├── Router.png │ │ │ ├── Visual_Server.png │ │ │ ├── arch.png │ │ │ ├── backend.png │ │ │ └── token_attn.gif │ │ └── logos │ │ │ └── lightllm-logo.png │ │ ├── conf.py │ │ ├── dev │ │ ├── router.rst │ │ └── token_attention.rst │ │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ │ ├── index.rst │ │ ├── lightllm │ │ ├── lightllm_impl.rst │ │ └── lightllm_intro.rst │ │ ├── models │ │ ├── add_new_model.md │ │ ├── supported_models.rst │ │ └── test.rst │ │ ├── server │ │ ├── api_server_args_zh.rst │ │ └── benchmark.rst │ │ └── user │ │ ├── api_param.rst │ │ └── openapi_docs.rst └── EN │ ├── .readthedocs.yaml │ ├── Makefile │ ├── README.md │ ├── make.bat │ ├── rebuild.sh │ ├── requirements-docs.txt │ └── source │ ├── _static │ └── openapi.json │ ├── assets │ ├── lightllm │ │ ├── ER1.png │ │ ├── ER2.png │ │ ├── ER3.png │ │ ├── ER4.png │ │ ├── HttpServer.png │ │ ├── Performance.png │ │ ├── Performance2.png │ │ ├── Router.png │ │ ├── Visual_Server.png │ │ ├── arch.png │ │ ├── backend.png │ │ └── token_attn.gif │ └── logos │ │ └── lightllm-logo.png │ ├── conf.py │ ├── dev │ ├── performance.rst │ ├── router.rst │ └── token_attention.rst │ ├── getting_started │ ├── faq.rst │ ├── installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── lightllm │ ├── lightllm_impl.rst │ └── lightllm_intro.rst │ ├── models │ ├── add_new_model.md │ ├── supported_models.rst │ └── test.rst │ ├── server │ ├── api_server_args.rst │ └── benchmark.rst │ └── user │ ├── api_param.rst │ └── openapi_docs.rst ├── format.py ├── format_out ├── __init__.py ├── grammer │ ├── __init__.py │ ├── core.py │ ├── dpda.py │ ├── json.ebnf │ ├── test.sh │ ├── test0.py │ ├── test1.py │ ├── test2.py │ ├── test3.py │ ├── test4.py │ ├── test5.py │ └── test6.py └── impl.py ├── lightllm ├── __init__.py ├── common │ ├── __init__.py │ ├── all_kernel_configs │ │ ├── __init__.py │ │ ├── bmm_scaled_fp8 │ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=128,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=256,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── 
{B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=128,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=256,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ ├── {B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json │ │ │ └── {B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ ├── fp8_block_mm │ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── 
{K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ ├── {K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json │ │ │ └── {K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ ├── grouped_moe_gemm_kernel │ │ │ ├── {K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json │ │ │ ├── {K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ │ ├── {K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json │ │ │ ├── {K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ │ ├── 
{K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json │ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json │ │ │ ├── {K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json │ │ │ └── {K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json │ │ ├── mla_decode_attentnion │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H200.json │ │ │ └── {out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json │ │ ├── moe_silu_and_mul_kernel │ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ │ ├── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json │ │ │ └── {N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json │ │ └── moe_sum_reduce_kernel │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ 
├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json │ │ │ ├── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json │ │ │ └── {hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json │ ├── basemodel │ │ ├── __init__.py │ │ ├── basemodel.py │ │ ├── cuda_graph.py │ │ ├── infer_lock.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── base_layer_infer.py │ │ │ ├── cache_tensor_manager.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ ├── template │ │ │ │ ├── __init__.py │ │ │ │ ├── post_layer_infer_template.py │ │ │ │ ├── pre_layer_infer_template.py │ │ │ │ ├── transformer_layer_infer_cohere_template.py │ │ │ │ └── transformer_layer_infer_template.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── base_layer_weight.py │ │ │ ├── hf_load_utils.py │ │ │ ├── meta_weights │ │ │ │ ├── __init__.py │ │ │ │ ├── base_weight.py │ │ │ │ ├── fused_moe_weight_ep.py │ │ │ │ ├── fused_moe_weight_ep_redundancy.py │ │ │ │ ├── fused_moe_weight_tp.py │ │ │ │ ├── mm_weight │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colmm_weight.py │ │ │ │ │ ├── mm_weight.py │ │ │ │ │ └── rowmm_weight.py │ │ │ │ └── norm_weight.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── microbatch_overlap_objs.py │ │ ├── multimodal_tokenizer.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── add_in_place.py │ │ │ ├── apply_penalty.py │ │ │ ├── bmm_scaled_fp8.py │ │ │ ├── copy_kv_index_to_req.py │ │ │ ├── dequantize_gemm_int4.py │ │ │ ├── dequantize_gemm_int8.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── gen_decode_params.py │ │ │ ├── gen_prefill_params.py │ │ │ ├── multimodal_emb.py │ │ │ ├── quantize_gemm_int8.py │ │ │ ├── redundancy_topk_ids_repair.py │ │ │ └── sp_pad_copy.py │ ├── build_utils.py │ ├── cuda_wrapper.py │ ├── deepseek2_fp8kv_mem_manager.py │ ├── deepseek2_mem_manager.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── deepep_scatter_gather.py │ │ ├── grouped_fused_moe.py │ │ ├── grouped_fused_moe_ep.py │ │ ├── grouped_topk.py │ │ ├── moe_kernel_configs.py │ │ ├── moe_silu_and_mul.py │ │ ├── moe_silu_and_mul_config.py │ │ ├── moe_silu_and_mul_mix_quant_ep.py │ │ ├── moe_sum_recude_config.py │ │ ├── moe_sum_reduce.py │ │ ├── softmax_topk.py │ │ └── topk_select.py │ ├── infer_utils.py │ ├── int8kv_mem_manager.py │ ├── kernel_config.py │ ├── kv_trans_kernel │ │ ├── __init__.py │ │ ├── kv_trans.py │ │ └── kv_trans_v2.py │ ├── mem_manager.py │ ├── mem_utils.py │ ├── ppl_int4kv_mem_manager.py │ ├── ppl_int8kv_mem_manager.py │ ├── quantization │ │ ├── __init__.py │ │ ├── configs │ │ │ └── llamacls-mix-down.yaml │ │ ├── deepgemm_quant.py │ │ ├── quantize_method.py │ │ ├── registry.py │ │ ├── torchao_quant.py │ │ ├── triton_quant │ │ │ ├── __init__.py │ │ │ ├── fp8 │ │ │ │ ├── __init__.py │ │ │ │ ├── fp8act_quant_kernel.py │ │ │ │ └── fp8w8a8_block_gemm_kernel.py │ │ │ └── triton_quant.py │ │ └── w8a8_quant.py │ └── req_manager.py ├── distributed │ ├── __init__.py │ ├── communication_op.py │ ├── custom_all_gather.py │ ├── custom_all_reduce.py │ ├── pynccl.py │ └── pynccl_wrapper.py ├── models │ ├── __init__.py │ ├── bloom │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── 
post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── hf_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── layernorm.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ ├── token_attention_nopad_softmax.py │ │ │ └── token_flashattention_nopad.py │ ├── chatglm2 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ └── rotary_emb.py │ ├── cohere │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernels │ │ │ ├── __init__.py │ │ │ ├── layernorm.py │ │ │ └── rotary_emb.py │ ├── deepseek2 │ │ ├── __init__.py │ │ ├── flashattention_infer_struct.py │ │ ├── flashinfer_struct.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── context_flashattention_nopad_fp8.py │ │ │ ├── context_flashattention_nopad_with_v.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── destindex_copy_kv_fp8.py │ │ │ ├── gqa_flash_decoding.py │ │ │ ├── gqa_flash_decoding_config.py │ │ │ ├── gqa_flash_decoding_fp8.py │ │ │ ├── gqa_flash_decoding_stage1.py │ │ │ ├── gqa_flash_decoding_stage1_fp8.py │ │ │ ├── gqa_flash_decoding_stage2.py │ │ │ ├── repack_kv_index.py │ │ │ ├── repeat_rope.py │ │ │ ├── rotary_emb.py │ │ │ ├── sample_kv.py │ │ │ └── weight_dequant.py │ ├── gemma3 │ │ ├── __init__.py │ │ ├── gemma3_visual.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── gemma_2b │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ └── gelu_and_mul.py │ ├── internlm │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── internlm2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── internlm2_reward │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── post_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ └── model.py │ ├── internvl │ │ ├── __init__.py │ │ ├── img_process.py │ │ ├── internvl_visual.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ 
│ └── model.py │ ├── llama │ │ ├── __init__.py │ │ ├── flashattention_infer_struct.py │ │ ├── flashinfer_struct.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── ds_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ ├── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── embedding.py │ │ │ ├── flash_decoding.py │ │ │ ├── flash_decoding_stage1.py │ │ │ ├── flash_decoding_stage2.py │ │ │ ├── gqa_decode_flashattention_nopad.py │ │ │ ├── gqa_flash_decoding.py │ │ │ ├── gqa_flash_decoding_stage1.py │ │ │ ├── gqa_flash_decoding_stage2.py │ │ │ ├── gqa_flash_decoding_vsm.py │ │ │ ├── ppl_fp16_flash_decoding.py │ │ │ ├── ppl_int4kv_copy_kv.py │ │ │ ├── ppl_int4kv_flash_decoding.py │ │ │ ├── ppl_int8kv_flash_decoding.py │ │ │ ├── ppl_quant_copy_kv.py │ │ │ ├── rmsnorm.py │ │ │ ├── rotary_emb.py │ │ │ ├── silu_and_mul.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ ├── token_attention_nopad_softmax.py │ │ │ └── token_attention_softmax_and_reducev.py │ │ └── yarn_rotary_utils.py │ ├── llava │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ ├── llava_visual.py │ │ └── model.py │ ├── minicpm │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── mistral │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── init_att_sliding_window_info.py │ │ │ ├── token_attention_nopad_att1.py │ │ │ ├── token_attention_nopad_reduceV.py │ │ │ └── token_attention_softmax_and_reducev.py │ ├── mixtral │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── _custom_ops.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── phi3 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── context_flashattention_nopad.py │ │ │ ├── destindex_copy_kv.py │ │ │ ├── flash_decoding.py │ │ │ ├── flash_decoding_stage1.py │ │ │ ├── flash_decoding_stage2.py │ │ │ └── rotary_emb.py │ ├── qwen │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen2_5_vl │ │ ├── __init__.py │ │ └── qwen2_5_visual.py │ ├── qwen2_reward │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── post_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ └── model.py │ ├── qwen2_vl │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ 
├── model.py │ │ ├── qwen2_visual.py │ │ ├── triton_kernel │ │ │ ├── __init__.py │ │ │ └── mrope.py │ │ └── vision_process.py │ ├── qwen3 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen3_moe │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── qwen_vl │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── pre_layer_infer.py │ │ ├── model.py │ │ └── qwen_visual.py │ ├── registry.py │ ├── stablelm │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── starcoder │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── starcoder2 │ │ ├── __init__.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ └── model.py │ ├── tarsier2 │ │ ├── __init__.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ └── pre_and_post_layer_weight.py │ │ ├── model.py │ │ └── tarsier2_visual.py │ ├── vit │ │ ├── __init__.py │ │ ├── infer_struct.py │ │ ├── layer_infer │ │ │ ├── __init__.py │ │ │ ├── post_layer_infer.py │ │ │ ├── pre_layer_infer.py │ │ │ └── transformer_layer_infer.py │ │ ├── layer_weights │ │ │ ├── __init__.py │ │ │ ├── hf_load_utils.py │ │ │ ├── pre_and_post_layer_weight.py │ │ │ └── transformer_layer_weight.py │ │ ├── model.py │ │ └── triton_kernel │ │ │ ├── __init__.py │ │ │ ├── flashattention_nopad.py │ │ │ ├── gelu_vit.py │ │ │ └── rms_norm_vit.py │ └── whisper │ │ ├── __init__.py │ │ ├── defaults.py │ │ ├── modeling_whisper.py │ │ └── whisper_audio.py ├── server │ ├── __init__.py │ ├── api_cli.py │ ├── api_http.py │ ├── api_lightllm.py │ ├── api_models.py │ ├── api_openai.py │ ├── api_server.py │ ├── api_start.py │ ├── api_tgi.py │ ├── audioserver │ │ ├── __init__.py │ │ ├── manager.py │ │ └── model_infer │ │ │ ├── __init__.py │ │ │ └── model_rpc.py │ ├── build_prompt.py │ ├── config_server │ │ ├── __init__.py │ │ ├── api_http.py │ │ └── nccl_tcp_store.py │ ├── core │ │ ├── __init__.py │ │ └── objs │ │ │ ├── __init__.py │ │ │ ├── atomic_array_lock.py │ │ │ ├── atomic_lock.py │ │ │ ├── io_objs │ │ │ ├── __init__.py │ │ │ └── group_req.py │ │ │ ├── out_token_circlequeue.py │ │ │ ├── py_sampling_params.py │ │ │ ├── req.py │ │ │ ├── rpc_shm.py │ │ │ ├── sampling_params.py │ │ │ ├── shm_array.py │ │ │ ├── shm_req_manager.py │ │ │ └── start_args_type.py │ ├── detokenization │ │ ├── __init__.py │ │ ├── decode.py │ │ ├── decode_mode_fix.py │ │ ├── decode_req.py │ │ └── manager.py │ ├── embed_cache │ │ ├── __init__.py │ │ ├── impl │ │ │ ├── __init__.py │ │ │ └── naive_memory_cache.py │ │ ├── interface.py │ │ ├── manager.py │ │ └── utils.py │ ├── function_call_parser.py │ ├── health_monitor │ │ ├── __init__.py │ │ └── manager.py │ ├── httpserver │ │ ├── __init__.py │ │ ├── async_queue.py │ │ ├── manager.py │ │ └── pd_loop.py │ ├── 
httpserver_for_pd_master │ │ ├── __init__.py │ │ ├── manager.py │ │ └── register_loop.py │ ├── metrics │ │ ├── __init__.py │ │ ├── manager.py │ │ └── metrics.py │ ├── multimodal_params.py │ ├── pd_io_struct.py │ ├── req_id_generator.py │ ├── router │ │ ├── __init__.py │ │ ├── batch.py │ │ ├── dynamic_prompt │ │ │ ├── __init__.py │ │ │ ├── radix_cache.py │ │ │ └── shared_arr.py │ │ ├── manager.py │ │ ├── model_infer │ │ │ ├── __init__.py │ │ │ ├── infer_batch.py │ │ │ ├── mode_backend │ │ │ │ ├── __init__.py │ │ │ │ ├── base_backend.py │ │ │ │ ├── chunked_prefill │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ ├── impl_for_first_token_constraint_mode.py │ │ │ │ │ ├── impl_for_outlines_constraint_mode.py │ │ │ │ │ ├── impl_for_token_healing.py │ │ │ │ │ └── impl_for_xgrammar_mode.py │ │ │ │ ├── continues_batch │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ ├── impl_for_return_all_prompt_logprobs.py │ │ │ │ │ ├── impl_for_reward_model.py │ │ │ │ │ └── pd_mode │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── decode_node_impl │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── decode_impl.py │ │ │ │ │ │ ├── decode_impl_for_dp.py │ │ │ │ │ │ ├── decode_infer_rpyc.py │ │ │ │ │ │ ├── decode_kv_move_manager.py │ │ │ │ │ │ ├── decode_task_cache.py │ │ │ │ │ │ ├── decode_trans_obj.py │ │ │ │ │ │ ├── decode_trans_process.py │ │ │ │ │ │ └── up_status.py │ │ │ │ │ │ ├── p2p_fix.py │ │ │ │ │ │ ├── prefill_node_impl │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── prefill_impl.py │ │ │ │ │ │ ├── prefill_impl_for_dp_chuncked.py │ │ │ │ │ │ ├── prefill_infer_rpyc.py │ │ │ │ │ │ ├── prefill_kv_move_manager.py │ │ │ │ │ │ ├── prefill_task_cache.py │ │ │ │ │ │ ├── prefill_trans_obj.py │ │ │ │ │ │ └── prefill_trans_process.py │ │ │ │ │ │ ├── task_queue.py │ │ │ │ │ │ └── utils.py │ │ │ │ ├── diverse_backend │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── impl.py │ │ │ │ ├── dp_backend │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── impl.py │ │ │ │ │ └── pre_process.py │ │ │ │ ├── generic_post_process.py │ │ │ │ ├── generic_pre_process.py │ │ │ │ └── redundancy_expert_manager.py │ │ │ └── model_rpc.py │ │ ├── pause_strategy.py │ │ ├── req_queue │ │ │ ├── __init__.py │ │ │ ├── base_queue.py │ │ │ ├── chunked_prefill │ │ │ │ ├── __init__.py │ │ │ │ ├── beam_impl.py │ │ │ │ ├── impl.py │ │ │ │ └── impl_for_pd_prefill.py │ │ │ ├── continues_batch │ │ │ │ ├── __init__.py │ │ │ │ ├── impl.py │ │ │ │ └── impl_for_pd_decode.py │ │ │ └── dp_base_queue.py │ │ ├── stats.py │ │ └── token_load.py │ ├── tokenizer.py │ └── visualserver │ │ ├── __init__.py │ │ ├── manager.py │ │ └── model_infer │ │ ├── __init__.py │ │ └── model_rpc.py └── utils │ ├── __init__.py │ ├── config_utils.py │ ├── custom_kernel_utis.py │ ├── device_utils.py │ ├── dist_utils.py │ ├── envs_utils.py │ ├── error_utils.py │ ├── graceful_utils.py │ ├── health_check.py │ ├── infer_utils.py │ ├── light_utils.py │ ├── log_utils.py │ ├── multimodal_utils.py │ ├── multinode_utils.py │ ├── net_utils.py │ ├── petrel_helper.py │ ├── process_check.py │ ├── profile_max_tokens.py │ ├── retry_utils.py │ ├── rpyc_fix_utils.py │ ├── sgl_utils.py │ ├── start_utils.py │ ├── statics_utils.py │ ├── time_utils.py │ ├── tuning_utils.py │ ├── vllm_utils.py │ └── watchdog_utils.py ├── requirements.txt ├── setup.py ├── test ├── benchmark_client.py ├── benchmark_mcq.py ├── benchmark_qps.py ├── benchmark_serving.py ├── deepseek.sh ├── format_out │ ├── gomoku_game.py │ ├── qabot.py │ ├── test_constraint_server.py │ ├── test_demo.py │ └── test_xgrammar_constraint.py ├── kernel │ ├── alignment │ │ 
└── llama_gqa_decode_vsm.py │ ├── deepseekv2_bmm_scaled_fp8_tuning.py │ ├── deepseekv3_fp8_block_gemm_tuning.py │ ├── fuse_moe_tuning_bf16.py │ ├── fuse_moe_tuning_fp8.py │ ├── moe_silu_and_mul_tuning_bf16.py │ ├── moe_sum_reduce_tuning_bf16.py │ └── tuning │ │ ├── deepseekv2_gqa_decode_tuning.py │ │ └── llama_gqa_decode_vsm_tuning.py ├── model │ ├── model_infer.py │ ├── model_infer_vit.py │ ├── test_model.py │ ├── test_script.sh │ └── test_settings │ │ ├── model_infer_batchs.py │ │ ├── process_utils.py │ │ └── test_settings.py ├── server │ ├── benchmark_prompt_cache.py │ ├── readme.md │ └── test_settings.py ├── test.jpg ├── test.sh ├── test_accuracy.py ├── test_constraint_server.py ├── test_function_call_api.py ├── test_multimodal_server.py ├── test_redundancy_expert_config.json └── test_server.py ├── tools ├── quick_launch_docker.py └── resolve_ptx_version └── unit_tests ├── common ├── basemodel │ └── triton_kernel │ │ ├── test_add_in_place.py │ │ ├── test_gen_decode_params.py │ │ ├── test_gen_prefill_params.py │ │ ├── test_redundancy_topk_ids_repair.py │ │ └── test_sp_pad_kernel.py ├── fused_moe │ ├── test_deepep.py │ ├── test_grouped_fused_moe.py │ ├── test_grouped_fused_moe_speed.py │ ├── test_grouped_topk.py │ ├── test_moe_silu_and_mul_mix_quant_ep.py │ └── test_softmax_topk.py └── kv_trans_kernel │ └── test_kv_trans_v2.py ├── models ├── deepseek2 │ ├── test_destindex_copy_kv.py │ ├── test_destindex_copy_kv_fp8.py │ ├── test_gqa_flash_decoding.py │ ├── test_gqa_flash_decoding_fp8.py │ ├── test_repack_kv_index.py │ └── test_rope_repeat.py ├── llama │ ├── test_context_flashattention_nopad.py │ └── test_token_attention_nopad.py └── qwen2_vl │ └── test_mrope.py ├── server ├── core │ └── objs │ │ ├── test_atomic_array_lock.py │ │ ├── test_atomic_lock.py │ │ ├── test_out_token_circlequeue.py │ │ ├── test_req.py │ │ ├── test_sampling_params.py │ │ ├── test_shm_array.py │ │ └── test_shm_req_manager.py └── router │ └── dynamic_prompt │ └── test_radix_cache.py └── utils └── test_custom_kernel_utils.py /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Pre-commit checks 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | with: 16 | fetch-depth: 0 # Fetch all history for all branches and tags 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: '3.9' 22 | 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install pre-commit 27 | pre-commit install-hooks 28 | 29 | - name: Run pre-commit on modified files 30 | run: | 31 | if [ -n "$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }})" ]; then 32 | pre-commit run --files $(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }}) 33 | else 34 | echo "No files to check" 35 | fi 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pyc 3 | build 4 | dist 5 | *.egg-info 6 | .idea 7 | .vscode 8 | tmp/ 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 
21.12b0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | args: [--line-length=120] 8 | additional_dependencies: ['click==8.0.4'] 9 | - repo: https://github.com/pycqa/flake8 10 | rev: 3.9.0 11 | hooks: 12 | - id: flake8 13 | additional_dependencies: [flake8-typing-imports==1.9.0] 14 | args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606'] -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing Guidelines 2 | 3 | ### Coding Style Guide 4 | 5 | In general, we adhere to the [Google Python style guide](https://google.github.io/styleguide/pyguide.html), and we recommend using `yapf` to format your code. 6 | 7 | In this project, we have adopted `pre-commit` to automatically check the code style. 8 | 9 | To begin with, install `pre-commit` as shown below. 10 | 11 | ```bash 12 | pip install pre-commit 13 | ``` 14 | 15 | Then, configure the pre-commit hook as shown below. 16 | 17 | ```bash 18 | pre-commit install 19 | ``` 20 | 21 | Then, when you commit your changes, your code will be checked automatically. 22 | -------------------------------------------------------------------------------- /assets/att.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/att.gif -------------------------------------------------------------------------------- /assets/lightllm.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/lightllm.drawio.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/assets/logo.png -------------------------------------------------------------------------------- /benchmark.md: -------------------------------------------------------------------------------- 1 | #### lightllm 2 | 3 | #### Launch service 4 | 5 | ~~~shell 6 | python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto 7 | ~~~ 8 | 9 | #### Evaluation 10 | 11 | ~~~shell 12 | python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 13 | ~~~ 14 | 15 | #### vllm 16 | 17 | #### Launch service 18 | ~~~shell 19 | python -m vllm.entrypoints.api_server --model /path/llama-7b --swap-space 16 --disable-log-requests --port 9009 20 | ~~~ 21 | 22 | #### Evaluation 23 | 24 | ~~~shell 25 | python benchmark_serving_vllm.py --backend vllm --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 --host 127.0.0.1 --port 9009 26 | ~~~ -------------------------------------------------------------------------------- /build_and_upload_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build and push a Docker image to AWS ECR.
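# A minimal usage sketch for this script; the account ID and tag shown are hypothetical
# placeholders, not values taken from this repository:
#   ./build_and_upload_docker.sh 123456789012 v1.0.0
# The first argument is the AWS account ID and the second is the image tag; they feed the
# ACCOUNT=$1 and IMAGE_TAG=$2 assignments used by the ECR login, build, and push steps below.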
4 | 5 | set -eo pipefail 6 | 7 | if [ -z "$1" ]; then 8 | echo "Must supply AWS account ID" 9 | exit 1; 10 | fi 11 | 12 | if [ -z "$2" ]; then 13 | echo "Must supply the image tag" 14 | exit 1; 15 | fi 16 | 17 | IMAGE_TAG=$2 18 | ACCOUNT=$1 19 | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com 20 | DOCKER_BUILDKIT=1 docker build -t $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG . 21 | docker push $ACCOUNT.dkr.ecr.us-west-2.amazonaws.com/lightllm:$IMAGE_TAG 22 | -------------------------------------------------------------------------------- /demos/qa_server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/demos/qa_server/__init__.py -------------------------------------------------------------------------------- /demos/readme.txt: -------------------------------------------------------------------------------- 1 | A directory of application demos. -------------------------------------------------------------------------------- /docs/CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-20.04 6 | tools: 7 | python: "3.10" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/CN/source/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: docs/CN/requirements-docs.txt -------------------------------------------------------------------------------- /docs/CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/CN/README.md: -------------------------------------------------------------------------------- 1 | ## Build the docs 2 | 3 | ```bash 4 | # Install lightllm 5 | 6 | # git clone https://github.com/ModelTC/lightllm.git 7 | # cd lightllm 8 | pip install --no-deps . 9 | ``` 10 | 11 | ```bash 12 | # Install dependencies. 13 | 14 | # cd docs/CN 15 | pip install -r requirements-docs.txt 16 | 17 | # Build the docs. 18 | make clean 19 | make html 20 | ``` 21 | 22 | ## Open the docs with your browser 23 | 24 | ```bash 25 | python -m http.server -d build/html/ 26 | ``` 27 | 28 | Launch your browser and open localhost:8000.
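The build and serve steps above can also be chained into a single run; the sketch below mirrors what the `rebuild.sh` helper next to this README does (it additionally passes an explicit port, 8000 in this case):

```bash
# Rebuild the documentation from scratch and serve it locally on port 8000.
make clean
make html
python -m http.server -d build/html/ 8000
```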
29 | -------------------------------------------------------------------------------- /docs/CN/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/CN/rebuild.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | make html 3 | python -m http.server -d build/html/ 8000 -------------------------------------------------------------------------------- /docs/CN/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | sphinxcontrib.redoc 7 | sphinxcontrib.openapi 8 | 9 | # packages to install to build the documentation 10 | pydantic 11 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 12 | numpy -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER1.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER2.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER3.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/ER4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/ER4.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/HttpServer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/HttpServer.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Performance2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Performance2.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Router.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Router.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/Visual_Server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/Visual_Server.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/arch.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/backend.png -------------------------------------------------------------------------------- /docs/CN/source/assets/lightllm/token_attn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/lightllm/token_attn.gif -------------------------------------------------------------------------------- /docs/CN/source/assets/logos/lightllm-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/CN/source/assets/logos/lightllm-logo.png -------------------------------------------------------------------------------- /docs/CN/source/server/api_server_args_zh.rst: -------------------------------------------------------------------------------- 1 | APIServer 参数详解 2 | ============================= 3 | 4 | 5 | 使用方法 6 | ++++++++++++ 7 | 8 | .. 
argparse:: 9 | :module: lightllm.server.api_cli 10 | :func: make_argument_parser 11 | :prog: python -m lightllm.server.api_server 12 | :nodefaultconst: 13 | -------------------------------------------------------------------------------- /docs/CN/source/server/benchmark.rst: -------------------------------------------------------------------------------- 1 | 服务性能评测 2 | ================== 3 | 4 | 部署完模型以后,对服务性能进行评测是非常重要的,通过服务性能的表现调整配置从而更好地利用显卡资源。 5 | 本文中,我们使用 LLaMA-7B 模型,在80G的A800显卡上,比较了lightllm 和 vLLM==0.1.2 的性能。 6 | 具体比较方式参考以下步骤: 7 | 8 | 1. 下载数据集 9 | ^^^^^^^^^^^^^^ 10 | 11 | .. code-block:: console 12 | 13 | $ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 14 | 15 | 16 | 2. 开启模型服务 17 | ^^^^^^^^^^^^^^^^^^^ 18 | 19 | .. code-block:: console 20 | 21 | $ python -m lightllm.server.api_server --model_dir /path/llama-7b --tp 1 --max_total_token_num 121060 --tokenizer_mode auto 22 | 23 | 24 | 3. 性能评测 25 | ^^^^^^^^^^^^^^^^ 26 | 27 | .. code-block:: console 28 | 29 | $ cd test 30 | $ python benchmark_serving.py --tokenizer /path/llama-7b --dataset /path/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 2000 --request-rate 200 31 | 32 | 33 | 输出: 34 | 35 | .. code-block:: console 36 | 37 | read data set finish 38 | total tokens: 494250 39 | Total time: 111.37 s 40 | Throughput: 8.98 requests/s 41 | Average latency: 43.52 s 42 | Average latency per token: 0.15 s 43 | Average latency per output token: 0.73 s -------------------------------------------------------------------------------- /docs/EN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-20.04 6 | tools: 7 | python: "3.10" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/EN/source/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: docs/EN/requirements-docs.txt -------------------------------------------------------------------------------- /docs/EN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/EN/README.md: -------------------------------------------------------------------------------- 1 | ## Build the docs 2 | 3 | ```bash 4 | # Install lightllm 5 | 6 | # git clone https://github.com/ModelTC/lightllm.git 7 | # cd lightllm 8 | pip install --no-deps . 9 | ``` 10 | 11 | ```bash 12 | # Install dependencies. 13 | 14 | # cd docs/EN 15 | pip install -r requirements-docs.txt 16 | 17 | # Build the docs. 
18 | make clean 19 | make html 20 | ``` 21 | 22 | ## Open the docs with your browser 23 | 24 | ```bash 25 | python -m http.server -d build/html/ 26 | ``` 27 | 28 | Launch your browser and open localhost:8000. 29 | -------------------------------------------------------------------------------- /docs/EN/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/EN/rebuild.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | make html 3 | python -m http.server -d build/html/ 5888 -------------------------------------------------------------------------------- /docs/EN/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | myst-parser == 2.0.0 5 | sphinx-argparse 6 | sphinxcontrib.redoc 7 | sphinxcontrib.openapi 8 | 9 | # packages to install to build the documentation 10 | pydantic 11 | -f https://download.pytorch.org/whl/cpu 12 | openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args 13 | numpy -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER1.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER2.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER3.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/ER4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/ER4.png 
-------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/HttpServer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/HttpServer.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Performance2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Performance2.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Router.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Router.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/Visual_Server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/Visual_Server.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/arch.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/backend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/backend.png -------------------------------------------------------------------------------- /docs/EN/source/assets/lightllm/token_attn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/lightllm/token_attn.gif -------------------------------------------------------------------------------- /docs/EN/source/assets/logos/lightllm-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/docs/EN/source/assets/logos/lightllm-logo.png -------------------------------------------------------------------------------- /docs/EN/source/getting_started/faq.rst: -------------------------------------------------------------------------------- 1 | .. _faq: 2 | 3 | - The LLaMA tokenizer fails to load. 4 | - Consider resolving this by running the command: 5 | 6 | .. 
code-block:: shell 7 | 8 | pip install protobuf==3.20.0 9 | 10 | - ``error : PTX .version 7.4 does not support .target sm_89`` 11 | - Launch with: 12 | 13 | .. code-block:: shell 14 | 15 | bash tools/resolve_ptx_version python -m lightllm.server.api_server ... -------------------------------------------------------------------------------- /docs/EN/source/server/api_server_args.rst: -------------------------------------------------------------------------------- 1 | APIServer Args 2 | ============================= 3 | 4 | 5 | Usage 6 | ++++++++++++ 7 | 8 | .. argparse:: 9 | :module: lightllm.server.api_cli 10 | :func: make_argument_parser 11 | :prog: python -m lightllm.server.api_server 12 | :nodefaultconst: -------------------------------------------------------------------------------- /format.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | for filename in glob.glob('./**/*.py', recursive=True): 5 | print(filename) 6 | os.system(f"autopep8 --max-line-length 140 --in-place --aggressive --aggressive {filename}") 7 | -------------------------------------------------------------------------------- /format_out/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/format_out/__init__.py -------------------------------------------------------------------------------- /format_out/grammer/__init__.py: -------------------------------------------------------------------------------- 1 | # Restrictions on the grammar representation 2 | # 1. The start symbol must be S' 3 | # 2. "ε" expressions are not supported 4 | 5 | 6 | grammar = [ 7 | ("S'", ["S"]), 8 | ("S", ["A", "B"]), 9 | ("A", ["a", "A"]), 10 | ("A", ["ε"]), 11 | ("B", ["b", "B"]), 12 | ("B", ["ε"]), 13 | ] 14 | -------------------------------------------------------------------------------- /format_out/grammer/json.ebnf: -------------------------------------------------------------------------------- 1 | root ::= basic_array | basic_object 2 | basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object 3 | basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"? 4 | basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
5 | basic_string ::= (([\"] basic_string_1 [\"])) 6 | basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1 7 | escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] 8 | basic_boolean ::= "true" | "false" 9 | basic_null ::= "null" 10 | basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]" 11 | basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}" 12 | ws ::= [ \n\t]* -------------------------------------------------------------------------------- /format_out/grammer/test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | python test0.py 3 | python test1.py 4 | python test2.py 5 | python test3.py 6 | python test4.py 7 | python test5.py 8 | python test6.py 9 | -------------------------------------------------------------------------------- /lightllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/__init__.py -------------------------------------------------------------------------------- /lightllm/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/__init__.py -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/all_kernel_configs/__init__.py -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=1,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, 
"num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=16,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=2,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=32,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=4,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}} 
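Note: the per-shape JSON files under `all_kernel_configs` map a first-level size key and a second-level size key to pre-tuned Triton launch parameters (block sizes, group size, pipeline stages, warp count). The sketch below only illustrates how such a file could be read and the nearest configured size selected; the helper names and the interpretation of the keys as runtime sizes to match are assumptions for this note, not lightllm's actual loading code.

```python
# Illustrative sketch only: consuming one of the tuned-config JSON files above.
# The helper names and the meaning of the outer/inner keys are assumptions,
# not the repository's actual API.
import json


def load_kernel_config(path: str) -> dict:
    """Read a tuned-kernel JSON file into a nested dict of launch parameters."""
    with open(path, "r") as f:
        return json.load(f)


def pick_nearest(config: dict, size: int) -> dict:
    """Pick the entry whose integer key is closest to the requested size."""
    keys = sorted(int(k) for k in config)
    best = min(keys, key=lambda k: abs(k - size))
    return config[str(best)]


if __name__ == "__main__":
    # Hypothetical usage against one of the files listed above.
    cfg = load_kernel_config(
        "lightllm/common/all_kernel_configs/bmm_scaled_fp8/"
        "{B=16,K=128,M=512,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json"
    )
    outer = pick_nearest(cfg, 512)      # first-level size key
    params = pick_nearest(outer, 128)   # second-level size key
    print(params)  # e.g. {"BLOCK_SIZE_M": 64, ..., "num_warps": 4}
```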
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=64,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=128,M=8,N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=1,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=16,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 2, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: 
-------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=2,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=32,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=4,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=512,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"512": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=64,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 4, "num_warps": 4}}, "16": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 4}}, "32": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "64": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H100_80GB_HBM3.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_stages": 2, "num_warps": 4}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/bmm_scaled_fp8/{B=16,K=512,M=8,N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 4, "num_warps": 4}}, "2": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, "num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_stages": 3, "num_warps": 8}}, "4": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_stages": 4, 
"num_warps": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 8}}, "8": {"128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_stages": 3, "num_warps": 4}}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1024,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1152,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=1536,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=16384,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=18432,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=2048,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=2304,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, 
"BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=256,N=7168,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=2048,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=32768,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=512,N=4096,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=1536,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=2304,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=24576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, 
"BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=256,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=36864,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "16": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "32": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8, "num_stages": 5, "num_warps": 2}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}, "2048": {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 8, "num_stages": 3, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=512,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=576,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/fp8_block_mm/{K=7168,N=8072,block_size=[128,128],out_dtype=torch.bfloat16}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"8": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "16": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "32": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "100": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "128": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "1024": {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}, "2048": {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "GROUP_M": 8, "num_stages": 4, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=1408,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=192,N=5120,expert_num=160,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H800.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 8, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} 2 | -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=1408,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=192,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=2048,N=2816,expert_num=64,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 2}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "1024": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=4096,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=5120,N=384,expert_num=160,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=6,use_fp8_w8a8=true}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 3}, "256": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}} 2 | -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=704,N=2048,expert_num=64,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 3}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "512": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 5}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/grouped_moe_gemm_kernel/{K=96,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}, "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "64": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 16, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, "8192": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 4, "num_warps": 8, "num_stages": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1024,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": 
{"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "2048": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=128,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "512": {"BLOCK_M": 4, "BLOCK_N": 64, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 16}, "4096": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 4, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=1408,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 64, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, 
"2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 16}, "64": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "128": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "1024": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "2048": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 2}, "8": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "512": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 128, "num_warps": 16}, "2048": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "8": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 16}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "1024": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 32, "BLOCK_N": 64, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=2048,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 2}, "8": {"BLOCK_M": 2, "BLOCK_N": 128, "num_warps": 4}, "64": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 16}, "128": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 4}, "256": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 2}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 4}, "8192": {"BLOCK_M": 8, "BLOCK_N": 256, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=256,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 256, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "1024": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 16, "BLOCK_N": 64, "num_warps": 8}, "512": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 8, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 2, "BLOCK_N": 256, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=4096,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "8": {"BLOCK_M": 4, "BLOCK_N": 128, "num_warps": 8}, "64": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 4}, "128": {"BLOCK_M": 8, "BLOCK_N": 128, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 1}, "2048": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 4}, "4096": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 8, "BLOCK_N": 64, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=512,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 4}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "128": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 8}, "512": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "2048": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 2}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_N": 512, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "64": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 4}, "128": {"BLOCK_M": 2, "BLOCK_N": 1024, "num_warps": 4}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 1}, "512": {"BLOCK_M": 32, "BLOCK_N": 128, "num_warps": 4}, "1024": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "2048": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "4096": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}, "8192": {"BLOCK_M": 4, "BLOCK_N": 256, "num_warps": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_silu_and_mul_kernel/{N=8192,out_dtype=torch.bfloat16}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_N": 128, "num_warps": 8}, "8": {"BLOCK_M": 1, "BLOCK_N": 512, "num_warps": 16}, "64": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "128": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "256": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "512": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "1024": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 2}, "2048": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "4096": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}, "8192": {"BLOCK_M": 1, "BLOCK_N": 1024, "num_warps": 8}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 8, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 
256, "num_warps": 4, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=1024,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, 
"num_warps": 4, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 2}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 5}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: 
-------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 4, "NUM_STAGE": 4}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 4}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, 
"NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 128, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=4096,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 1, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 2}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 16, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 5}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 3}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 4}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 5}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} 
-------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 128, "num_warps": 4, "NUM_STAGE": 2}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=5120,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 2, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 3}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 3}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=1}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, 
"64": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "512": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 2}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 2, "NUM_STAGE": 4}, "2048": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 3}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 5}, "8192": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 4, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_A800-SXM4-80GB.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 2, "BLOCK_DIM": 512, "num_warps": 8, "NUM_STAGE": 1}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 16, "NUM_STAGE": 1}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "128": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "512": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "1024": {"BLOCK_M": 2, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "4096": {"BLOCK_M": 2, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 1, "NUM_STAGE": 4}} -------------------------------------------------------------------------------- /lightllm/common/all_kernel_configs/moe_sum_reduce_kernel/{hidden_dim=8192,out_dtype=torch.bfloat16,topk_num=6}_NVIDIA_H800.json: -------------------------------------------------------------------------------- 1 | {"1": {"BLOCK_M": 1, "BLOCK_DIM": 64, "num_warps": 8, "NUM_STAGE": 3}, "8": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 8, "NUM_STAGE": 5}, "64": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "128": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "256": {"BLOCK_M": 1, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 3}, "512": {"BLOCK_M": 4, "BLOCK_DIM": 512, "num_warps": 2, "NUM_STAGE": 4}, "1024": {"BLOCK_M": 4, "BLOCK_DIM": 1024, "num_warps": 4, "NUM_STAGE": 1}, "2048": {"BLOCK_M": 1, "BLOCK_DIM": 256, "num_warps": 1, "NUM_STAGE": 4}, "4096": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}, "8192": {"BLOCK_M": 1, "BLOCK_DIM": 1024, "num_warps": 8, "NUM_STAGE": 1}} -------------------------------------------------------------------------------- /lightllm/common/basemodel/__init__.py: -------------------------------------------------------------------------------- 1 | from .layer_weights.base_layer_weight import BaseLayerWeight 2 | from .layer_weights.pre_and_post_layer_weight import PreAndPostLayerWeight 3 | from .layer_weights.transformer_layer_weight import TransformerLayerWeight 4 | from .layer_infer.base_layer_infer import BaseLayerInfer 5 | from .layer_infer.pre_layer_infer import PreLayerInfer 6 | from .layer_infer.post_layer_infer import PostLayerInfer 7 | from .layer_infer.transformer_layer_infer import TransformerLayerInfer 8 | from .layer_infer.template.transformer_layer_infer_template import TransformerLayerInferTpl 9 | from .layer_infer.template.pre_layer_infer_template import PreLayerInferTpl 10 | from 
.layer_infer.template.post_layer_infer_template import PostLayerInferTpl 11 | from .infer_struct import InferStateInfo 12 | from .basemodel import TpPartBaseModel 13 | 14 | 15 | __all__ = [ 16 | "BaseLayerWeight", 17 | "PreAndPostLayerWeight", 18 | "TransformerLayerWeight", 19 | "BaseLayerInfer", 20 | "PreLayerInfer", 21 | "PostLayerInfer", 22 | "TransformerLayerInfer", 23 | "TransformerLayerInferTpl", 24 | "InferStateInfo", 25 | "TpPartBaseModel", 26 | "PreLayerInferTpl", 27 | "PostLayerInferTpl", 28 | ] 29 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class PostLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, network_config, mode): 8 | super().__init__() 9 | self.network_config_ = network_config 10 | self.mode = mode 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/pre_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class PreLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, network_config, mode): 8 | super().__init__() 9 | self.network_config_ = network_config 10 | self.mode = mode 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_infer/template/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/post_layer_infer_template.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..post_layer_infer import PostLayerInfer 3 | from typing import Tuple 4 | 5 | 6 | class PostLayerInferTpl(PostLayerInfer): 7 | """ """ 8 | 9 | def __init__(self, network_config, mode): 10 | super().__init__(network_config, mode) 11 | self.eps_ = 1e-5 12 | self.vocab_size_ = network_config["vocab_size"] 13 | self.embed_dim_ = network_config["n_embed"] 14 | return 15 | 16 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor: 17 | raise Exception("need to impl") 18 | 19 | def _slice_get_last_input(self, input, infer_state) -> Tuple[torch.Tensor, int]: 20 | raise Exception("need to impl") 21 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/template/pre_layer_infer_template.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..pre_layer_infer import PreLayerInfer 3 | 4 | 5 | class PreLayerInferTpl(PreLayerInfer): 6 | """ """ 7 | 8 | def __init__(self, network_config, mode): 9 | super().__init__(network_config, 
mode) 10 | self.eps_ = 1e-5 11 | self.vob_start_id_ = -1 12 | self.vob_end_id_ = -1 13 | return 14 | 15 | def _norm(self, input, infer_state, layer_weight) -> torch.Tensor: 16 | raise Exception("need to impl") 17 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from .base_layer_infer import BaseLayerInfer 2 | 3 | 4 | class TransformerLayerInfer(BaseLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, layer_num, network_config, mode): 8 | super().__init__() 9 | self.layer_num_ = layer_num 10 | self.network_config_ = network_config 11 | self.mode = mode 12 | return 13 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/meta_weights/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_weight import BaseWeight 2 | from .mm_weight import ( 3 | MMWeightTpl, 4 | MultiMMWeightTpl, 5 | ROWMMWeight, 6 | COLMMWeight, 7 | MultiROWMMWeight, 8 | ROWBMMWeight, 9 | ) 10 | from .norm_weight import NormWeight, GEMMANormWeight, TpNormWeight 11 | from .fused_moe_weight_tp import FusedMoeWeightTP 12 | from .fused_moe_weight_ep import FusedMoeWeightEP 13 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py: -------------------------------------------------------------------------------- 1 | from .mm_weight import ( 2 | MMWeightTpl, 3 | MultiMMWeightTpl, 4 | ) 5 | from .rowmm_weight import ( 6 | ROWMMWeight, 7 | ROWBMMWeight, 8 | MultiROWMMWeight, 9 | W8A8B128ROWMMWeight, 10 | W8A8B128ROWBMMWeight, 11 | W8A8B128MultiROWMMWeight, 12 | ) 13 | from .colmm_weight import ( 14 | COLMMWeight, 15 | W8A8B128COLMMWeight, 16 | ) 17 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | from .base_layer_weight import BaseLayerWeight 2 | 3 | 4 | class PreAndPostLayerWeight(BaseLayerWeight): 5 | def __init__(self, data_type, network_config, mode): 6 | super().__init__() 7 | self.data_type_ = data_type 8 | self.network_config_ = network_config 9 | self.mode = mode 10 | self.init_static_params() 11 | return 12 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/microbatch_overlap_objs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class DecodeMicroBatch: 7 | batch_size: int 8 | total_token_num: int 9 | max_len_in_batch: int 10 | input_ids: torch.Tensor 11 | mem_indexes: torch.Tensor 12 | b_req_idx: torch.Tensor 13 | b_seq_len: torch.Tensor 14 | 15 | 16 | @dataclass 17 | class PrefillMicroBatch: 18 | batch_size: int 19 | total_token_num: int 20 | max_len_in_batch: int 21 | input_ids: torch.Tensor 22 | 
mem_indexes: torch.Tensor 23 | b_req_idx: torch.Tensor 24 | b_seq_len: torch.Tensor 25 | b_ready_cache_len: torch.Tensor 26 | multimodal_params: list 27 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/basemodel/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/add_in_place.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _add_in_place( 9 | input_ptr, 10 | other_ptr, 11 | n_elements, 12 | alpha, 13 | BLOCK_SIZE: tl.constexpr, 14 | ): 15 | pid = tl.program_id(axis=0) 16 | block_start = pid * BLOCK_SIZE 17 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < n_elements 19 | x = tl.load(input_ptr + offsets, mask=mask) 20 | y = tl.load(other_ptr + offsets, mask=mask) 21 | x = x + y * alpha 22 | tl.store(input_ptr + offsets, x, mask=mask) 23 | 24 | 25 | @torch.no_grad() 26 | def add_in_place(input: torch.Tensor, other: torch.Tensor, *, alpha=1): 27 | assert input.is_contiguous(), "input tensor must be contiguous" 28 | assert other.is_contiguous(), "other tensor must be contiguous" 29 | n_elements = input.numel() 30 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 31 | _add_in_place[grid]( 32 | input, 33 | other, 34 | n_elements, 35 | alpha, 36 | BLOCK_SIZE=1024, 37 | ) 38 | return input 39 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/copy_kv_index_to_req.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _fwd_kernel_copy_kv_index_to_req( 9 | req_to_token_indexs, b_req_idx, b_seq_len, memindex, 10 | stride_req_to_token_b, stride_req_to_token_s 11 | ): 12 | cur_index = tl.program_id(0) 13 | cur_req_idx = tl.load(b_req_idx + cur_index) 14 | cur_token_index = tl.load(memindex + cur_index) 15 | cur_seq_len = tl.load(b_seq_len + cur_index) 16 | dest_offset = req_to_token_indexs + cur_req_idx * stride_req_to_token_b + (cur_seq_len - 1) * stride_req_to_token_s 17 | tl.store(dest_offset, cur_token_index) 18 | return 19 | 20 | 21 | @torch.no_grad() 22 | def copy_kv_index_to_req(req_to_token_indexs, b_req_idx, b_seq_len, memindex): 23 | seq_len = b_seq_len.shape[0] 24 | assert b_seq_len.shape[0] == memindex.shape[0] and b_req_idx.shape[0] == b_seq_len.shape[0] 25 | grid = (seq_len,) 26 | num_warps = 1 27 | 28 | _fwd_kernel_copy_kv_index_to_req[grid]( 29 | req_to_token_indexs, b_req_idx, b_seq_len, memindex, 30 | req_to_token_indexs.stride(0), req_to_token_indexs.stride(1), 31 | num_warps=num_warps, 32 | num_stages=1, 33 | ) 34 | return 35 | -------------------------------------------------------------------------------- /lightllm/common/basemodel/triton_kernel/gen_decode_params.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | from .gen_prefill_params import gen_cumsum_pad0_tensor 5 | 6 | 7 | @torch.no_grad() 8 | def 
gen_decode_params(b_seq_len: torch.Tensor): 9 | b_kv_seq_len = b_seq_len 10 | position_ids = b_seq_len - 1 11 | b_q_seq_len = torch.ones_like(b_seq_len) 12 | b1_cu_q_seq_len, b1_cu_kv_seq_len = gen_cumsum_pad0_tensor(b_q_seq_len, b_kv_seq_len) 13 | max_q_seq_len = b_q_seq_len.max().item() 14 | max_kv_seq_len = b_kv_seq_len.max().item() 15 | return b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len, position_ids, max_q_seq_len, max_kv_seq_len 16 | -------------------------------------------------------------------------------- /lightllm/common/build_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def repair_config(config, same_names): 3 | find_value = None 4 | for name in same_names: 5 | if name in config and config[name] is not None: 6 | find_value = config[name] 7 | break 8 | for name in same_names: 9 | config[name] = find_value 10 | return -------------------------------------------------------------------------------- /lightllm/common/deepseek2_fp8kv_mem_manager.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .deepseek2_mem_manager import Deepseek2MemoryManager 3 | 4 | 5 | class Deepseek2FP8KVMemoryManager(Deepseek2MemoryManager): 6 | def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9): 7 | # The scale is appended to the end of kv_buffer, so head_dim is increased by 2, and dtype is unified to uint8 8 | super().__init__(size, torch.uint8, head_num, head_dim + 2, layer_num, always_copy, mem_fraction) 9 | -------------------------------------------------------------------------------- /lightllm/common/fused_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/fused_moe/__init__.py -------------------------------------------------------------------------------- /lightllm/common/infer_utils.py: -------------------------------------------------------------------------------- 1 | def init_req_to_token_indexes( 2 | req_to_token_indexs, b_req_idx, b_seq_len, b_ready_cache_len, max_len_in_batch, alloc_mem_index 3 | ): 4 | start_index = 0 5 | b_seq_len_numpy = b_seq_len.cpu().numpy() 6 | b_ready_cache_len_numpy = b_ready_cache_len.cpu().numpy() 7 | b_req_idx_numpy = b_req_idx.cpu().numpy() 8 | for i in range(len(b_seq_len)): 9 | cur_seq_len = b_seq_len_numpy[i] 10 | cur_ready_cache_len = b_ready_cache_len_numpy[i] 11 | req_to_token_indexs[b_req_idx_numpy[i], cur_ready_cache_len:cur_seq_len] = alloc_mem_index[ 12 | start_index : start_index + cur_seq_len - cur_ready_cache_len 13 | ] 14 | start_index += cur_seq_len - cur_ready_cache_len 15 | return 16 | -------------------------------------------------------------------------------- /lightllm/common/kv_trans_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/kv_trans_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/common/mem_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.common.mem_manager import MemoryManager 2 | from lightllm.common.int8kv_mem_manager import INT8KVMemoryManager 3 | from lightllm.common.ppl_int8kv_mem_manager import PPLINT8KVMemoryManager 4 | from lightllm.common.ppl_int4kv_mem_manager import 
PPLINT4KVMemoryManager 5 | from lightllm.utils.log_utils import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | def select_mem_manager_class(mode): 11 | logger.info(f"mode setting params: {mode}") 12 | if "ppl_int8kv" in mode or "ppl_int8kv_flashdecoding" in mode: 13 | memory_manager_class = PPLINT8KVMemoryManager 14 | logger.info(f"Model kv cache using mode {mode}") 15 | elif "ppl_int4kv_flashdecoding" in mode: 16 | memory_manager_class = PPLINT4KVMemoryManager 17 | logger.info(f"Model kv cache using mode {mode}") 18 | elif "triton_int8kv" in mode: 19 | memory_manager_class = INT8KVMemoryManager 20 | logger.info("Model kv cache using mode triton int8kv") 21 | elif "triton_fp8kv" in mode: 22 | raise Exception("currently only for deepseek") 23 | else: 24 | memory_manager_class = MemoryManager 25 | logger.info("Model kv cache using mode normal") 26 | return memory_manager_class 27 | -------------------------------------------------------------------------------- /lightllm/common/quantization/configs/llamacls-mix-down.yaml: -------------------------------------------------------------------------------- 1 | quant_type: vllm-w8a8 2 | mix_bits: 3 | - name: "down_proj" 4 | quant_type: "none" 5 | layer_nums: [1, 2, 3] # Defaults to all layers, or you can specify a layer_num list. -------------------------------------------------------------------------------- /lightllm/common/quantization/quantize_method.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from abc import ABC, abstractmethod 3 | from lightllm.utils.dist_utils import get_current_device_id 4 | 5 | 6 | class QuantizationMethod(ABC): 7 | def __init__(self): 8 | super().__init__() 9 | self.device_id_ = get_current_device_id() 10 | self.weight_scale_suffix = None 11 | self.act_scale_suffix = None 12 | 13 | @abstractmethod 14 | def quantize(self, weights: torch.Tensor): 15 | pass 16 | 17 | @abstractmethod 18 | def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True): 19 | pass 20 | -------------------------------------------------------------------------------- /lightllm/common/quantization/registry.py: -------------------------------------------------------------------------------- 1 | class QuantMethodFactory: 2 | def __init__(self): 3 | self._quant_methods = {} 4 | 5 | def register(self, names): 6 | def decorator(cls): 7 | local_names = names 8 | if isinstance(local_names, str): 9 | local_names = [local_names] 10 | for n in local_names: 11 | self._quant_methods[n] = cls 12 | return cls 13 | 14 | return decorator 15 | 16 | def get(self, key, *args, **kwargs): 17 | if key == "none": 18 | return None 19 | quant_method_class = self._quant_methods.get(key) 20 | if not quant_method_class: 21 | raise ValueError(f"QuantMethod '{key}' not supported.") 22 | return quant_method_class() 23 | 24 | 25 | QUANTMETHODS = QuantMethodFactory() 26 | -------------------------------------------------------------------------------- /lightllm/common/quantization/triton_quant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/__init__.py -------------------------------------------------------------------------------- /lightllm/common/quantization/triton_quant/fp8/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/common/quantization/triton_quant/fp8/__init__.py -------------------------------------------------------------------------------- /lightllm/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | -------------------------------------------------------------------------------- /lightllm/models/bloom/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/bloom/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/bloom/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/chatglm2/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/chatglm2/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/__init__.py 
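The QuantMethodFactory in lightllm/common/quantization/registry.py above is a plain name-to-class registry: implementations attach themselves with the register decorator (a single name or a list of aliases) and callers obtain a fresh instance by name through get, with "none" special-cased to return None. A minimal usage sketch follows; the DemoQuantMethod class and the "demo" names are hypothetical and not part of the repository.

from lightllm.common.quantization.registry import QUANTMETHODS

# Hypothetical quant method registered under two alias names; real methods in the
# repo subclass QuantizationMethod and implement quantize()/apply().
@QUANTMETHODS.register(["demo-w8a8", "demo"])
class DemoQuantMethod:
    def quantize(self, weights):
        return weights  # placeholder: a real method would return quantized weights plus scales

    def apply(self, input_tensor, weight, bias=None, out=None, use_custom_tensor_mananger=True):
        return input_tensor @ weight  # placeholder

assert isinstance(QUANTMETHODS.get("demo"), DemoQuantMethod)  # instantiated by name
assert QUANTMETHODS.get("none") is None  # "none" is special-cased to disable quantization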
-------------------------------------------------------------------------------- /lightllm/models/cohere/infer_struct.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 2 | 3 | 4 | class CohereInferStateInfo(LlamaInferStateInfo): 5 | def __init__(self): 6 | super().__init__() 7 | self._attn_out = None 8 | self._ffn_out = None 9 | -------------------------------------------------------------------------------- /lightllm/models/cohere/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/cohere/triton_kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/cohere/triton_kernels/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/infer_struct.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import torch.distributed as dist 5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 6 | 7 | 8 | class Deepseek2InferStateInfo(LlamaInferStateInfo): 9 | def __init__(self): 10 | super().__init__() 11 | self.kv_starts = None 12 | 13 | def init_some_extra_state(self, model, input_ids: torch.Tensor): 14 | super().init_some_extra_state(model, input_ids) 15 | if not self.is_prefill: 16 | self.kv_starts = self.b1_cu_kv_seq_len 17 | 18 | if self.is_prefill: 19 | self.b1_kv_start_loc = self.b1_cu_kv_seq_len 20 | self.max_value_in_b_seq_len = self.b_seq_len.max().item() 21 | return 22 | -------------------------------------------------------------------------------- /lightllm/models/deepseek2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/deepseek2/triton_kernel/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/deepseek2/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma3/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | # add key: language_model.xxx -> xxx 7 | # only change keys at PreAndPostLayerWeight load, TransformLayerWeight is correct now 8 | def rename_weight_keys(weights): 9 | prefix = "language_model." 
10 | keys = list(weights.keys()) 11 | for k in keys: 12 | if prefix in k: 13 | weights[k[len(prefix) :]] = weights[k] 14 | 15 | 16 | class Gemma3PreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 17 | def __init__(self, data_type, network_config, mode): 18 | network_config["tie_word_embeddings"] = True 19 | super().__init__(data_type, network_config, mode) 20 | return 21 | 22 | def load_hf_weights(self, weights): 23 | rename_weight_keys(weights) 24 | super().load_hf_weights(weights) 25 | return 26 | -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Gemma_2bPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.embed_tokens.weight" in weights: 17 | # print(weights['model.embed_tokens.weight'].shape) 18 | self.wte_weight_ = self._cuda(weights["model.embed_tokens.weight"][split_start:split_end, :]) 19 | self.lm_head_weight_ = self.wte_weight_ 20 | 21 | if "model.norm.weight" in weights: 22 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 23 | self.final_norm_weight_ = self.final_norm_weight_ + 1 24 | 25 | return 26 | -------------------------------------------------------------------------------- /lightllm/models/gemma_2b/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/gemma_2b/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/__init__.py
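The load_hf_weights methods in the gemma_2b weights above (and the internlm2 variants further down) all shard the vocabulary across tensor-parallel ranks with the same np.linspace pattern. A small worked sketch of those boundaries, using hypothetical sizes (vocab 32000, 4 ranks) rather than anything taken from the repo:

import numpy as np

vob_size, tp_world_size = 32000, 4
split_indexes = np.linspace(0, vob_size, tp_world_size + 1, dtype=np.int64)
# split_indexes -> [0, 8000, 16000, 24000, 32000]
for tp_rank in range(tp_world_size):
    split_start, split_end = split_indexes[tp_rank], split_indexes[tp_rank + 1]
    # rank 0 keeps embedding rows [0, 8000), rank 1 keeps [8000, 16000), and so on
    print(tp_rank, split_start, split_end)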
-------------------------------------------------------------------------------- /lightllm/models/internlm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import numpy as np 4 | 5 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight 6 | 7 | 8 | class InternlmTransformerLayerWeight(LlamaTransformerLayerWeight): 9 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 10 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 11 | return 12 | 13 | def _init_weight_names(self): 14 | super()._init_weight_names() 15 | self._q_bias_name = f"model.layers.{self.layer_num_}.self_attn.q_proj.bias" 16 | self._k_bias_name = f"model.layers.{self.layer_num_}.self_attn.k_proj.bias" 17 | self._v_bias_name = f"model.layers.{self.layer_num_}.self_attn.v_proj.bias" 18 | self._o_bias_name = f"model.layers.{self.layer_num_}.self_attn.o_proj.bias" 19 | -------------------------------------------------------------------------------- /lightllm/models/internlm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.internlm.layer_weights.transformer_layer_weight import InternlmTransformerLayerWeight 6 | from lightllm.models.llama.model import LlamaTpPartModel 7 | 8 | 9 | @ModelRegistry("internlm") 10 | class InternlmTpPartModel(LlamaTpPartModel): 11 | # weight class 12 | transformer_weight_class = InternlmTransformerLayerWeight 13 | 14 | def __init__(self, kvargs): 15 | super().__init__(kvargs) 16 | -------------------------------------------------------------------------------- /lightllm/models/internlm2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Internlm2PreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, 
self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.tok_embeddings.weight" in weights: 17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :]) 18 | if "output.weight" in weights: 19 | self.lm_head_weight_ = self._cuda(weights["output.weight"][split_start:split_end, :]) 20 | if "model.norm.weight" in weights: 21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 22 | 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/internlm2/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | 5 | from lightllm.models.registry import ModelRegistry 6 | from lightllm.models.internlm2.layer_weights.transformer_layer_weight import Internlm2TransformerLayerWeight 7 | from lightllm.models.internlm2.layer_weights.pre_and_post_layer_weight import Internlm2PreAndPostLayerWeight 8 | from lightllm.models.internlm.model import InternlmTpPartModel 9 | 10 | 11 | @ModelRegistry("internlm2") 12 | class Internlm2TpPartModel(InternlmTpPartModel): 13 | # weight class 14 | pre_and_post_weight_class = Internlm2PreAndPostLayerWeight 15 | transformer_weight_class = Internlm2TransformerLayerWeight 16 | 17 | def __init__(self, kvargs): 18 | super().__init__(kvargs) 19 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import numpy as np 4 | 5 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 6 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer 7 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 8 | from einops import rearrange 9 | 10 | 11 | class Internlm2RewardPostLayerInfer(LlamaPostLayerInfer): 12 | def token_forward(self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: LlamaPreAndPostLayerWeight): 13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state) 14 | 15 | input_embdings = None 16 | last_input = self._norm(last_input, infer_state, layer_weight) 17 | score = torch.mm(last_input, layer_weight.lm_head_weight_) 18 | 19 | return score 20 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_weights/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internlm2_reward/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | class Internlm2RewardPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 7 | def __init__(self, data_type, network_config, mode): 8 | super().__init__(data_type, network_config, mode) 9 | return 10 | 11 | def load_hf_weights(self, weights): 12 | vob_size = self.network_config_["vocab_size"] 13 | split_indexes = np.linspace(0, vob_size, self.tp_world_size_ + 1, dtype=np.int64) 14 | split_start = split_indexes[self.tp_rank_] 15 | split_end = split_indexes[self.tp_rank_ + 1] 16 | if "model.tok_embeddings.weight" in weights: 17 | self.wte_weight_ = self._cuda(weights["model.tok_embeddings.weight"][split_start:split_end, :]) 18 | if "v_head.weight" in weights: 19 | self.lm_head_weight_ = self._cuda(weights["v_head.weight"]).transpose(0, 1) 20 | if "model.norm.weight" in weights: 21 | self.final_norm_weight_ = self._cuda(weights["model.norm.weight"]) 22 | 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/internlm2_reward/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry, is_reward_model 5 | from lightllm.models.internlm2_reward.layer_infer.post_layer_infer import Internlm2RewardPostLayerInfer 6 | from lightllm.models.internlm2_reward.layer_weights.pre_and_post_layer_weight import ( 7 | Internlm2RewardPreAndPostLayerWeight, 8 | ) 9 | from lightllm.models.internlm2.model import Internlm2TpPartModel 10 | 11 | 12 | @ModelRegistry("internlm2", condition=is_reward_model()) 13 | class Internlm2RewardTpPartModel(Internlm2TpPartModel): 14 | # weight class 15 | pre_and_post_weight_class = Internlm2RewardPreAndPostLayerWeight 16 | 17 | post_layer_infer_class = Internlm2RewardPostLayerInfer 18 | 19 | def __init__(self, kvargs): 20 | super().__init__(kvargs) 21 | -------------------------------------------------------------------------------- /lightllm/models/internvl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/internvl/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/internvl/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/__init__.py -------------------------------------------------------------------------------- 
/lightllm/models/llama/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llama/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llama/yarn_rotary_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | 5 | # Inverse dim formula to find dim based on number of rotations 6 | def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): 7 | return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) 8 | 9 | 10 | # Find dim range bounds based on rotations 11 | def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): 12 | low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) 13 | high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) 14 | return max(low, 0), min(high, dim - 1) # Clamp values just in case 15 | 16 | 17 | def linear_ramp_mask(min, max, dim): 18 | if min == max: 19 | max += 0.001 # Prevent singularity 20 | 21 | linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) 22 | ramp_func = torch.clamp(linear_func, 0, 1) 23 | return ramp_func 24 | 25 | 26 | def get_mscale(scale=1): 27 | if scale <= 1: 28 | return 1.0 29 | return 0.1 * math.log(scale) + 1.0 30 | 31 | 32 | def get_deepseek_mscale(scale=1, mscale=1): 33 | if scale <= 1: 34 | return 1.0 35 | return 0.1 * mscale * math.log(scale) + 1.0 36 | -------------------------------------------------------------------------------- /lightllm/models/llava/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llava/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/llava/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/llava/layer_weights/pre_and_post_layer_weight.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.layer_weights.pre_and_post_layer_weight import LlamaPreAndPostLayerWeight 4 | 5 | 6 | # add key: language_model.xxx -> xxx 7 | # only change keys at 
PreAndPostLayerWeight load, TransformLayerWeight is correct now 8 | def rename_weight_keys(weights): 9 | prefix = "language_model." 10 | keys = list(weights.keys()) 11 | for k in keys: 12 | if prefix in k: 13 | weights[k[len(prefix) :]] = weights[k] 14 | 15 | 16 | class LlavaPreAndPostLayerWeight(LlamaPreAndPostLayerWeight): 17 | def __init__(self, data_type, network_config, mode): 18 | super().__init__(data_type, network_config, mode) 19 | return 20 | 21 | def load_hf_weights(self, weights): 22 | rename_weight_keys(weights) 23 | super().load_hf_weights(weights) 24 | return 25 | -------------------------------------------------------------------------------- /lightllm/models/minicpm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/__init__.py -------------------------------------------------------------------------------- /lightllm/models/minicpm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/minicpm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/minicpm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | import math 2 | from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight 3 | 4 | 5 | class MiniCPMTransformerLayerWeight(LlamaTransformerLayerWeight): 6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 8 | return 9 | 10 | def _parse_config(self): 11 | super()._parse_config() 12 | num_hidden_layers = self.network_config_["num_hidden_layers"] 13 | scale_depth = self.network_config_.get("scale_depth", math.sqrt(num_hidden_layers)) 14 | self.layer_scale = scale_depth / math.sqrt(num_hidden_layers) 15 | 16 | def load_hf_weights(self, weights): 17 | if self._o_weight_name in weights: 18 | weights[self._o_weight_name] *= self.layer_scale 19 | if self._down_weight_name in weights: 20 | weights[self._down_weight_name] *= self.layer_scale 21 | super().load_hf_weights(weights) 22 | -------------------------------------------------------------------------------- /lightllm/models/minicpm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.minicpm.layer_weights.transformer_layer_weight import MiniCPMTransformerLayerWeight 6 | from lightllm.models.minicpm.layer_weights.pre_and_post_layer_weight import MiniCPMPreAndPostLayerWeight 7 | from lightllm.models.llama.model import LlamaTpPartModel 8 | 9 | 10 | @ModelRegistry("minicpm") 11 | class MiniCPMTpPartModel(LlamaTpPartModel): 12 | # weight class 13 | transformer_weight_class = MiniCPMTransformerLayerWeight 14 | pre_and_post_weight_class = MiniCPMPreAndPostLayerWeight 15 | 16 | def __init__(self, kvargs): 17 | super().__init__(kvargs) 18 | -------------------------------------------------------------------------------- /lightllm/models/mistral/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer 2 | 3 | 4 | class MistralTransformerLayerInfer(LlamaTransformerLayerInfer): 5 | """ """ 6 | 7 | def __init__(self, layer_num, network_config, mode=[]): 8 | super().__init__(layer_num, network_config, mode) 9 | self.head_dim_ = network_config.get("head_dim", self.head_dim_) 10 | return 11 | -------------------------------------------------------------------------------- /lightllm/models/mistral/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mistral/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mistral/triton_kernel/init_att_sliding_window_info.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _fwd_kernel_init_att_window_info( 9 | b_seq_len, 10 | b_att_seq_len, 11 | batch_size, 12 | sliding_window, 13 | BLOCK_SIZE: tl.constexpr, 14 | ): 15 | cur_index = tl.program_id(0) 16 | cur_start = cur_index * BLOCK_SIZE 17 | offsets = cur_start + tl.arange(0, BLOCK_SIZE) 18 | mask = offsets < batch_size 19 | 20 | cur_seq_len = tl.load(b_seq_len + offsets, mask=mask) 21 | b_att_seq_len_data = tl.minimum(cur_seq_len, sliding_window) 22 | 23 | tl.store(b_att_seq_len + offsets, b_att_seq_len_data, mask=mask) 24 | return 25 | 26 | 27 | @torch.no_grad() 28 | def init_att_window_info_fwd(batch_size, b_seq_len, b_att_seq_len, sliding_window): 29 | # shape constraints 30 | assert batch_size == b_seq_len.shape[0] == b_att_seq_len.shape[0] 31 | 32 | BLOCK_SIZE = 32 33 | num_warps = 1 34 | grid = (triton.cdiv(batch_size, BLOCK_SIZE),) 35 | 36 | _fwd_kernel_init_att_window_info[grid]( 37 | b_seq_len, 38 | b_att_seq_len, 39 | batch_size=batch_size, 40 | sliding_window=sliding_window, 41 | BLOCK_SIZE=BLOCK_SIZE, 42 | num_warps=num_warps, 43 | num_stages=1, 44 | ) 45 | return 46 | -------------------------------------------------------------------------------- /lightllm/models/mixtral/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/__init__.py -------------------------------------------------------------------------------- /lightllm/models/mixtral/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_infer/__init__.py 
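The init_att_window_info_fwd kernel above simply clamps each request's sequence length to the sliding window, one batch slice per Triton program. A plain-PyTorch reference of the same computation, for illustration only (the batch values below are made up):

import torch

def init_att_window_info_ref(b_seq_len: torch.Tensor, sliding_window: int) -> torch.Tensor:
    # same result as the Triton kernel: b_att_seq_len[i] = min(b_seq_len[i], sliding_window)
    return torch.clamp(b_seq_len, max=sliding_window)

b_seq_len = torch.tensor([3, 4096, 9000])
print(init_att_window_info_ref(b_seq_len, sliding_window=4096))  # tensor([   3, 4096, 4096])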
-------------------------------------------------------------------------------- /lightllm/models/mixtral/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/mixtral/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/phi3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/phi3/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.phi3.layer_weights.transformer_layer_weight import Phi3TransformerLayerWeight 6 | from lightllm.models.phi3.layer_infer.transformer_layer_infer import Phi3TransformerLayerInfer 7 | from lightllm.models.llama.model import LlamaTpPartModel 8 | 9 | 10 | @ModelRegistry("phi3") 11 | class Phi3TpPartModel(LlamaTpPartModel): 12 | # weight class 13 | transformer_weight_class = Phi3TransformerLayerWeight 14 | 15 | transformer_layer_infer_class = Phi3TransformerLayerInfer 16 | 17 | def __init__(self, kvargs): 18 | super().__init__(kvargs) 19 | -------------------------------------------------------------------------------- /lightllm/models/phi3/triton_kernel/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen/layer_weights/__init__.py -------------------------------------------------------------------------------- 
/lightllm/models/qwen2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_5_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_5_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_infer/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_infer/post_layer_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 4 | from lightllm.models.llama.layer_infer.post_layer_infer import LlamaPostLayerInfer 5 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight 6 | from einops import rearrange 7 | 8 | 9 | class Qwen2RewardPostLayerInfer(LlamaPostLayerInfer): 10 | def token_forward( 11 | self, input_embdings, infer_state: LlamaInferStateInfo, layer_weight: Qwen2RewardPreAndPostLayerWeight 12 | ): 13 | last_input, token_num = self._slice_get_last_input(input_embdings, infer_state) 14 | 15 | input_embdings = None 16 | last_input = self._norm(last_input, infer_state, layer_weight) 17 | 18 | last_input = torch.addmm(layer_weight.score_up_bias, last_input, layer_weight.score_up_weight) 19 | last_input = torch.nn.functional.relu(last_input) 20 | score = torch.addmm(layer_weight.score_down_bias, last_input, layer_weight.score_down_weight) 21 | 22 | return score 23 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/layer_weights/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_reward/model.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.registry import ModelRegistry, is_reward_model 2 | from lightllm.models.qwen2_reward.layer_infer.post_layer_infer import Qwen2RewardPostLayerInfer 3 | from lightllm.models.qwen2_reward.layer_weights.pre_and_post_layer_weight import Qwen2RewardPreAndPostLayerWeight 4 | from lightllm.models.qwen2.model import Qwen2TpPartModel 5 | 6 | 7 | @ModelRegistry("qwen2", condition=is_reward_model()) 8 | class Qwen2RewardTpPartModel(Qwen2TpPartModel): 9 | 10 | pre_and_post_weight_class = 
Qwen2RewardPreAndPostLayerWeight 11 | post_layer_infer_class = Qwen2RewardPostLayerInfer 12 | 13 | def __init__(self, kvargs): 14 | super().__init__(kvargs) 15 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/infer_struct.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from lightllm.models.llama.infer_struct import LlamaInferStateInfo 4 | from lightllm.common.basemodel.infer_struct import InferStateInfo 5 | 6 | 7 | class Qwen2VLInferStateInfo(LlamaInferStateInfo): 8 | def __init__(self): 9 | super().__init__() 10 | self.position_cos = None 11 | self.position_sin = None 12 | 13 | def init_some_extra_state(self, model, input_ids: torch.Tensor): 14 | InferStateInfo.init_some_extra_state(self, model, input_ids) 15 | if self.is_prefill: 16 | position_ids = self.position_ids 17 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1) 18 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1) 19 | position_ids = None 20 | else: 21 | position_ids = self.position_ids 22 | self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1) 23 | self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1) 24 | return 25 | -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen2_vl/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen2_vl/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from typing import final 3 | from lightllm.models.registry import ModelRegistry 4 | from lightllm.models.qwen3.layer_infer.transformer_layer_infer import Qwen3TransformerLayerInfer 5 | from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight 6 | from lightllm.models.qwen2.model import Qwen2TpPartModel 7 | from lightllm.utils.log_utils import init_logger 8 | 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | @ModelRegistry("qwen3") 14 | class Qwen3TpPartModel(Qwen2TpPartModel): 15 | # weight class 16 | transformer_weight_class = Qwen3TransformerLayerWeight 17 | 18 | # infer class 19 | transformer_layer_infer_class = Qwen3TransformerLayerInfer 20 | 21 | def __init__(self, kvargs): 22 | super().__init__(kvargs) 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen3_moe/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen3_moe/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import final 3 | from lightllm.models.registry import ModelRegistry 4 | from lightllm.models.qwen3_moe.layer_infer.transformer_layer_infer import Qwen3MOETransformerLayerInfer 5 | from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight 6 | from lightllm.models.qwen3.model import Qwen3TpPartModel 7 | from lightllm.utils.log_utils import init_logger 8 | 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | @ModelRegistry("qwen3_moe") 14 | class Qwen3MOEModel(Qwen3TpPartModel): 15 | # weight class 16 | transformer_weight_class = Qwen3MOETransformerLayerWeight 17 | 18 | # infer class 19 | transformer_layer_infer_class = Qwen3MOETransformerLayerInfer 20 | 21 | def __init__(self, kvargs): 22 | super().__init__(kvargs) 23 | return 24 | -------------------------------------------------------------------------------- /lightllm/models/qwen_vl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/__init__.py -------------------------------------------------------------------------------- /lightllm/models/qwen_vl/layer_infer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/qwen_vl/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/stablelm/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/stablelm/layer_weights/transformer_layer_weight.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.qwen2.layer_weights.transformer_layer_weight import Qwen2TransformerLayerWeight 2 | from lightllm.common.basemodel.layer_weights.meta_weights import NormWeight 3 | 4 | 5 | class StablelmTransformerLayerWeight(Qwen2TransformerLayerWeight): 6 | def __init__(self, layer_num, data_type, network_config, mode=[], quant_cfg=None): 7 | super().__init__(layer_num, data_type, network_config, mode, quant_cfg) 8 | return 9 | 10 | def _init_weight_names(self): 11 | super()._init_weight_names() 12 | self._att_norm_bias_name = f"model.layers.{self.layer_num_}.input_layernorm.bias" 13 | self._ffn_norm_bias_name = f"model.layers.{self.layer_num_}.post_attention_layernorm.bias" 14 | -------------------------------------------------------------------------------- /lightllm/models/stablelm/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from lightllm.models.registry import ModelRegistry 5 | from lightllm.models.stablelm.layer_infer.transformer_layer_infer import StablelmTransformerLayerInfer 6 | from lightllm.models.bloom.layer_infer.post_layer_infer import BloomPostLayerInfer 7 | from lightllm.models.stablelm.layer_weights.pre_and_post_layer_weight import StableLMPreAndPostLayerWeight 8 | from lightllm.models.stablelm.layer_weights.transformer_layer_weight import StablelmTransformerLayerWeight 9 | from lightllm.models.llama.model import LlamaTpPartModel 10 | from lightllm.common.build_utils import repair_config 11 | 12 | 13 | @ModelRegistry("stablelm") 14 | class StablelmTpPartModel(LlamaTpPartModel): 15 | # weight class 16 | pre_and_post_weight_class = StableLMPreAndPostLayerWeight 17 | transformer_weight_class = StablelmTransformerLayerWeight 18 | 19 | # infer class 20 | transformer_layer_infer_class = StablelmTransformerLayerInfer 21 | post_layer_infer_class = BloomPostLayerInfer 22 | 23 | def __init__(self, kvargs): 24 | super().__init__(kvargs) 25 | 26 | def _init_config(self): 27 | super()._init_config() 28 | repair_config(self.config, same_names=["rms_norm_eps", "layer_norm_eps", 
"layer_norm_epsilon"]) 29 | return 30 | -------------------------------------------------------------------------------- /lightllm/models/starcoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_infer/transformer_layer_infer.py: -------------------------------------------------------------------------------- 1 | from lightllm.models.bloom.layer_infer.transformer_layer_infer import BloomTransformerLayerInfer 2 | from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer 3 | 4 | 5 | class StarcoderTransformerLayerInfer(BloomTransformerLayerInfer): 6 | """ """ 7 | 8 | def __init__(self, layer_num, network_config, mode=[]): 9 | super().__init__(layer_num, network_config, mode) 10 | self.tp_k_head_num_ = 1 11 | self.tp_v_head_num_ = 1 12 | self._bind_func() 13 | return 14 | 15 | def _bind_func(self): 16 | LlamaTransformerLayerInfer._bind_attention(self) 17 | return 18 | -------------------------------------------------------------------------------- /lightllm/models/starcoder/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/starcoder2/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/starcoder2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/tarsier2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/__init__.py -------------------------------------------------------------------------------- /lightllm/models/tarsier2/layer_weights/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/tarsier2/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/layer_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/layer_weights/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/layer_weights/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/triton_kernel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/vit/triton_kernel/__init__.py -------------------------------------------------------------------------------- /lightllm/models/vit/triton_kernel/gelu_vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import triton 3 | import triton.language as tl 4 | from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager 5 | 6 | 7 | @triton.jit 8 | def gelu(x): 9 | x_fp32 = x.to(tl.float32) 10 | x_gelu = 0.5 * x_fp32 * (1 + tl.math.erf(x_fp32 * 0.7071067811)) 11 | return x_gelu 12 | 13 | 14 | @triton.jit 15 | def gelu_kernel(output_ptr, input_ptr, n_elements, BLOCK_SIZE: tl.constexpr): 16 | pid = tl.program_id(axis=0) 17 | block_start = pid * BLOCK_SIZE 18 | offsets = block_start + tl.arange(0, BLOCK_SIZE) 19 | mask = offsets < n_elements 20 | input = tl.load(input_ptr + offsets, mask=mask) 21 | output = gelu(input) 22 | tl.store(output_ptr + offsets, output, mask=mask) 23 | 24 | 25 | def gelu_fwd(input, use_custom_tensor_mananger=False): 26 | if use_custom_tensor_mananger: 27 | shape = input.shape 28 | dtype = input.dtype 29 | device = input.device 30 | output = g_cache_manager.alloc_tensor(shape, dtype, device=device) 31 | else: 32 | output = torch.empty_like(input) 33 | assert input.is_contiguous(), "Input tensor must be contiguous" 34 | n_elements = input.numel() 35 | grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) 36 | gelu_kernel[grid](output, input, n_elements, BLOCK_SIZE=1024) 37 | return output 38 | -------------------------------------------------------------------------------- /lightllm/models/whisper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/models/whisper/__init__.py -------------------------------------------------------------------------------- /lightllm/models/whisper/defaults.py: -------------------------------------------------------------------------------- 1 | MIN_AUDIO_LEN = 480 # minimum audio length 2 | -------------------------------------------------------------------------------- /lightllm/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .router.token_load import TokenLoad 2 |
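The gelu helper in gelu_vit.py above implements the exact (erf-based) GELU, 0.5 * x * (1 + erf(x / sqrt(2))); the constant 0.7071067811 is 1/sqrt(2). A quick sanity check against PyTorch's reference GELU, included only as an illustration and assuming a recent torch build:

import math
import torch

x = torch.randn(8, dtype=torch.float32)
reference = 0.5 * x * (1 + torch.erf(x / math.sqrt(2)))
assert torch.allclose(reference, torch.nn.functional.gelu(x), atol=1e-6)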
-------------------------------------------------------------------------------- /lightllm/server/api_server.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .api_cli import make_argument_parser 3 | 4 | if __name__ == "__main__": 5 | torch.multiprocessing.set_start_method("spawn") # spawn is required here; forking subprocesses does not work with these settings 6 | parser = make_argument_parser() 7 | args = parser.parse_args() 8 | from .api_start import pd_master_start, normal_or_p_d_start, config_server_start 9 | 10 | if args.run_mode == "pd_master": 11 | pd_master_start(args) 12 | elif args.run_mode == "config_server": 13 | config_server_start(args) 14 | else: 15 | normal_or_p_d_start(args) 16 | -------------------------------------------------------------------------------- /lightllm/server/audioserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/audioserver/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/audioserver/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/server/config_server/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a configuration service designed to facilitate the 3 | registration and retrieval of information in a PD separation mode. It 4 | allows various nodes to register their own information and query global 5 | configuration details efficiently. 6 | 7 | Key Features: 8 | - Node registration: Enables nodes to register their specific information. 9 | - Global configuration query: Provides mechanisms for querying shared 10 | configuration data across the system. 11 | - Designed for distributed systems operating in PD separation mode.
12 | """ 13 | -------------------------------------------------------------------------------- /lightllm/server/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/core/__init__.py -------------------------------------------------------------------------------- /lightllm/server/core/objs/__init__.py: -------------------------------------------------------------------------------- 1 | from .sampling_params import SamplingParams 2 | from .req import Req, FinishStatus 3 | from .shm_req_manager import ShmReqManager 4 | from .rpc_shm import RpcShmParams, RpcShmResults, ShmSyncStatusArray 5 | -------------------------------------------------------------------------------- /lightllm/server/core/objs/io_objs/__init__.py: -------------------------------------------------------------------------------- 1 | from .group_req import GroupReqIndexes, GroupReqObjs 2 | -------------------------------------------------------------------------------- /lightllm/server/core/objs/io_objs/group_req.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from lightllm.server.multimodal_params import MultimodalParams 3 | from typing import List 4 | from ..req import Req 5 | 6 | 7 | @dataclass 8 | class GroupReqIndexes: 9 | group_req_id: int 10 | multimodal_params: MultimodalParams 11 | shm_req_indexes: List[int] 12 | time_mark: float 13 | 14 | 15 | @dataclass 16 | class GroupReqObjs: 17 | group_req_id: int 18 | multimodal_params: MultimodalParams 19 | shm_req_objs: List[Req] 20 | time_mark: float 21 | 22 | def to_group_req_index(self): 23 | return GroupReqIndexes( 24 | group_req_id=self.group_req_id, 25 | multimodal_params=self.multimodal_params, 26 | shm_req_indexes=[req.index_in_shm_mem for req in self.shm_req_objs], 27 | time_mark=self.time_mark, 28 | ) 29 | -------------------------------------------------------------------------------- /lightllm/server/detokenization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/detokenization/__init__.py -------------------------------------------------------------------------------- /lightllm/server/detokenization/decode_mode_fix.py: -------------------------------------------------------------------------------- 1 | """ 2 | p d 分离模式下, 对于到达的请求,需要将输入的prompt_ids 中的最后一个id,提前处理,然后移入到outputs中 3 | 这是 p d 分离模式下,decode 节点的特殊处理点。 4 | """ 5 | from .decode_req import DecodeReq 6 | from .decode import decode_token 7 | 8 | from lightllm.utils.log_utils import init_logger 9 | 10 | logger = init_logger(__name__) 11 | 12 | 13 | def decode_mode_fix(req_out: DecodeReq, tokenizer, eos_id): 14 | new_token_id = req_out.prompt_ids[-1] 15 | decode_token(tokenizer, req_out, new_token_id, eos_id) 16 | return req_out 17 | -------------------------------------------------------------------------------- /lightllm/server/embed_cache/__init__.py: -------------------------------------------------------------------------------- 1 | from . import impl -------------------------------------------------------------------------------- /lightllm/server/embed_cache/impl/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import naive_memory_cache -------------------------------------------------------------------------------- /lightllm/server/embed_cache/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from io import BytesIO 4 | import multiprocessing.shared_memory as shm 5 | 6 | 7 | def tensor2bytes(t: torch.Tensor): 8 | # t = t.cpu().numpy().tobytes() 9 | # return t 10 | buf = BytesIO() 11 | torch.save(t.detach().cpu(), buf) 12 | buf.seek(0) 13 | return buf.read() 14 | 15 | 16 | def bytes2tensor(b): 17 | # return torch.from_numpy(np.frombuffer(b, dtype=np.float16)).cuda() 18 | return torch.load(BytesIO(b)) 19 | 20 | 21 | def create_shm(name, data): 22 | try: 23 | data_size = len(data) 24 | shared_memory = shm.SharedMemory(name=name, create=True, size=data_size) 25 | mem_view = shared_memory.buf 26 | mem_view[:data_size] = data 27 | except FileExistsError: 28 | print("Warning create shm {} failed because of FileExistsError!".format(name)) 29 | 30 | 31 | def read_shm(name): 32 | shared_memory = shm.SharedMemory(name=name) 33 | data = shared_memory.buf.tobytes() 34 | return data 35 | 36 | 37 | def free_shm(name): 38 | shared_memory = shm.SharedMemory(name=name) 39 | shared_memory.close() 40 | shared_memory.unlink() 41 | 42 | 43 | def get_shm_name_data(uid): 44 | return str(uid) + "-data" 45 | 46 | 47 | def get_shm_name_embed(uid): 48 | return str(uid) + "-embed" 49 | -------------------------------------------------------------------------------- /lightllm/server/health_monitor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/health_monitor/__init__.py -------------------------------------------------------------------------------- /lightllm/server/httpserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/httpserver/async_queue.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | class AsyncQueue: 5 | def __init__(self): 6 | self.datas = [] 7 | self.event = asyncio.Event() 8 | self.lock = asyncio.Lock() 9 | 10 | async def wait_to_ready(self): 11 | try: 12 | await asyncio.wait_for(self.event.wait(), timeout=3) 13 | except asyncio.TimeoutError: 14 | pass 15 | 16 | async def get_all_data(self): 17 | async with self.lock: 18 | self.event.clear() 19 | ans = self.datas 20 | self.datas = [] 21 | return ans 22 | 23 | async def put(self, obj): 24 | async with self.lock: 25 | self.datas.append(obj) 26 | self.event.set() 27 | return 28 | 29 | async def wait_to_get_all_data(self): 30 | await self.wait_to_ready() 31 | handle_list = await self.get_all_data() 32 | return handle_list 33 | -------------------------------------------------------------------------------- /lightllm/server/httpserver_for_pd_master/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/httpserver_for_pd_master/__init__.py -------------------------------------------------------------------------------- 
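The shared-memory helpers in lightllm/server/embed_cache/utils.py above move serialized tensors between processes by name. Below is a minimal round-trip sketch, assuming the package is importable as laid out here; the uid and tensor shape are made-up example values, not part of lightllm.

```python
# Round-trip sketch for the shared-memory helpers shown above
# (lightllm/server/embed_cache/utils.py). The uid and tensor are illustrative only.
import torch
from lightllm.server.embed_cache.utils import (
    tensor2bytes,
    bytes2tensor,
    create_shm,
    read_shm,
    free_shm,
    get_shm_name_data,
)

uid = 12345  # hypothetical multimodal item id
embed = torch.randn(16, 4096, dtype=torch.float16)

# Producer side: serialize the tensor and publish it under a deterministic shm name.
create_shm(get_shm_name_data(uid), tensor2bytes(embed))

# Consumer side (possibly another process): read the bytes back and rebuild the tensor.
restored = bytes2tensor(read_shm(get_shm_name_data(uid)))
assert torch.equal(embed, restored)

# Release the shared-memory segment once every consumer is done with it.
free_shm(get_shm_name_data(uid))
```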
/lightllm/server/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .manager import start_metric_manager 2 | -------------------------------------------------------------------------------- /lightllm/server/router/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/dynamic_prompt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/dynamic_prompt/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/__init__.py: -------------------------------------------------------------------------------- 1 | from .continues_batch.impl import ContinuesBatchBackend 2 | from .continues_batch.impl_for_return_all_prompt_logprobs import ReturnPromptLogProbBackend 3 | from .continues_batch.impl_for_reward_model import RewardModelBackend 4 | from .chunked_prefill.impl import ChunkedPrefillBackend 5 | from .diverse_backend.impl import DiversehBackend 6 | from .chunked_prefill.impl_for_token_healing import TokenHealingBackend 7 | from .chunked_prefill.impl_for_outlines_constraint_mode import OutlinesConstraintBackend 8 | from .chunked_prefill.impl_for_first_token_constraint_mode import FirstTokenConstraintBackend 9 | from .dp_backend.impl import DPChunkedPrefillBackend 10 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl import ChunckedPrefillForPrefillNode 11 | from .continues_batch.pd_mode.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode 12 | from .chunked_prefill.impl_for_xgrammar_mode import XgrammarBackend 13 | from .continues_batch.pd_mode.prefill_node_impl.prefill_impl_for_dp_chuncked import DPChunkedForPrefillNode 14 | from .continues_batch.pd_mode.decode_node_impl.decode_impl_for_dp import DPForDecodeNode 15 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/chunked_prefill/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/__init__.py -------------------------------------------------------------------------------- 
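The AsyncQueue in lightllm/server/httpserver/async_queue.py above is a batching queue for coroutines: producers call put, and a consumer drains everything accumulated since its last wake-up with wait_to_get_all_data, which returns an empty list if nothing arrives within the 3-second wait. A small self-contained usage sketch follows; the token strings are made up.

```python
# Usage sketch for the AsyncQueue defined above; the token strings are placeholders.
import asyncio
from lightllm.server.httpserver.async_queue import AsyncQueue


async def producer(q: AsyncQueue):
    for i in range(5):
        await q.put(f"token-{i}")
        await asyncio.sleep(0.1)


async def consumer(q: AsyncQueue):
    collected = []
    while len(collected) < 5:
        # Returns everything queued so far, or an empty list after the 3 s wait times out.
        batch = await q.wait_to_get_all_data()
        collected.extend(batch)
    print(collected)


async def main():
    q = AsyncQueue()
    await asyncio.gather(producer(q), consumer(q))


asyncio.run(main())
```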
/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/__init__.py: -------------------------------------------------------------------------------- 1 | from .decode_kv_move_manager import start_decode_kv_move_manager_process 2 | from .decode_trans_process import start_decode_trans_process 3 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_task_cache.py: -------------------------------------------------------------------------------- 1 | # This module declares global variables used by the inference process to cache the data of KV move tasks sent to other processes, 2 | # in order to reduce serialization overhead on some calls: such calls then only need to pass a request id instead of the full 3 | # payload, which speeds up rpyc calls. Only used in decode_impl.py and decode_infer_rpyc.py. 4 | from typing import Dict, List, Tuple 5 | from lightllm.server.pd_io_struct import KVMoveTask 6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode 7 | 8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, List[int]]] = {} 9 | 10 | g_success_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode, float]] = {} # the third float is a timestamp used to check for expiration. 11 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/__init__.py: -------------------------------------------------------------------------------- 1 | from .prefill_trans_process import start_prefill_trans_process 2 | from .prefill_kv_move_manager import start_prefill_kv_move_manager_process 3 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_task_cache.py: -------------------------------------------------------------------------------- 1 | # This module declares a global variable used by the inference process to cache the data of KV move tasks sent to other processes, 2 | # in order to reduce serialization overhead on some calls: such calls then only need to pass a request id instead of the full 3 | # payload, which speeds up rpyc calls. Only used in prefill_impl.py and prefill_infer_rpyc.py. 4 | from typing import Dict, Tuple 5 | from lightllm.server.pd_io_struct import KVMoveTask 6 | from lightllm.server.router.dynamic_prompt.radix_cache import TreeNode 7 | 8 | g_kv_move_task_cache: Dict[int, Tuple[KVMoveTask, TreeNode]] = {} 9 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/utils.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import torch.multiprocessing as mp 3 | from queue import Empty 4 | 5 | 6 | def join_if_alive(thread: threading.Thread): 7 | if thread is not None and thread.is_alive(): 8 | try: 9 | thread.join() 10 | except Exception: 11 | pass 12 | return 13 | 14 | 15 | def clear_queue(queue: mp.Queue): 16 | while not queue.empty(): 17 | try: 18 | queue.get_nowait() 19 | except Empty: 20 | break 21 | -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/diverse_backend/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/model_infer/mode_backend/dp_backend/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/req_queue/chunked_prefill/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/chunked_prefill/__init__.py -------------------------------------------------------------------------------- /lightllm/server/router/req_queue/continues_batch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/router/req_queue/continues_batch/__init__.py -------------------------------------------------------------------------------- /lightllm/server/visualserver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/__init__.py -------------------------------------------------------------------------------- /lightllm/server/visualserver/model_infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/server/visualserver/model_infer/__init__.py -------------------------------------------------------------------------------- /lightllm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/lightllm/utils/__init__.py -------------------------------------------------------------------------------- /lightllm/utils/error_utils.py: -------------------------------------------------------------------------------- 1 | class ServerBusyError(Exception): 2 | """Custom exception for server busy/overload situations""" 3 | 4 | def __init__(self, message="Server is busy, please try again later", status_code=503): 5 | """ 6 | Initialize the ServerBusyError 7 | 8 | Args: 9 | message (str): Error message to display 10 | status_code (int): HTTP status code (default 503 Service Unavailable) 11 | """ 12 | super().__init__(message) 13 | self.message = message 14 | self.status_code = status_code # HTTP 503 Service Unavailable 15 | 16 | def __str__(self): 17 | """String representation of the error""" 18 | return f"{self.message} (Status code: {self.status_code})" 19 | -------------------------------------------------------------------------------- /lightllm/utils/graceful_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = 
init_logger(__name__) 5 | 6 | 7 | def graceful_registry(sub_module_name): 8 | import signal 9 | 10 | # Child processes must not exit early on their own when they receive SIGTERM. 11 | def graceful_shutdown(signum, frame): 12 | logger.info(f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown...") 13 | if signum == signal.SIGTERM: 14 | # Do not exit here; the parent process decides when to exit. 15 | logger.info(f"{sub_module_name} received SIGTERM") 16 | 17 | signal.signal(signal.SIGTERM, graceful_shutdown) 18 | return 19 | -------------------------------------------------------------------------------- /lightllm/utils/light_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.utils.log_utils import init_logger 2 | 3 | logger = init_logger(__name__) 4 | try: 5 | # TODO: lightllm_kernel release 6 | import lightllm_kernel 7 | 8 | light_ops = getattr(lightllm_kernel, "ops", lightllm_kernel) 9 | HAS_LIGHTLLM_KERNEL = True 10 | except: 11 | light_ops = None 12 | HAS_LIGHTLLM_KERNEL = False 13 | logger.warning("lightllm_kernel is not installed, you can't use the api of it.") 14 | -------------------------------------------------------------------------------- /lightllm/utils/retry_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import functools 3 | from lightllm.utils.log_utils import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | def retry(max_attempts=3, wait_time=1): 9 | """ 10 | The decorated function must raise an exception itself when a call fails. (A usage sketch appears later in this document, after test_add_in_place.py.) 11 | :param max_attempts: maximum number of attempts 12 | :param wait_time: wait time between retries, in seconds 13 | """ 14 | 15 | def decorator(func): 16 | @functools.wraps(func) 17 | def wrapper(*args, **kwargs): 18 | attempts = 0 19 | while attempts < max_attempts: 20 | try: 21 | return func(*args, **kwargs) 22 | except Exception as e: 23 | attempts += 1 24 | logger.info(f"try {func.__name__} {attempts}/{max_attempts} failed: {str(e)}") 25 | if attempts < max_attempts: 26 | time.sleep(wait_time) 27 | raise Exception(f"{func.__name__} failed after {max_attempts} attempts") 28 | 29 | return wrapper 30 | 31 | return decorator 32 | -------------------------------------------------------------------------------- /lightllm/utils/sgl_utils.py: -------------------------------------------------------------------------------- 1 | from lightllm.utils.log_utils import init_logger 2 | 3 | logger = init_logger(__name__) 4 | try: 5 | import sgl_kernel 6 | 7 | sgl_ops = sgl_kernel 8 | sgl_allreduce_ops = sgl_ops.allreduce 9 | HAS_SGL_KERNEL = True 10 | except: 11 | sgl_ops = None 12 | sgl_allreduce_ops = None 13 | HAS_SGL_KERNEL = False 14 | logger.warning( 15 | "sgl_kernel is not installed, you can't use the api of it. \ 16 | You can solve it by running `pip install sgl_kernel`." 17 | ) 18 | 19 | try: 20 | from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache 21 | 22 | flash_attn_varlen_func = flash_attn_varlen_func 23 | flash_attn_with_kvcache = flash_attn_with_kvcache 24 | merge_state_v2 = sgl_ops.merge_state_v2 25 | except: 26 | flash_attn_varlen_func = None 27 | flash_attn_with_kvcache = None 28 | merge_state_v2 = None 29 | logger.warning( 30 | "sgl_kernel is not installed, or the installed version did not support fa3. \ 31 | Try to upgrade it."
32 | ) 33 | -------------------------------------------------------------------------------- /lightllm/utils/statics_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = init_logger(__name__) 5 | 6 | 7 | class MovingAverage: 8 | def __init__(self): 9 | self.total = 0.0 10 | self.count = 0 11 | self.last_time = time.time() 12 | 13 | def add(self, value): 14 | self.total += value 15 | self.count += 1 16 | 17 | def average(self): 18 | if self.count == 0: 19 | return 0.0 20 | return self.total / self.count 21 | 22 | def print_log(self, log_str): 23 | if time.time() - self.last_time >= 30: 24 | logger.info(f"{log_str}: {self.average()} ms") 25 | self.last_time = time.time() 26 | -------------------------------------------------------------------------------- /lightllm/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class TimeChecker: 5 | def __init__(self, threshold): 6 | self.threshold = threshold 7 | self.last_checked = time.time() 8 | 9 | def has_exceeded(self): 10 | current_time = time.time() 11 | if (current_time - self.last_checked) > self.threshold: 12 | self._reset() 13 | return True 14 | return False 15 | 16 | def _reset(self): 17 | self.last_checked = time.time() 18 | -------------------------------------------------------------------------------- /lightllm/utils/vllm_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lightllm.utils.log_utils import init_logger 3 | 4 | logger = init_logger(__name__) 5 | try: 6 | if not torch.cuda.is_initialized(): 7 | torch.cuda.init() 8 | from vllm import _custom_ops as ops 9 | 10 | vllm_ops = ops 11 | HAS_VLLM = True 12 | cutlass_scaled_mm = torch.ops._C.cutlass_scaled_mm 13 | 14 | except: 15 | HAS_VLLM = False 16 | cutlass_scaled_mm = None 17 | vllm_ops = None 18 | logger.warning( 19 | "vllm is not installed, you can't use the api of it. \ 20 | You can solve it by running `pip install vllm`." 21 | ) 22 | -------------------------------------------------------------------------------- /lightllm/utils/watchdog_utils.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | from lightllm.utils.log_utils import init_logger 4 | 5 | logger = init_logger(__name__) 6 | 7 | 8 | class Watchdog: 9 | def __init__(self, timeout): 10 | self.timeout = timeout 11 | self.last_heartbeat = time.time() 12 | self.running = True 13 | 14 | def start(self): 15 | self.thread = threading.Thread(target=self.run, daemon=True) 16 | self.thread.start() 17 | 18 | def run(self): 19 | while self.running: 20 | time.sleep(2) 21 | if time.time() - self.last_heartbeat > self.timeout: 22 | logger.error("Watchdog: Timeout! 
Task is not responding.") 23 | self.handle_timeout() 24 | 25 | def handle_timeout(self): 26 | logger.error("Watchdog: time out to exit") 27 | import sys 28 | 29 | sys.exit(-1) 30 | 31 | def stop(self): 32 | self.running = False 33 | self.thread.join() 34 | 35 | def heartbeat(self): 36 | self.last_heartbeat = time.time() 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | package_data = {"lightllm": ["common/all_kernel_configs/*/*.json"]} 4 | setup( 5 | name="lightllm", 6 | version="1.0.1", 7 | packages=find_packages(exclude=("build", "include", "test", "dist", "docs", "benchmarks", "lightllm.egg-info")), 8 | author="model toolchain", 9 | author_email="", 10 | description="lightllm for inference LLM", 11 | long_description="", 12 | long_description_content_type="text/markdown", 13 | url="", 14 | classifiers=[ 15 | "Programming Language :: Python :: 3", 16 | "Operating System :: Linux", 17 | ], 18 | python_requires=">=3.9.16", 19 | install_requires=[ 20 | "pyzmq", 21 | "uvloop", 22 | "transformers", 23 | "einops", 24 | "packaging", 25 | "rpyc", 26 | "ninja", 27 | "safetensors", 28 | "triton", 29 | ], 30 | package_data=package_data, 31 | ) 32 | -------------------------------------------------------------------------------- /test/model/test_settings/process_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import re 3 | 4 | 5 | def kill_gpu_processes(): 6 | try: 7 | output = subprocess.check_output(["nvidia-smi", "-q", "-x"]) 8 | output = output.decode("utf-8") 9 | 10 | # 使用正则表达式提取进程信息 11 | process_info = re.findall(r"(.*?)", output, re.DOTALL) 12 | 13 | if process_info: 14 | print("找到以下占用显卡的进程:") 15 | for info in process_info: 16 | pid = re.search(r"(.*?)", info).group(1) 17 | process_name = re.search(r"(.*?)", info).group(1) 18 | print("进程ID:", pid) 19 | print("进程名字:", process_name) 20 | 21 | for info in process_info: 22 | pid = re.search(r"(.*?)", info).group(1) 23 | subprocess.call(["sudo", "kill", "-9", pid]) 24 | print("进程ID", pid, "被终止") 25 | else: 26 | print("没有找到占用显卡的进程") 27 | 28 | except subprocess.CalledProcessError: 29 | print("无法执行nvidia-smi命令") 30 | 31 | 32 | if __name__ == "__main__": 33 | kill_gpu_processes() 34 | -------------------------------------------------------------------------------- /test/server/readme.md: -------------------------------------------------------------------------------- 1 | # prompt cache 测试: 2 | 3 | - benchmark_prompt_cache.py: 单次测试脚本。 4 | 5 | 例子: 6 | ```shell 7 | python benchmark_prompt_cache.py --address http://localhost:8090 --model_name llama --num_workers 1 --first_input_len 512 --subsequent_input_len 32 --output_len 32 --num_turns 5 --num_users 1 8 | ``` 9 | 10 | 使用方法详细说明: 11 | ```shell 12 | python benchmark_prompt_cache.py -h 13 | ``` 14 | 15 | - test_settings.py: 批量测试脚本,可测试多个配置并汇总为md 16 | -------------------------------------------------------------------------------- /test/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/lightllm/64d85cb40b79a4568b31d9f357e01f4ef2d6360d/test/test.jpg -------------------------------------------------------------------------------- /test/test_server.py: -------------------------------------------------------------------------------- 1 | import time 2 | import requests 3 | import 
json 4 | import threading 5 | 6 | 7 | class RequestThread(threading.Thread): 8 | def __init__(self, url, headers, data): 9 | threading.Thread.__init__(self) 10 | self.url = url 11 | self.headers = headers 12 | self.data = data 13 | 14 | def run(self): 15 | response = requests.post(self.url, headers=self.headers, data=json.dumps(self.data)) 16 | if response.status_code == 200: 17 | print(response.json()) 18 | else: 19 | print("Error:", response.status_code, response.text) 20 | 21 | 22 | url = "http://localhost:8000/generate" 23 | headers = {"Content-Type": "application/json"} 24 | 25 | for i in range(1): 26 | data = { 27 | "inputs": "San Francisco is a", 28 | # 'temperature': 0.1, 29 | "parameters": { 30 | "do_sample": False, 31 | }, 32 | } 33 | thread = RequestThread(url, headers, data) 34 | thread.start() 35 | 36 | time.sleep(2) 37 | 38 | for i in range(20): 39 | data = { 40 | "inputs": "San Francisco is a", 41 | "parameters": { 42 | "do_sample": False, 43 | "ignore_eos": True, 44 | "max_new_tokens": 200, 45 | }, 46 | } 47 | thread = RequestThread(url, headers, data) 48 | thread.start() 49 | -------------------------------------------------------------------------------- /tools/resolve_ptx_version: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script lets old Triton versions generate PTX code up to version 7.8 3 | # See https://github.com/openai/triton/blob/8650b4d1cbc750d659156e2c17a058736614827b/lib/driver/llvm.cc#L149 4 | set -e 5 | 6 | mkdir -p $HOME/.triton/ 7 | 8 | [ $HOME/.triton/resolve_ptx_version.so -nt $0 ] || (echo ' 9 | #include <stdexcept> 10 | namespace triton { 11 | namespace driver { 12 | 13 | int vptx(int version) { 14 | // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes 15 | if (version >= 11080) return 78; 16 | if (version >= 11070) return 77; 17 | if (version >= 11060) return 76; 18 | if (version >= 11050) return 75; 19 | if (version >= 11040) return 74; 20 | throw std::runtime_error("Triton requires CUDA 11.4+"); 21 | } 22 | 23 | } 24 | }' \ 25 | | g++ -x c++ -fPIC -shared -o $HOME/.triton/resolve_ptx_version.so -) 26 | 27 | [ -z "$*" ] || env LD_PRELOAD=$LD_PRELOAD:$HOME/.triton/resolve_ptx_version.so "$@" -------------------------------------------------------------------------------- /unit_tests/common/basemodel/triton_kernel/test_add_in_place.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy 5 | from lightllm.common.basemodel.triton_kernel.add_in_place import add_in_place 6 | from lightllm.utils.log_utils import init_logger 7 | 8 | logger = init_logger(__name__) 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "dim1, dim2, alpha", 13 | [ 14 | (dim1, dim2, alpha) 15 | for dim1 in range(1, 1024, 100) 16 | for dim2 in range(1, 1024, 100) 17 | for alpha in [0.1, 0.3, 0.5, 0.7, 0.1] 18 | ], 19 | ) 20 | def test_add_in_place(dim1, dim2, alpha): 21 | input = torch.rand((dim1, dim2), device="cuda") 22 | other = torch.rand((dim1, dim2), device="cuda") 23 | 24 | output = input + other * alpha 25 | add_in_place(input, other, alpha=alpha) 26 | rlt = torch.allclose(input, output, atol=1e-5, rtol=0) 27 | assert rlt 28 | 29 | 30 | if __name__ == "__main__": 31 | pytest.main() 32 | --------------------------------------------------------------------------------
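The retry decorator in lightllm/utils/retry_utils.py above expects the wrapped function to raise on failure, retrying it up to max_attempts times and sleeping wait_time seconds between attempts. The sketch below is the usage example referenced in that docstring; fetch_config is a made-up function, not part of lightllm.

```python
# Illustrative use of the retry decorator shown earlier; fetch_config is hypothetical.
from lightllm.utils.retry_utils import retry


@retry(max_attempts=3, wait_time=1)
def fetch_config():
    # A real implementation would query a remote config service and raise on failure.
    raise RuntimeError("config service not reachable yet")


try:
    fetch_config()
except Exception as e:
    # After max_attempts failures the decorator raises a final Exception.
    print(e)
```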
/unit_tests/common/basemodel/triton_kernel/test_gen_decode_params.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytest 3 | import numpy as np 4 | from lightllm.utils.log_utils import init_logger 5 | from lightllm.common.basemodel.triton_kernel.gen_decode_params import gen_decode_params 6 | 7 | 8 | def test_gen_decode_params_basic(): 9 | b_seq_len = torch.ones((9,), dtype=torch.int64, device="cuda") * 8192 10 | ( 11 | b_q_seq_len, 12 | b1_cu_q_seq_len, 13 | b_kv_seq_len, 14 | b1_cu_kv_seq_len, 15 | position_ids, 16 | max_q_seq_len, 17 | max_kv_seq_len, 18 | ) = gen_decode_params(b_seq_len) 19 | 20 | true_b_q_seq_len = torch.ones_like(b_seq_len) 21 | 22 | 23 | assert max_q_seq_len == 1 24 | assert max_kv_seq_len == b_seq_len.max().item() 25 | assert torch.equal(b_q_seq_len, true_b_q_seq_len) 26 | assert torch.equal(b1_cu_q_seq_len, torch.nn.functional.pad(torch.cumsum(true_b_q_seq_len, dim=0), (1, 0), value=0)) 27 | assert torch.equal(b_kv_seq_len, b_seq_len) 28 | assert torch.equal(b1_cu_kv_seq_len, torch.nn.functional.pad(torch.cumsum(b_seq_len, dim=0), (1, 0), value=0)) 29 | assert torch.equal(position_ids, b_seq_len - 1) 30 | 31 | 32 | if __name__ == "__main__": 33 | pytest.main() 34 | -------------------------------------------------------------------------------- /unit_tests/common/basemodel/triton_kernel/test_sp_pad_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.common.basemodel.triton_kernel.sp_pad_copy import sp_pad_copy 5 | from lightllm.utils.log_utils import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "token_num, hidden_dim, sp_world_size", 12 | [ 13 | (token_num, hidden_dim, sp_world_size) 14 | for token_num in range(3, 6) 15 | for hidden_dim in [257, 2048] 16 | for sp_world_size in range(2, 5) 17 | ], 18 | ) 19 | def test_sp_pad_copy(token_num, hidden_dim, sp_world_size): 20 | 21 | in_tensor = torch.randn((token_num, hidden_dim), dtype=torch.float16, device="cuda") 22 | out_tensors = [ 23 | sp_pad_copy(in_tensor=in_tensor, sp_rank_id=rank_id, sp_world_size=sp_world_size) 24 | for rank_id in range(sp_world_size) 25 | ] 26 | out_tensor = torch.cat(out_tensors, dim=0) 27 | assert torch.equal(in_tensor, out_tensor[0:token_num, :]) 28 | 29 | 30 | if __name__ == "__main__": 31 | pytest.main() 32 | -------------------------------------------------------------------------------- /unit_tests/models/deepseek2/test_rope_repeat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.models.deepseek2.triton_kernel.repeat_rope import repeat_rope 5 | 6 | 7 | def test_repeat_rope(): 8 | source = torch.randn((100, 1, 1077), device="cuda") 9 | dest = torch.randn((100, 7, 1077), device="cuda") 10 | 11 | repeat_rope(dest, source) 12 | assert torch.equal(dest[:, 0, :], source[:, 0, :]) 13 | assert torch.equal(dest[:, -1, :], source[:, 0, :]) 14 | 15 | source = torch.randn((100, 1, 128), device="cuda") 16 | dest = torch.randn((100, 64, 128), device="cuda") 17 | 18 | repeat_rope(dest, source) 19 | assert torch.equal(dest[:, 0, :], source[:, 0, :]) 20 | assert torch.equal(dest[:, -1, :], source[:, 0, :]) 21 | return 22 | 23 | 24 | if __name__ == "__main__": 25 | pytest.main() 26 |
-------------------------------------------------------------------------------- /unit_tests/utils/test_custom_kernel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import pytest 4 | from lightllm.utils.custom_kernel_utis import torch_cat_3 5 | 6 | 7 | def test_torch_cat(): 8 | a = torch.tensor([[[1, 2], [3, 4]]], device="cuda") 9 | b = torch.tensor([[[5, 6], [7, 8]]], device="cuda") 10 | c = torch_cat_3([a, b], dim=0) 11 | assert torch.equal(torch.cat((a, b), dim=0), c) 12 | 13 | d = torch_cat_3([a, b], dim=1) 14 | assert torch.equal(torch.cat((a, b), dim=1), d) 15 | 16 | e = torch_cat_3([a, b], dim=-1) 17 | assert torch.equal(torch.cat((a, b), dim=-1), e) 18 | 19 | empty = torch.empty((0, 2), device="cuda") 20 | torch_cat_3([a, empty, b], dim=0) 21 | return 22 | 23 | 24 | if __name__ == "__main__": 25 | pytest.main() 26 | --------------------------------------------------------------------------------
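Finally, the Watchdog in lightllm/utils/watchdog_utils.py above force-exits the process when no heartbeat arrives within the configured timeout. Below is a minimal sketch of the intended heartbeat pattern; the work loop is a placeholder, not lightllm code.

```python
# Heartbeat pattern for the Watchdog shown earlier; the loop body is a stand-in for real work.
import time
from lightllm.utils.watchdog_utils import Watchdog

dog = Watchdog(timeout=10)  # exit the process if no heartbeat arrives for 10 seconds
dog.start()

for _ in range(5):
    time.sleep(1)  # stand-in for one unit of real work
    dog.heartbeat()  # tell the watchdog the task is still making progress

dog.stop()
```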