├── .clang-format
├── .github
    ├── CONTRIBUTING.md
    ├── ISSUE_TEMPLATE
    │   ├── 1-bug-report.yml
    │   ├── 2-feature-request.yml
    │   └── 3-documentation.yml
    ├── md-link-config.json
    ├── pull_request_template.md
    ├── release.yml
    ├── scripts
    │   ├── action_tools.py
    │   ├── check_lmdeploy.py
    │   ├── doc_link_checker.py
    │   ├── eval_base_config.py
    │   ├── eval_chat_config.py
    │   ├── eval_regression_base_models.py
    │   ├── eval_regression_chat_models.py
    │   ├── eval_stable_object_config.py
    │   └── eval_stable_subject_config.py
    └── workflows
    │   ├── benchmark.yml
    │   ├── cuda11.8-whl-release.yml
    │   ├── daily_ete_test.yml
    │   ├── daily_ete_test_3090.yml
    │   ├── docker.yml
    │   ├── evaluate.yml
    │   ├── evaluate_remote.yml
    │   ├── lint.yml
    │   ├── linux-x64-gpu.yml
    │   ├── pr_ete_test.yml
    │   ├── pr_full_test.yml
    │   ├── pypi.yml
    │   ├── stable.yml
    │   ├── stale.yml
    │   ├── unit-test.yml
    │   └── windows-x64-gpu.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_ja.md
├── README_zh-CN.md
├── autotest
    ├── benchmark
    │   ├── test_apiserver_performance.py
    │   ├── test_generation_performance.py
    │   └── test_throughput_performance.py
    ├── chat_prompt_case.yaml
    ├── config-3090.yaml
    ├── config-pr.yaml
    ├── config.yaml
    ├── conftest.py
    ├── interface
    │   ├── pipeline
    │   │   ├── test_pipeline_func.py
    │   │   └── test_pipeline_longtext_func.py
    │   └── restful
    │   │   ├── test_restful_chat_func.py
    │   │   └── test_restful_completions_v1.py
    ├── prompt_case.yaml
    ├── pytest.ini
    ├── template.json
    ├── toolchain
    │   └── test_lagent.py
    ├── tools
    │   ├── chat
    │   │   ├── test_command_chat_hf_pytorch.py
    │   │   └── test_command_chat_hf_turbomind.py
    │   ├── pipeline
    │   │   ├── llm_case.py
    │   │   ├── mllm_case.py
    │   │   ├── test_pipeline_chat_pytorch_llm.py
    │   │   ├── test_pipeline_chat_pytorch_mllm.py
    │   │   ├── test_pipeline_chat_turbomind_llm.py
    │   │   └── test_pipeline_chat_turbomind_mllm.py
    │   ├── quantization
    │   │   ├── test_quantization_awq.py
    │   │   └── test_quantization_w8a8.py
    │   └── restful
    │   │   ├── test_restful_chat_hf_pytorch_llm.py
    │   │   ├── test_restful_chat_hf_pytorch_mllm.py
    │   │   ├── test_restful_chat_hf_turbomind_llm.py
    │   │   └── test_restful_chat_hf_turbomind_mllm.py
    └── utils
    │   ├── benchmark_utils.py
    │   ├── config_utils.py
    │   ├── get_run_config.py
    │   ├── mp_log_utils.py
    │   ├── pipeline_chat.py
    │   ├── quantization_utils.py
    │   ├── restful_return_check.py
    │   ├── rule_condition_assert.py
    │   ├── run_client_chat.py
    │   └── run_restful_chat.py
├── benchmark
    ├── README.md
    ├── benchmark_decode.py
    ├── benchmark_pipeline.py
    ├── benchmark_serving.py
    ├── benchmark_throughput.py
    ├── lmdeploy.yml
    ├── profile_generation.py
    ├── profile_pipeline_api.py
    ├── profile_restful_api.py
    └── profile_throughput.py
├── builder
    ├── manywheel
    │   ├── Dockerfile_2014
    │   ├── README.md
    │   ├── build_all_docker.sh
    │   ├── build_all_wheel.sh
    │   ├── build_docker.sh
    │   ├── build_wheel.sh
    │   ├── entrypoint_build.sh
    │   └── scripts
    │   │   ├── install_conda.sh
    │   │   ├── install_cuda.sh
    │   │   └── install_openmpi.sh
    └── windows
    │   ├── README.md
    │   ├── generate.ps1
    │   └── setup_cuda.ps1
├── cmake
    ├── Modules
    │   ├── FindCUDNN.cmake
    │   └── FindNCCL.cmake
    ├── TritonTurboMindBackendConfig.cmake.in
    └── TurboMindConfig.cmake.in
├── debug.sh
├── docker
    ├── Dockerfile
    ├── Dockerfile_Hopper
    ├── Dockerfile_aarch64_ascend
    ├── InternVL_Dockerfile
    └── Qwen2VL_Dockerfile
├── docs
    ├── en
    │   ├── .readthedocs.yaml
    │   ├── Makefile
    │   ├── _static
    │   │   ├── css
    │   │   │   └── readthedocs.css
    │   │   └── image
    │   │   │   └── lmdeploy-logo.svg
    │   ├── advance
    │   │   ├── chat_template.md
    │   │   ├── debug_turbomind.md
    │   │   ├── long_context.md
    │   │   ├── pytorch_multinodes.md
    │   │   ├── pytorch_multithread.md
    │   │   ├── pytorch_new_model.md
    │   │   ├── pytorch_profiling.md
    │   │   └── structed_output.md
    │   ├── api
    │   │   └── pipeline.rst
    │   ├── benchmark
    │   │   ├── a100_fp16.md
    │   │   ├── benchmark.md
    │   │   └── evaluate_with_opencompass.md
    │   ├── conf.py
    │   ├── faq.md
    │   ├── get_started
    │   │   ├── ascend
    │   │   │   └── get_started.md
    │   │   ├── get_started.md
    │   │   ├── index.rst
    │   │   └── installation.md
    │   ├── index.rst
    │   ├── inference
    │   │   ├── load_hf.md
    │   │   ├── pytorch.md
    │   │   ├── turbomind.md
    │   │   └── turbomind_config.md
    │   ├── llm
    │   │   ├── api_server.md
    │   │   ├── api_server_lora.md
    │   │   ├── api_server_reasoning.md
    │   │   ├── api_server_tools.md
    │   │   ├── codellama.md
    │   │   ├── gradio.md
    │   │   ├── pipeline.md
    │   │   └── proxy_server.md
    │   ├── make.bat
    │   ├── multi_modal
    │   │   ├── api_server_vl.md
    │   │   ├── cogvlm.md
    │   │   ├── deepseek_vl2.md
    │   │   ├── gemma3.md
    │   │   ├── index.rst
    │   │   ├── internvl.md
    │   │   ├── llava.md
    │   │   ├── minicpmv.md
    │   │   ├── mllama.md
    │   │   ├── molmo.md
    │   │   ├── phi3.md
    │   │   ├── qwen2_5_vl.md
    │   │   ├── qwen2_vl.md
    │   │   ├── vl_pipeline.md
    │   │   └── xcomposer2d5.md
    │   ├── quantization
    │   │   ├── kv_quant.md
    │   │   ├── w4a16.md
    │   │   └── w8a8.md
    │   └── supported_models
    │   │   ├── reward_models.md
    │   │   └── supported_models.md
    └── zh_cn
    │   ├── .readthedocs.yaml
    │   ├── Makefile
    │   ├── _static
    │       ├── css
    │       │   └── readthedocs.css
    │       └── image
    │       │   └── lmdeploy-logo.svg
    │   ├── advance
    │       ├── chat_template.md
    │       ├── debug_turbomind.md
    │       ├── long_context.md
    │       ├── pytorch_multinodes.md
    │       ├── pytorch_multithread.md
    │       ├── pytorch_new_model.md
    │       ├── pytorch_profiling.md
    │       └── structed_output.md
    │   ├── api
    │       └── pipeline.rst
    │   ├── benchmark
    │       ├── benchmark.md
    │       └── evaluate_with_opencompass.md
    │   ├── conf.py
    │   ├── faq.md
    │   ├── get_started
    │       ├── ascend
    │       │   └── get_started.md
    │       ├── get_started.md
    │       ├── index.rst
    │       └── installation.md
    │   ├── index.rst
    │   ├── inference
    │       ├── load_hf.md
    │       ├── pytorch.md
    │       ├── turbomind.md
    │       └── turbomind_config.md
    │   ├── llm
    │       ├── api_server.md
    │       ├── api_server_lora.md
    │       ├── api_server_reasoning.md
    │       ├── api_server_tools.md
    │       ├── codellama.md
    │       ├── gradio.md
    │       ├── pipeline.md
    │       └── proxy_server.md
    │   ├── make.bat
    │   ├── multi_modal
    │       ├── api_server_vl.md
    │       ├── cogvlm.md
    │       ├── deepseek_vl2.md
    │       ├── gemma3.md
    │       ├── index.rst
    │       ├── internvl.md
    │       ├── llava.md
    │       ├── minicpmv.md
    │       ├── mllama.md
    │       ├── molmo.md
    │       ├── phi3.md
    │       ├── qwen2_5_vl.md
    │       ├── qwen2_vl.md
    │       ├── vl_pipeline.md
    │       └── xcomposer2d5.md
    │   ├── quantization
    │       ├── kv_quant.md
    │       ├── w4a16.md
    │       └── w8a8.md
    │   └── supported_models
    │       ├── reward_models.md
    │       └── supported_models.md
├── generate.sh
├── k8s
    ├── deployment.yaml
    └── service.yaml
├── lmdeploy
    ├── __init__.py
    ├── __main__.py
    ├── api.py
    ├── archs.py
    ├── cli
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── entrypoint.py
    │   ├── lite.py
    │   ├── serve.py
    │   └── utils.py
    ├── lite
    │   ├── __init__.py
    │   ├── apis
    │   │   ├── __init__.py
    │   │   ├── auto_awq.py
    │   │   ├── calibrate.py
    │   │   ├── get_small_sharded_hf.py
    │   │   ├── gptq.py
    │   │   ├── kv_qparams.py
    │   │   └── smooth_quant.py
    │   ├── defaults.py
    │   ├── modeling
    │   │   ├── __init__.py
    │   │   ├── internlm2_gptq.py
    │   │   └── internlm3_gptq.py
    │   ├── quantization
    │   │   ├── __init__.py
    │   │   ├── activation
    │   │   │   ├── __init__.py
    │   │   │   └── observer.py
    │   │   ├── awq.py
    │   │   ├── calibration.py
    │   │   ├── modules
    │   │   │   ├── __init__.py
    │   │   │   └── linear.py
    │   │   └── weight
    │   │   │   ├── __init__.py
    │   │   │   ├── quant_utils.py
    │   │   │   └── quantizer.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── batch_split.py
    │   │   ├── cal_qparams.py
    │   │   ├── calib_dataloader.py
    │   │   ├── collect.py
    │   │   ├── global_avail.py
    │   │   ├── load.py
    │   │   └── memory_efficient.py
    ├── logger.py
    ├── messages.py
    ├── model.py
    ├── profiler.py
    ├── pytorch
    │   ├── __init__.py
    │   ├── accel.py
    │   ├── adapter
    │   │   ├── __init__.py
    │   │   └── adapter.py
    │   ├── backends
    │   │   ├── __init__.py
    │   │   ├── activation.py
    │   │   ├── apply_rotary_emb.py
    │   │   ├── attention.py
    │   │   ├── awq_modules.py
    │   │   ├── base.py
    │   │   ├── blockedf8_modules.py
    │   │   ├── cuda
    │   │   │   ├── __init__.py
    │   │   │   ├── activation.py
    │   │   │   ├── apply_rotary_emb.py
    │   │   │   ├── attention.py
    │   │   │   ├── awq_modules.py
    │   │   │   ├── blockedf8_modules.py
    │   │   │   ├── flash_attention.py
    │   │   │   ├── graph_runner.py
    │   │   │   ├── lora.py
    │   │   │   ├── moe.py
    │   │   │   ├── multinomial_sampling.py
    │   │   │   ├── norm.py
    │   │   │   ├── op_backend.py
    │   │   │   ├── qmodules.py
    │   │   │   ├── token_dispatcher.py
    │   │   │   └── warmup_manager.py
    │   │   ├── default
    │   │   │   ├── __init__.py
    │   │   │   ├── activation.py
    │   │   │   ├── apply_rotary_emb.py
    │   │   │   ├── awq_modules.py
    │   │   │   ├── linear.py
    │   │   │   ├── moe.py
    │   │   │   ├── multinomial_sampling.py
    │   │   │   ├── norm.py
    │   │   │   ├── op_backend.py
    │   │   │   ├── rotary_embedding.py
    │   │   │   └── token_dispatcher.py
    │   │   ├── dlinfer
    │   │   │   ├── __init__.py
    │   │   │   ├── activation.py
    │   │   │   ├── apply_rotary_emb.py
    │   │   │   ├── ascend
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── graph_runner.py
    │   │   │   │   ├── op_backend.py
    │   │   │   │   └── utils.py
    │   │   │   ├── attention.py
    │   │   │   ├── awq_modules.py
    │   │   │   ├── camb
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── op_backend.py
    │   │   │   ├── flash_attention.py
    │   │   │   ├── linear.py
    │   │   │   ├── maca
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── op_backend.py
    │   │   │   ├── moe.py
    │   │   │   ├── norm.py
    │   │   │   ├── op_backend.py
    │   │   │   ├── qmodules.py
    │   │   │   └── rotary_embedding.py
    │   │   ├── flash_attention.py
    │   │   ├── graph_runner.py
    │   │   ├── linear.py
    │   │   ├── lora.py
    │   │   ├── moe.py
    │   │   ├── multinomial_sampling.py
    │   │   ├── norm.py
    │   │   ├── qmodules.py
    │   │   ├── rotary_embedding.py
    │   │   ├── selector.py
    │   │   └── token_dispatcher.py
    │   ├── block.py
    │   ├── chat.py
    │   ├── check_env
    │   │   ├── __init__.py
    │   │   ├── adapter.py
    │   │   ├── base.py
    │   │   ├── cuda.py
    │   │   ├── deeplink.py
    │   │   ├── dist.py
    │   │   ├── model.py
    │   │   ├── torch.py
    │   │   ├── transformers.py
    │   │   ├── triton.py
    │   │   └── triton_custom_add.py
    │   ├── config.py
    │   ├── configurations
    │   │   ├── __init__.py
    │   │   ├── builder.py
    │   │   ├── chatglm.py
    │   │   ├── cogvlm.py
    │   │   ├── deepseek_v2.py
    │   │   ├── deepseek_vl2.py
    │   │   ├── default.py
    │   │   ├── gemma.py
    │   │   ├── internvl.py
    │   │   ├── internvl3_hf.py
    │   │   ├── llama4.py
    │   │   ├── llava_hf.py
    │   │   ├── minicpm3.py
    │   │   ├── mllama.py
    │   │   ├── qwen.py
    │   │   └── utils.py
    │   ├── devices
    │   │   ├── __init__.py
    │   │   └── device_manager.py
    │   ├── disagg
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── backend
    │   │   │   ├── __init__.py
    │   │   │   ├── backend.py
    │   │   │   ├── base.py
    │   │   │   ├── dlslime.py
    │   │   │   ├── infinistore.py
    │   │   │   └── mooncake.py
    │   │   ├── config.py
    │   │   ├── conn.py
    │   │   ├── messages.py
    │   │   └── request.py
    │   ├── distributed.py
    │   ├── engine
    │   │   ├── __init__.py
    │   │   ├── cache_engine.py
    │   │   ├── engine.py
    │   │   ├── engine_checker.py
    │   │   ├── engine_instance.py
    │   │   ├── executor
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   ├── base_worker.py
    │   │   │   ├── dist_utils.py
    │   │   │   ├── mp_executor.py
    │   │   │   ├── ray_executor.py
    │   │   │   └── uni_executor.py
    │   │   ├── guided_process.py
    │   │   ├── input_process.py
    │   │   ├── logits_process.py
    │   │   ├── model_agent.py
    │   │   ├── mp_engine
    │   │   │   ├── __init__.py
    │   │   │   ├── mp_engine.py
    │   │   │   └── zmq_rpc.py
    │   │   └── request.py
    │   ├── envs.py
    │   ├── kernels
    │   │   ├── __init__.py
    │   │   ├── alibi_pagedattention.py
    │   │   ├── apply_rotary_pos_emb.py
    │   │   ├── cuda
    │   │   │   ├── __init__.py
    │   │   │   ├── activation.py
    │   │   │   ├── alibi_pagedattention.py
    │   │   │   ├── apply_rotary_pos_emb.py
    │   │   │   ├── awq_kernels.py
    │   │   │   ├── blocked_fp8_fused_moe.py
    │   │   │   ├── blocked_gemm_fp8.py
    │   │   │   ├── ep_moe.py
    │   │   │   ├── fill_kv_cache.py
    │   │   │   ├── flash_mla.py
    │   │   │   ├── flashattention.py
    │   │   │   ├── flatten_kv_cache.py
    │   │   │   ├── fused_lora.py
    │   │   │   ├── fused_moe.py
    │   │   │   ├── fused_rotary_emb.py
    │   │   │   ├── multinomial_sampling.py
    │   │   │   ├── pagedattention.py
    │   │   │   ├── rms_norm.py
    │   │   │   ├── triton_utils.py
    │   │   │   ├── utils.py
    │   │   │   ├── w8a8_fused_moe.py
    │   │   │   └── w8a8_triton_kernels.py
    │   │   ├── default
    │   │   │   ├── __init__.py
    │   │   │   ├── multinomial_sampling.py
    │   │   │   └── w8a8_kernels.py
    │   │   ├── dispatcher.py
    │   │   ├── dlinfer
    │   │   │   ├── __init__.py
    │   │   │   ├── activation.py
    │   │   │   ├── apply_rotary_pos_emb.py
    │   │   │   ├── awq_kernels.py
    │   │   │   ├── fill_kv_cache.py
    │   │   │   ├── flash_attention.py
    │   │   │   ├── fused_moe.py
    │   │   │   ├── fused_rotary_emb.py
    │   │   │   ├── linear.py
    │   │   │   ├── moe_gating_topk_softmax.py
    │   │   │   ├── pagedattention.py
    │   │   │   ├── rms_norm.py
    │   │   │   └── w8a8_kernels.py
    │   │   ├── fill_kv_cache.py
    │   │   ├── flash_mla.py
    │   │   ├── fused_moe.py
    │   │   ├── fused_rotary_emb.py
    │   │   ├── moe_gating_topk_softmax.py
    │   │   ├── multinomial_sampling.py
    │   │   ├── pagedattention.py
    │   │   ├── rms_norm.py
    │   │   └── w8a8_triton_kernels.py
    │   ├── messages.py
    │   ├── model_inputs.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── baichuan.py
    │   │   ├── chatglm2.py
    │   │   ├── cogvlm.py
    │   │   ├── deepseek.py
    │   │   ├── deepseek_v2.py
    │   │   ├── deepseek_vl2.py
    │   │   ├── gemma.py
    │   │   ├── gemma3_vl.py
    │   │   ├── internlm.py
    │   │   ├── internlm2.py
    │   │   ├── internlm2_reward.py
    │   │   ├── internlm2_ve.py
    │   │   ├── internlm3.py
    │   │   ├── internvl.py
    │   │   ├── internvl3_hf.py
    │   │   ├── internvl_patch.py
    │   │   ├── llama.py
    │   │   ├── llama4.py
    │   │   ├── llava.py
    │   │   ├── minicpm3.py
    │   │   ├── minicpmv26.py
    │   │   ├── mistral.py
    │   │   ├── mixtral.py
    │   │   ├── mllama.py
    │   │   ├── module_map.py
    │   │   ├── patch.py
    │   │   ├── phi3.py
    │   │   ├── phi3_moe.py
    │   │   ├── phi3_v.py
    │   │   ├── q_modules.py
    │   │   ├── qwen.py
    │   │   ├── qwen2.py
    │   │   ├── qwen2_5_vl.py
    │   │   ├── qwen2_moe.py
    │   │   ├── qwen2_reward.py
    │   │   ├── qwen2_vl.py
    │   │   ├── qwen3.py
    │   │   ├── qwen3_moe.py
    │   │   ├── siglip.py
    │   │   ├── starcoder2.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   ├── cudagraph.py
    │   │   │   ├── micro_batch.py
    │   │   │   ├── model.py
    │   │   │   └── multimodal.py
    │   ├── multimodal
    │   │   ├── __init__.py
    │   │   ├── data_type.py
    │   │   └── image_type.py
    │   ├── nn
    │   │   ├── __init__.py
    │   │   ├── activation.py
    │   │   ├── attention.py
    │   │   ├── eplb.py
    │   │   ├── linear.py
    │   │   ├── moe.py
    │   │   ├── multinomial_sampling.py
    │   │   ├── norm.py
    │   │   ├── quant_utils.py
    │   │   ├── rotary_embedding.py
    │   │   └── utils.py
    │   ├── paging
    │   │   ├── __init__.py
    │   │   ├── block_manager
    │   │   │   ├── __init__.py
    │   │   │   ├── base_block_manager.py
    │   │   │   ├── default_block_manager.py
    │   │   │   └── window_block_manager.py
    │   │   ├── block_trie.py
    │   │   ├── eviction_helper
    │   │   │   ├── __init__.py
    │   │   │   ├── base_eviction_helper.py
    │   │   │   └── recompute_eviction_helper.py
    │   │   └── scheduler.py
    │   ├── supported_models.py
    │   ├── tools
    │   │   ├── __init__.py
    │   │   ├── layout_convert.py
    │   │   ├── make_inputs.py
    │   │   └── utils.py
    │   ├── utils.py
    │   └── weight_loader
    │   │   ├── __init__.py
    │   │   └── model_weight_loader.py
    ├── serve
    │   ├── __init__.py
    │   ├── async_engine.py
    │   ├── gradio
    │   │   ├── __init__.py
    │   │   ├── api_server_backend.py
    │   │   ├── app.py
    │   │   ├── constants.py
    │   │   ├── turbomind_coupled.py
    │   │   └── vl.py
    │   ├── openai
    │   │   ├── __init__.py
    │   │   ├── api_client.py
    │   │   ├── api_server.py
    │   │   ├── launch_server.py
    │   │   ├── protocol.py
    │   │   ├── reasoning_parser
    │   │   │   ├── __init__.py
    │   │   │   ├── deepseek_r1_reasoning_parser.py
    │   │   │   ├── qwen_qwq_reasoning_parser.py
    │   │   │   └── reasoning_parser.py
    │   │   └── tool_parser
    │   │   │   ├── __init__.py
    │   │   │   ├── internlm2_parser.py
    │   │   │   ├── llama3_parser.py
    │   │   │   ├── qwen2d5_parser.py
    │   │   │   ├── qwen3_parser.py
    │   │   │   ├── tool_parser.py
    │   │   │   └── utils.py
    │   ├── proxy
    │   │   ├── __init__.py
    │   │   ├── constants.py
    │   │   └── proxy.py
    │   ├── turbomind
    │   │   ├── __init__.py
    │   │   └── triton_python_backend
    │   │   │   ├── README.md
    │   │   │   ├── client.py
    │   │   │   ├── config.pbtxt
    │   │   │   └── model.py
    │   ├── utils.py
    │   └── vl_async_engine.py
    ├── tokenizer.py
    ├── turbomind
    │   ├── __init__.py
    │   ├── chat.py
    │   ├── deploy
    │   │   ├── __init__.py
    │   │   ├── config.py
    │   │   ├── converter.py
    │   │   ├── loader.py
    │   │   ├── module.py
    │   │   ├── parameter.py
    │   │   ├── policy.py
    │   │   ├── source_model
    │   │   │   ├── __init__.py
    │   │   │   ├── baichuan.py
    │   │   │   ├── base.py
    │   │   │   ├── deepseek2.py
    │   │   │   ├── deepseek_vl.py
    │   │   │   ├── glm4.py
    │   │   │   ├── internlm2.py
    │   │   │   ├── internvl.py
    │   │   │   ├── llama.py
    │   │   │   ├── llava.py
    │   │   │   ├── minicpmv.py
    │   │   │   ├── mixtral.py
    │   │   │   ├── molmo.py
    │   │   │   ├── qwen.py
    │   │   │   └── xcomposer2.py
    │   │   └── target_model
    │   │   │   ├── __init__.py
    │   │   │   ├── base.py
    │   │   │   └── fp.py
    │   ├── generate_gemm_config.py
    │   ├── supported_models.py
    │   ├── turbomind.py
    │   └── utils.py
    ├── utils.py
    ├── version.py
    └── vl
    │   ├── __init__.py
    │   ├── constants.py
    │   ├── engine.py
    │   ├── model
    │       ├── __init__.py
    │       ├── base.py
    │       ├── builder.py
    │       ├── cogvlm.py
    │       ├── deepseek.py
    │       ├── deepseek_vl2.py
    │       ├── gemma3_vl.py
    │       ├── glm_4v.py
    │       ├── internvl.py
    │       ├── internvl3_hf.py
    │       ├── internvl_llava.py
    │       ├── llama4.py
    │       ├── llava.py
    │       ├── llava_hf.py
    │       ├── llava_next.py
    │       ├── minicpmv.py
    │       ├── mllama.py
    │       ├── molmo.py
    │       ├── phi3_vision.py
    │       ├── qwen.py
    │       ├── qwen2.py
    │       ├── utils.py
    │       ├── xcomposer2.py
    │       └── yi.py
    │   ├── tools
    │       ├── __init__.py
    │       └── merge_xcomposer2d5_task.py
    │   └── utils.py
├── requirements
    ├── build.txt
    ├── docs.txt
    ├── lite.txt
    ├── readthedocs.txt
    ├── runtime_ascend.txt
    ├── runtime_camb.txt
    ├── runtime_cuda.txt
    ├── runtime_maca.txt
    ├── serve.txt
    └── test.txt
├── requirements_ascend.txt
├── requirements_camb.txt
├── requirements_cuda.txt
├── requirements_maca.txt
├── resources
    └── batch_memory.png
├── setup.py
├── src
    ├── CMakeLists.txt
    └── turbomind
    │   ├── CMakeLists.txt
    │   ├── comm
    │       ├── CMakeLists.txt
    │       ├── barrier.h
    │       ├── cuda_ipc
    │       │   ├── CMakeLists.txt
    │       │   ├── allgather.cu
    │       │   ├── allreduce.cu
    │       │   ├── bootstrap.h
    │       │   ├── cuda_ipc_comm.cu
    │       │   ├── cuda_ipc_comm.h
    │       │   ├── device_semaphore.h
    │       │   ├── fused_allreduce.cu
    │       │   ├── fused_allreduce_ex.cu
    │       │   ├── group_sum.h
    │       │   └── mscclpp.h
    │       ├── device_comm.cc
    │       ├── device_comm.h
    │       ├── host_comm.cc
    │       ├── host_comm.h
    │       ├── nccl
    │       │   ├── CMakeLists.txt
    │       │   └── nccl.cu
    │       ├── test_comm.cu
    │       └── thread_comm.cc
    │   ├── core
    │       ├── CMakeLists.txt
    │       ├── allocator.cc
    │       ├── allocator.h
    │       ├── buffer.cc
    │       ├── buffer.h
    │       ├── check.cc
    │       ├── check.h
    │       ├── common.h
    │       ├── context.cc
    │       ├── context.h
    │       ├── core.h
    │       ├── cuda_data_type.h
    │       ├── data_type.h
    │       ├── layout.cc
    │       ├── layout.h
    │       ├── module.cc
    │       ├── module.h
    │       ├── stream.cc
    │       ├── stream.h
    │       ├── tensor.cc
    │       ├── tensor.cu
    │       ├── tensor.h
    │       └── test_core.cc
    │   ├── engine
    │       ├── CMakeLists.txt
    │       ├── gateway.cc
    │       ├── gateway.h
    │       ├── model_request.cc
    │       ├── model_request.h
    │       ├── request.h
    │       ├── request_queue.cc
    │       ├── request_queue.h
    │       └── signal_buffer.h
    │   ├── kernels
    │       ├── CMakeLists.txt
    │       ├── activation_kernels.cu
    │       ├── activation_kernels.h
    │       ├── attention
    │       │   ├── CMakeLists.txt
    │       │   ├── arch.h
    │       │   ├── attention.cu
    │       │   ├── attention.h
    │       │   ├── attention_config.h
    │       │   ├── attention_params.h
    │       │   ├── attention_template.h
    │       │   ├── attention_universal.h
    │       │   ├── block.h
    │       │   ├── block_iterator.h
    │       │   ├── codegen
    │       │   │   ├── attention_sm70_128_f16.cu
    │       │   │   ├── attention_sm70_64_f16.cu
    │       │   │   ├── attention_sm75_128_f16.cu
    │       │   │   ├── attention_sm75_64_f16.cu
    │       │   │   ├── attention_sm80_128_bf16.cu
    │       │   │   ├── attention_sm80_128_f16.cu
    │       │   │   ├── attention_sm80_192.cu
    │       │   │   ├── attention_sm80_64_bf16.cu
    │       │   │   ├── attention_sm80_64_f16.cu
    │       │   │   ├── decoding_sm70_128_f16_f16.cu
    │       │   │   ├── decoding_sm70_128_f16_u4.cu
    │       │   │   ├── decoding_sm70_128_f16_u8.cu
    │       │   │   ├── decoding_sm70_64_f16_f16.cu
    │       │   │   ├── decoding_sm70_64_f16_u4.cu
    │       │   │   ├── decoding_sm70_64_f16_u8.cu
    │       │   │   ├── decoding_sm75_128_f16_f16.cu
    │       │   │   ├── decoding_sm75_128_f16_u4.cu
    │       │   │   ├── decoding_sm75_128_f16_u8.cu
    │       │   │   ├── decoding_sm75_64_f16_f16.cu
    │       │   │   ├── decoding_sm75_64_f16_u4.cu
    │       │   │   ├── decoding_sm75_64_f16_u8.cu
    │       │   │   ├── decoding_sm80_128_bf16_bf16.cu
    │       │   │   ├── decoding_sm80_128_bf16_u4.cu
    │       │   │   ├── decoding_sm80_128_bf16_u8.cu
    │       │   │   ├── decoding_sm80_128_f16_f16.cu
    │       │   │   ├── decoding_sm80_128_f16_u4.cu
    │       │   │   ├── decoding_sm80_128_f16_u8.cu
    │       │   │   ├── decoding_sm80_192.cu
    │       │   │   ├── decoding_sm80_64_bf16_bf16.cu
    │       │   │   ├── decoding_sm80_64_bf16_u4.cu
    │       │   │   ├── decoding_sm80_64_bf16_u8.cu
    │       │   │   ├── decoding_sm80_64_f16_f16.cu
    │       │   │   ├── decoding_sm80_64_f16_u4.cu
    │       │   │   └── decoding_sm80_64_f16_u8.cu
    │       │   ├── cta_map.h
    │       │   ├── decoding.cu
    │       │   ├── decoding.h
    │       │   ├── decoding_config.h
    │       │   ├── decoding_template.h
    │       │   ├── impl.h
    │       │   ├── impl_16816.h
    │       │   ├── impl_1688.h
    │       │   ├── impl_81616.h
    │       │   ├── impl_884.h
    │       │   ├── impl_m16n8.h
    │       │   ├── impl_simt.h
    │       │   ├── iterator.h
    │       │   ├── iterator_sm70.h
    │       │   ├── iterator_sm80.h
    │       │   ├── kv_cache_utils_v2.cu
    │       │   ├── kv_cache_utils_v2.h
    │       │   ├── linear_iterator.h
    │       │   ├── mainloop.h
    │       │   ├── mainloop_sm70.h
    │       │   ├── mainloop_sm80.h
    │       │   ├── quantization.h
    │       │   ├── reduce.cu
    │       │   ├── reduce.h
    │       │   ├── reduce_kernel.h
    │       │   ├── reference.cu
    │       │   ├── reference.h
    │       │   ├── rotary_embedding.h
    │       │   ├── test_attention.cu
    │       │   ├── test_quant.cu
    │       │   ├── test_utils.cu
    │       │   ├── test_utils.h
    │       │   ├── utils.cc
    │       │   └── utils.h
    │       ├── ban_bad_words.cu
    │       ├── ban_bad_words.h
    │       ├── core
    │       │   ├── array.h
    │       │   ├── array_ops.h
    │       │   ├── common.h
    │       │   ├── data_type.h
    │       │   ├── layout.h
    │       │   ├── math.h
    │       │   ├── meta.h
    │       │   ├── mma.h
    │       │   ├── pipe_iter.h
    │       │   ├── smem.h
    │       │   ├── sub_byte_ptr.h
    │       │   ├── sync.h
    │       │   └── thread_map.h
    │       ├── decoding_kernels.cu
    │       ├── decoding_kernels.h
    │       ├── flash_attention
    │       │   ├── CMakeLists.txt
    │       │   ├── flash_attention.cu
    │       │   ├── flash_attention.h
    │       │   ├── flash_attention2
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── README.md
    │       │   │   ├── block_info.h
    │       │   │   ├── flash.h
    │       │   │   ├── flash_api.cpp
    │       │   │   ├── flash_fwd_hdim128_bf16_sm80.cu
    │       │   │   ├── flash_fwd_hdim128_fp16_sm80.cu
    │       │   │   ├── flash_fwd_hdim256_bf16_sm80.cu
    │       │   │   ├── flash_fwd_hdim256_fp16_sm80.cu
    │       │   │   ├── flash_fwd_hdim32_bf16_sm80.cu
    │       │   │   ├── flash_fwd_hdim32_fp16_sm80.cu
    │       │   │   ├── flash_fwd_hdim64_bf16_sm80.cu
    │       │   │   ├── flash_fwd_hdim64_fp16_sm80.cu
    │       │   │   ├── flash_fwd_kernel.h
    │       │   │   ├── flash_fwd_launch_template.h
    │       │   │   ├── kernel_traits.h
    │       │   │   ├── softmax.h
    │       │   │   ├── static_switch.h
    │       │   │   └── utils.h
    │       │   └── fused_multi_head_attention
    │       │   │   ├── CMakeLists.txt
    │       │   │   ├── llama_flash_attention_kernel.cu
    │       │   │   ├── mma_accum_lambda_iterator.h
    │       │   │   └── tile_smem_loader.h
    │       ├── gemm
    │       │   ├── CMakeLists.txt
    │       │   ├── arch.h
    │       │   ├── arch
    │       │   │   ├── config_simt.h
    │       │   │   ├── config_sm70_s884.h
    │       │   │   ├── config_sm75_s16816.h
    │       │   │   ├── config_sm80_s16816.h
    │       │   │   ├── mma_simt.h
    │       │   │   ├── mma_sm70.h
    │       │   │   ├── mma_sm80.h
    │       │   │   ├── operand_simt.h
    │       │   │   ├── operand_sm70_s884.h
    │       │   │   ├── operand_sm80_s16816.h
    │       │   │   ├── smem_copy_simt.h
    │       │   │   ├── smem_copy_sm70.h
    │       │   │   └── smem_copy_sm80.h
    │       │   ├── cast.cu
    │       │   ├── cast.h
    │       │   ├── context.cu
    │       │   ├── context.h
    │       │   ├── convert_v2.cu
    │       │   ├── convert_v2.h
    │       │   ├── cp_async.h
    │       │   ├── cta_map.h
    │       │   ├── desc.h
    │       │   ├── dispatch_cache.cu
    │       │   ├── dispatch_cache.h
    │       │   ├── epilogue.h
    │       │   ├── format.h
    │       │   ├── gemm.cu
    │       │   ├── gemm.h
    │       │   ├── gemm_universal.h
    │       │   ├── gemm_universal_sm90.h
    │       │   ├── gemm_universal_sm90_v2.h
    │       │   ├── gemm_universal_sm90_v3.h
    │       │   ├── gemm_universal_sm90_v4.h
    │       │   ├── gemm_universal_sm90_v5.h
    │       │   ├── gpu_metric.cu
    │       │   ├── gpu_metric.h
    │       │   ├── iterator.h
    │       │   ├── iterator_sm70.h
    │       │   ├── iterator_sm80.h
    │       │   ├── iterator_sm90.h
    │       │   ├── kernel.cu
    │       │   ├── kernel.h
    │       │   ├── kernel
    │       │   │   ├── f16_u4g128_f16_tnt_sm70_s884.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm75_s16816.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm75_simt.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm80_s16816.cu
    │       │   │   ├── f16_u4g128_f16_tnt_sm90_s16816.cu
    │       │   │   ├── sm70_s884_dynamic.cu
    │       │   │   ├── sm75_s16816_dynamic.cu
    │       │   │   ├── sm80_s16816_dynamic.cu
    │       │   │   ├── sm90_q64n32.cu
    │       │   │   ├── sm90_s16816_dynamic.cu
    │       │   │   └── u4g128_f16_f16_nnn_sm80_s16816.cu
    │       │   ├── kernel_impl.h
    │       │   ├── kernel_impl_sm90.h
    │       │   ├── mainloop_sm70.h
    │       │   ├── mainloop_sm80_v2.h
    │       │   ├── matrix_ptr.h
    │       │   ├── moe_utils_v2.cu
    │       │   ├── moe_utils_v2.h
    │       │   ├── operand.h
    │       │   ├── predicate.h
    │       │   ├── registry.cu
    │       │   ├── registry.h
    │       │   ├── scheduler.cuh
    │       │   ├── simt.h
    │       │   ├── sm90_utils.h
    │       │   ├── smem_copy.h
    │       │   ├── test
    │       │   │   ├── gemm_bench.cu
    │       │   │   ├── gemm_test.cu
    │       │   │   ├── models.h
    │       │   │   ├── quantization.cu
    │       │   │   ├── quantization.h
    │       │   │   ├── quantization_impl.h
    │       │   │   ├── reference.cu
    │       │   │   ├── reference.h
    │       │   │   ├── test_gemm.cu
    │       │   │   ├── test_moe_utils.cu
    │       │   │   ├── test_utils.cu
    │       │   │   ├── test_utils.h
    │       │   │   ├── testbed.h
    │       │   │   └── testbed_v2.h
    │       │   ├── thread_group_map.h
    │       │   ├── thread_map.h
    │       │   ├── tiled_mma.h
    │       │   ├── tma.cu
    │       │   ├── tma.h
    │       │   ├── transform.h
    │       │   ├── tuner
    │       │   │   ├── cache_utils.cu
    │       │   │   ├── cache_utils.h
    │       │   │   ├── measurer.cu
    │       │   │   ├── measurer.h
    │       │   │   ├── params.cc
    │       │   │   ├── params.h
    │       │   │   ├── sampler.cu
    │       │   │   ├── sampler.h
    │       │   │   ├── stats.h
    │       │   │   ├── stopping_criterion.cc
    │       │   │   └── stopping_criterion.h
    │       │   ├── types.h
    │       │   ├── unpack.cu
    │       │   └── utils.h
    │       ├── gpt_kernels.cu
    │       ├── gpt_kernels.h
    │       ├── logprob_kernels.cu
    │       ├── logprob_kernels.h
    │       ├── norm
    │       │   ├── CMakeLists.txt
    │       │   ├── rms_norm.cu
    │       │   └── rms_norm.h
    │       ├── penalty_types.h
    │       ├── quantization.cu
    │       ├── quantization.cuh
    │       ├── quantization.h
    │       ├── reduce_kernel_utils.cuh
    │       ├── sampling_kernels.cu
    │       ├── sampling_kernels.h
    │       ├── sampling_penalty_kernels.cu
    │       ├── sampling_penalty_kernels.h
    │       ├── sampling_topk_kernels.cu
    │       ├── sampling_topk_kernels.h
    │       ├── sampling_topp_kernels.cu
    │       ├── sampling_topp_kernels.h
    │       ├── stop_criteria_kernels.cu
    │       ├── stop_criteria_kernels.h
    │       ├── test_quantization.cc
    │       ├── unfused_attention_kernels.cu
    │       └── unfused_attention_kernels.h
    │   ├── layers
    │       ├── BaseDynamicDecodeLayer.h
    │       ├── CMakeLists.txt
    │       ├── DynamicDecodeLayer.cc
    │       ├── DynamicDecodeLayer.h
    │       └── sampling_layers
    │       │   ├── CMakeLists.txt
    │       │   ├── LogitsProcessorLayer.cc
    │       │   ├── LogitsProcessorLayer.h
    │       │   ├── SamplingLayer.cc
    │       │   ├── SamplingLayer.h
    │       │   ├── StopCriteriaLayer.cc
    │       │   ├── StopCriteriaLayer.h
    │       │   └── utils.h
    │   ├── macro.h
    │   ├── models
    │       ├── CMakeLists.txt
    │       └── llama
    │       │   ├── Barrier.h
    │       │   ├── BlockManager.cc
    │       │   ├── BlockManager.h
    │       │   ├── BlockTrie.cc
    │       │   ├── BlockTrie.h
    │       │   ├── CMakeLists.txt
    │       │   ├── LlamaBatch.cc
    │       │   ├── LlamaBatch.h
    │       │   ├── LlamaDecoderLayerWeight.cc
    │       │   ├── LlamaDecoderLayerWeight.h
    │       │   ├── LlamaDenseWeight.cc
    │       │   ├── LlamaDenseWeight.h
    │       │   ├── LlamaFfnLayer.cc
    │       │   ├── LlamaFfnLayer.h
    │       │   ├── LlamaLinear.cu
    │       │   ├── LlamaLinear.h
    │       │   ├── LlamaV2.cc
    │       │   ├── LlamaV2.h
    │       │   ├── LlamaWeight.cc
    │       │   ├── LlamaWeight.h
    │       │   ├── SequenceManager.cc
    │       │   ├── SequenceManager.h
    │       │   ├── context.h
    │       │   ├── copy.h
    │       │   ├── llama_kernels.cu
    │       │   ├── llama_kernels.h
    │       │   ├── llama_params.h
    │       │   ├── llama_rope.h
    │       │   ├── llama_utils.cu
    │       │   ├── llama_utils.h
    │       │   ├── mla_utils.cu
    │       │   ├── mla_utils.h
    │       │   ├── moe_ffn_layer.cc
    │       │   ├── moe_ffn_layer.h
    │       │   ├── test_cache_manager.cc
    │       │   ├── unified_attention_layer.cc
    │       │   ├── unified_attention_layer.h
    │       │   ├── unified_decoder.cc
    │       │   └── unified_decoder.h
    │   ├── python
    │       ├── CMakeLists.txt
    │       ├── bind.cpp
    │       └── dlpack.h
    │   ├── triton_backend
    │       ├── CMakeLists.txt
    │       └── llama
    │       │   ├── CMakeLists.txt
    │       │   ├── LlamaTritonModel.cc
    │       │   └── LlamaTritonModel.h
    │   └── utils
    │       ├── CMakeLists.txt
    │       ├── anomaly_handler.cu
    │       ├── anomaly_handler.h
    │       ├── constant.h
    │       ├── cuda_bf16_fallbacks.cuh
    │       ├── cuda_bf16_wrapper.h
    │       ├── cuda_type_utils.cuh
    │       ├── cuda_utils.cc
    │       ├── cuda_utils.h
    │       ├── debug_utils.h
    │       ├── dispatch.h
    │       ├── logger.cc
    │       ├── logger.h
    │       ├── memory_utils.cu
    │       ├── memory_utils.h
    │       ├── monotonic.h
    │       ├── nvtx_utils.cc
    │       ├── nvtx_utils.h
    │       ├── parser.cc
    │       ├── parser.h
    │       ├── string_utils.h
    │       └── test_utils.h
└── tests
    ├── csrc
        ├── CMakeLists.txt
        └── unittests
        │   ├── CMakeLists.txt
        │   ├── gtest_utils.h
        │   ├── test_logprob_kernels.cu
        │   ├── test_penalty_kernels.cu
        │   ├── test_sampling_kernels.cu
        │   ├── test_sampling_layer.cu
        │   └── unittest_utils.h
    ├── pytorch
        ├── engine
        │   ├── test_logits_process.py
        │   ├── test_request.py
        │   └── test_zmq_rpc.py
        ├── kernel
        │   ├── test_activation.py
        │   ├── test_apply_rotary.py
        │   ├── test_fill_kv_cache.py
        │   ├── test_flash_attention.py
        │   ├── test_flatten_kv_cache.py
        │   ├── test_fuse_moe_blocked_fp8.py
        │   ├── test_fused_lora.py
        │   ├── test_fused_moe.py
        │   ├── test_fused_rotary_emb.py
        │   ├── test_gemm_fp8.py
        │   ├── test_multinomial_sampling.py
        │   ├── test_paged_attention.py
        │   └── test_rms_norm.py
        ├── paging
        │   ├── test_block_manager.py
        │   ├── test_block_trie.py
        │   └── test_scheduler.py
        └── tools
        │   ├── test_layout_convert.py
        │   └── test_make_inputs.py
    └── test_lmdeploy
        ├── test_async_engine.py
        ├── test_auto_backend.py
        ├── test_lite
            └── test_quantization
            │   └── test_utils
            │       └── test_cal_qparams.py
        ├── test_messages.py
        ├── test_model.py
        ├── test_qwen3_parser.py
        ├── test_tokenizer.py
        ├── test_turbomind
            └── test_converter.py
        ├── test_utils.py
        └── test_vl
            └── test_vl_encode.py


/.github/ISSUE_TEMPLATE/2-feature-request.yml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Suggest an idea for this project
 3 | title: "[Feature] "
 4 | 
 5 | body:
 6 | - type: markdown
 7 |   attributes:
 8 |     value: |
 9 |       We strongly appreciate you creating a PR to implement this feature [here](https://github.com/InternLM/lmdeploy/pulls)!
10 |       If you need our help, please fill in as much of the following form as you're able to.
11 | 
12 |       **The less clear the description, the longer it will take to solve it.**
13 | - type: textarea
14 |   attributes:
15 |     label: Motivation
16 |     description: |
17 |       A clear and concise description of the motivation of the feature.
18 |       Ex1. It is inconvenient when \[....\].
19 |   validations:
20 |     required: true
21 | - type: textarea
22 |   attributes:
23 |     label: Related resources
24 |     description: |
25 |       If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
26 | - type: textarea
27 |   attributes:
28 |     label: Additional context
29 |     description: |
30 |       Add any other context or screenshots about the feature request here.
31 |       If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.
32 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-documentation.yml:
--------------------------------------------------------------------------------
 1 | name: 📚 Documentation
 2 | description: Report an issue related to the documentation.
 3 | labels: "kind/doc,status/unconfirmed"
 4 | title: "[Docs] "
 5 | 
 6 | body:
 7 | - type: textarea
 8 |   attributes:
 9 |     label: 📚 The doc issue
10 |     description: >
11 |       A clear and concise description the issue.
12 |   validations:
13 |     required: true
14 | 
15 | - type: textarea
16 |   attributes:
17 |     label: Suggest a potential alternative/fix
18 |     description: >
19 |       Tell us how we could improve the documentation in this regard.
20 | - type: markdown
21 |   attributes:
22 |     value: >
23 |       Thanks for contributing 🎉!
24 | 


--------------------------------------------------------------------------------
/.github/md-link-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "ignorePatterns": [
 3 |     {
 4 |       "pattern": "^https://www.reddit.com/"
 5 |     },
 6 |     {
 7 |       "pattern": "^https://developer.nvidia.com/"
 8 |     },
 9 |     {
10 |       "pattern": "^https://docs.openvino.ai/"
11 |     },
12 |     {
13 |       "pattern": "^https://developer.android.com/"
14 |     },
15 |     {
16 |       "pattern": "^https://developer.qualcomm.com/"
17 |     },
18 |     {
19 |       "pattern": "^http://localhost"
20 |     },
21 |     {
22 |       "pattern": "^https://twitter.com"
23 |     },
24 |     {
25 |       "pattern": "^https://platform.openai.com"
26 |     },
27 |     {
28 |       "pattern": "^http://0.0.0.0"
29 |     }
30 |   ],
31 |   "httpHeaders": [
32 |     {
33 |       "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"],
34 |       "headers": {
35 |         "Accept-Encoding": "zstd, br, gzip, deflate"
36 |       }
37 |     }
38 |   ],
39 |   "timeout": "20s",
40 |   "retryOn429": true,
41 |   "retryCount": 5,
42 |   "fallbackRetryDelay": "30s",
43 |   "aliveStatusCodes": [200, 206, 429]
44 | }
45 | 


--------------------------------------------------------------------------------
/.github/release.yml:
--------------------------------------------------------------------------------
 1 | changelog:
 2 |   categories:
 3 |     - title: 🚀 Features
 4 |       labels:
 5 |         - feature
 6 |         - enhancement
 7 |     - title: 💥 Improvements
 8 |       labels:
 9 |         - improvement
10 |     - title: 🐞 Bug fixes
11 |       labels:
12 |         - bug
13 |         - Bug:P0
14 |         - Bug:P1
15 |         - Bug:P2
16 |         - Bug:P3
17 |     - title: 📚 Documentations
18 |       labels:
19 |         - documentation
20 |     - title: 🌐 Other
21 |       labels:
22 |         - '*'
23 |       exclude:
24 |         labels:
25 |           - feature
26 |           - enhancement
27 |           - improvement
28 |           - bug
29 |           - documentation
30 |           - Bug:P0
31 |           - Bug:P1
32 |           - Bug:P2
33 |           - Bug:P3
34 | 


--------------------------------------------------------------------------------
/.github/scripts/check_lmdeploy.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) MegFlow. All rights reserved.
 2 | import glob
 3 | import os
 4 | 
 5 | import fire
 6 | 
 7 | 
 8 | def check_module_init(root: str):
 9 |     """Check if a module has __init__.py file."""
10 |     all_files = glob.glob(os.path.join(root, '**/*'), recursive=True)
11 |     not_exist = []
12 |     for d in all_files:
13 |         if not os.path.isdir(d):
14 |             continue
15 |         if '__pycache__' in d:
16 |             continue
17 |         elif d.startswith('lmdeploy/bin'):
18 |             continue
19 |         elif d.startswith('lmdeploy/lib'):
20 |             continue
21 |         elif d.startswith('lmdeploy/serve/turbomind/triton_models'):
22 |             continue
23 |         elif d.startswith('lmdeploy/serve/turbomind/triton_python_backend'):
24 |             continue
25 |         init_file = os.path.join(d, '__init__.py')
26 |         if not os.path.exists(init_file):
27 |             not_exist.append(init_file)
28 | 
29 |     assert len(not_exist) == 0, f'Missing files: {not_exist}'
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     fire.Fire()
34 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | 
2 | include lmdeploy/lib/*.so
3 | include lmdeploy/lib/*.so*
4 | include lmdeploy/lib/*.dll
5 | include lmdeploy/lib/*.pyd
6 | include lmdeploy/bin/*
7 | 


--------------------------------------------------------------------------------
/autotest/conftest.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pytest
 4 | import yaml
 5 | 
 6 | cli_prompt_case_file = 'autotest/chat_prompt_case.yaml'
 7 | common_prompt_case_file = 'autotest/prompt_case.yaml'
 8 | config_file = 'autotest/config.yaml'
 9 | 
10 | 
11 | @pytest.fixture(scope='session')
12 | def config():
13 |     config_path = os.path.join(config_file)
14 |     with open(config_path) as f:
15 |         env_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
16 |     return env_config
17 | 
18 | 
19 | @pytest.fixture(scope='session')
20 | def cli_case_config():
21 |     case_path = os.path.join(cli_prompt_case_file)
22 |     with open(case_path) as f:
23 |         case_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
24 |     return case_config
25 | 
26 | 
27 | @pytest.fixture(scope='class', autouse=True)
28 | def common_case_config():
29 |     case_path = os.path.join(common_prompt_case_file)
30 |     with open(case_path) as f:
31 |         case_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
32 |     return case_config
33 | 
34 | 
35 | def pytest_addoption(parser):
36 |     parser.addoption('--run_id', action='store', default='', help='github run_id')
37 | 
38 | 
39 | @pytest.fixture(scope='session')
40 | def run_id(request):
41 |     return request.config.getoption('--run_id')
42 | 


--------------------------------------------------------------------------------
/autotest/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files = test*_*.py  # test file
3 | python_classes = Test*     # test class
4 | python_functions = test_*  # test function
5 | pytest_runtest_call.tryfirst = True
6 | filterwarnings = ignore::UserWarning
7 | reruns = 2
8 | reruns_delay = 1
9 | 


--------------------------------------------------------------------------------
/autotest/template.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model_name": "base",
3 |     "capability": "completion"
4 | }
5 | 


--------------------------------------------------------------------------------
/autotest/toolchain/test_lagent.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.mark.order(10)
 5 | @pytest.mark.lagent
 6 | @pytest.mark.flaky(reruns=2)
 7 | @pytest.mark.parametrize('model', ['internlm/internlm2_5-7b-chat'])
 8 | def test_repeat(config, model):
 9 |     from lagent.llms import INTERNLM2_META, LMDeployPipeline
10 | 
11 |     model = LMDeployPipeline(
12 |         path='/'.join([config.get('model_path'), model]),
13 |         meta_template=INTERNLM2_META,
14 |         tp=1,
15 |         top_k=40,
16 |         top_p=0.8,
17 |         temperature=1.2,
18 |         stop_words=['<|im_end|>'],
19 |         max_new_tokens=4096,
20 |     )
21 |     response_list = []
22 |     for i in range(3):
23 |         print(f'run_{i}：')
24 |         response = model.chat([{
25 |             'role':
26 |             'user',
27 |             'content':
28 |             '已知$$z_{1}=1$$,$$z_{2}=\\text{i}$$,$$z_{3}=-1$$,$$z_{4}=-\\text{i}$$,顺次连结它们所表示的点,则所得图形围成的面积为（ ）\nA. $$\\dfrac{1}{4}$$\n B. $$\\dfrac{1}{2}$$\n C. $$1$$\n D. $$2$$\n\n'  # noqa: F401, E501
29 |         }])
30 |         print(response)
31 |         response_list.append(response)
32 |         assert len(response) > 10
33 |     assert response_list[0] != response_list[1] and response_list[1] != response_list[2]
34 | 


--------------------------------------------------------------------------------
/autotest/tools/quantization/test_quantization_w8a8.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import allure
 4 | import pytest
 5 | from utils.config_utils import get_cuda_prefix_by_workerid, get_quantization_model_list
 6 | from utils.quantization_utils import quantization
 7 | 
 8 | 
 9 | @pytest.mark.order(2)
10 | @pytest.mark.quantization_w8a8
11 | @pytest.mark.timeout(900)
12 | @pytest.mark.parametrize('model', get_quantization_model_list('w8a8'))
13 | def test_quantization_w8a8(config, model, worker_id):
14 |     quantization_w8a8(config, model + '-inner-w8a8', model, get_cuda_prefix_by_workerid(worker_id))
15 | 
16 | 
17 | def quantization_w8a8(config, quantization_model_name, origin_model_name, cuda_prefix):
18 |     quantization_type = 'w8a8'
19 |     result, msg = quantization(config, quantization_model_name, origin_model_name, quantization_type, cuda_prefix)
20 |     log_path = config.get('log_path')
21 |     quantization_log = os.path.join(
22 |         log_path, '_'.join(['quantization', quantization_type,
23 |                             quantization_model_name.split('/')[1]]) + '.log')
24 | 
25 |     allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT)
26 |     assert result, msg
27 | 


--------------------------------------------------------------------------------
/autotest/utils/mp_log_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import allure
 4 | from pytest_assume.plugin import assume
 5 | 
 6 | 
 7 | def write_log(config, result, msg, is_new: bool = True, case_path_tag: str = 'default'):
 8 |     try:
 9 |         log_path = os.path.join(config.get('log_path'), case_path_tag)
10 | 
11 |         if is_new:
12 |             file = open(log_path, 'w')
13 |         else:
14 |             file = open(log_path, 'a')
15 | 
16 |         file.writelines('result:' + result + ', reason:' + msg + '\n')
17 |         file.close()
18 |     except Exception as e:
19 |         return False, None, f'Unknown error: {e}'
20 | 
21 | 
22 | def assert_log(config, case_path_tag: str = 'default'):
23 |     log_path = os.path.join(config.get('log_path'), case_path_tag)
24 | 
25 |     with open(log_path, 'r') as f:
26 |         lines = f.readlines()
27 | 
28 |         for line in lines:
29 |             if 'result:False, reason:' in line:
30 |                 result = False
31 |                 msg = line
32 |                 break
33 |             if 'result:True, reason:' in line and not result:
34 |                 result = True
35 | 
36 |     allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT)
37 |     with assume:
38 |         assert result, msg
39 | 


--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmark
 2 | 
 3 | We provide several profiling tools to benchmark our models.
 4 | 
 5 | ## profile with dataset
 6 | 
 7 | Download the dataset below or create your own dataset.
 8 | 
 9 | ```bash
10 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
11 | ```
12 | 
13 | Profiling your model with `profile_throughput.py`
14 | 
15 | ```bash
16 | python profile_throughput.py \
17 |  ShareGPT_V3_unfiltered_cleaned_split.json \
18 |  /path/to/your/model \
19 |  --concurrency 64
20 | ```
21 | 
22 | ## profile without dataset
23 | 
24 | `profile_generation.py` perform benchmark with dummy data.
25 | 
26 | ```shell
27 | pip install nvidia-ml-py
28 | ```
29 | 
30 | ```bash
31 | python profile_generation.py \
32 |  /path/to/your/model \
33 |  --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
34 | ```
35 | 
36 | ## profile restful api
37 | 
38 | `profile_restful_api.py` is used to do benchmark on api server.
39 | 
40 | ```bash
41 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
42 | 
43 | python3 profile_restful_api.py --backend lmdeploy --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json
44 | ```
45 | 


--------------------------------------------------------------------------------
/benchmark/lmdeploy.yml:
--------------------------------------------------------------------------------
 1 | num_promts: &num_prompts 1
 2 | dataset_path: &dataset_path "/nvme1/shared/ShareGPT_V3_unfiltered_cleaned_split.json"
 3 | dataset_name: &dataset_name "sharegpt"
 4 | model_path: &model_path "Qwen/Qwen3-8B"
 5 | server:
 6 |   server_port: 23333
 7 | engine:
 8 |   - tp: 1
 9 |     model_path: *model_path
10 |     max_batch_size: 1024
11 |     cache_max_entry_count: 0.8
12 | data:
13 |   - dataset_name: *dataset_name
14 |     dataset_path: *dataset_path
15 |     num_prompts: *num_prompts
16 |   - dataset_name: *dataset_name
17 |     dataset_path: *dataset_path
18 |     sharegpt_output_len: 2048
19 |     num_prompts: *num_prompts
20 |   - dataset_name: *dataset_name
21 |     dataset_path: *dataset_path
22 |     sharegpt_output_len: 4096
23 |     num_prompts: *num_prompts
24 |   - dataset_name: *dataset_name
25 |     dataset_path: *dataset_path
26 |     sharegpt_output_len: 8192
27 |     num_prompts: *num_prompts
28 |   - dataset_name: *dataset_name
29 |     dataset_path: *dataset_path
30 |     sharegpt_output_len: 16384
31 |     num_prompts: *num_prompts
32 |   - dataset_name: *dataset_name
33 |     dataset_path: *dataset_path
34 |     sharegpt_output_len: 32768
35 |     num_prompts: *num_prompts
36 | 


--------------------------------------------------------------------------------
/builder/manywheel/README.md:
--------------------------------------------------------------------------------
 1 | # Build lmdeploy manylinux wheel
 2 | 
 3 | ## Prepare docker image
 4 | 
 5 | To build all docker images you can use the convenient script:
 6 | 
 7 | ```bash
 8 | ./build_all_docker.sh
 9 | # Build with pushing
10 | WITH_PUSH=true ./build_all_docker.sh
11 | ```
12 | 
13 | To build a docker image with specific cuda version or manylinux-docker version, you may use:
14 | 
15 | ```bash
16 | MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION=12.4 ./build_docker.sh
17 | ```
18 | 
19 | ## Build lmdeploy wheel
20 | 
21 | ```bash
22 | ./build_all_wheel.sh
23 | ```
24 | 


--------------------------------------------------------------------------------
/builder/manywheel/build_all_docker.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -eou pipefail
 4 | 
 5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
 6 | 
 7 | for cuda_version in 12.4; do
 8 |     MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/manywheel/build_docker.sh"
 9 | done
10 | 


--------------------------------------------------------------------------------
/builder/manywheel/build_all_wheel.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -eou pipefail
 4 | 
 5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
 6 | 
 7 | CUDA_VER=${CUDA_VER:-12.4}
 8 | 
 9 | PLAT_NAME=manylinux2014_x86_64
10 | for cuver in ${CUDA_VER}; do
11 |     DOCKER_TAG=cuda${cuver}
12 |     OUTPUT_FOLDER=cuda${cuver}_dist
13 |     for pyver in py38 py39 py310 py311 py312; do
14 |         bash ${TOPDIR}/manywheel/build_wheel.sh ${pyver} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} \
15 |             |& tee ${PLAT_NAME}.${pyver}.cuda${cuver}.log.txt
16 |     done
17 | done
18 | 


--------------------------------------------------------------------------------
/builder/manywheel/build_docker.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -eou pipefail
 4 | 
 5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
 6 | GPU_ARCH_VERSION=${GPU_ARCH_VERSION}
 7 | WITH_PUSH=${WITH_PUSH:-}
 8 | 
 9 | TARGET=cuda_final
10 | DOCKER_TAG=cuda${GPU_ARCH_VERSION}
11 | 
12 | DOCKER_IMAGE=openmmlab/lmdeploy-builder:${DOCKER_TAG}
13 | if [[ -n ${MANY_LINUX_VERSION} ]]; then
14 |     DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
15 | else
16 |     DOCKERFILE_SUFFIX=''
17 | fi
18 | 
19 | (
20 |     set -x
21 |     DOCKER_BUILDKIT=1 docker build \
22 |         -t "${DOCKER_IMAGE}" \
23 |         --build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} \
24 |         --build-arg DEVTOOLSET_VERSION=9 \
25 |         --build-arg HTTPS_PROXY=${HTTPS_PROXY} \
26 |         --build-arg HTTP_PROXY=${HTTP_PROXY} \
27 |         --target "${TARGET}" \
28 |         -f "${TOPDIR}/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
29 |         "${TOPDIR}"
30 | )
31 | 
32 | if [[ "${WITH_PUSH}" == true ]]; then
33 |     (
34 |         set -x
35 |         docker push "${DOCKER_IMAGE}"
36 |     )
37 | fi
38 | 


--------------------------------------------------------------------------------
/builder/manywheel/build_wheel.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -eux
 3 | 
 4 | PYTHON_VERSION="$1"
 5 | PLAT_NAME="$2"
 6 | DOCKER_TAG="$3"
 7 | OUTPUT_DIR="$4"
 8 | 
 9 | DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
10 | export USERID=$(id -u)
11 | export GROUPID=$(id -g)
12 | 
13 | cd "$(dirname "$0")"  # move inside the script directory
14 | mkdir -p "${OUTPUT_DIR}"
15 | docker pull ${DOCKER_IMAGE}
16 | docker run --rm -it \
17 |     --env PYTHON_VERSION="${PYTHON_VERSION}" \
18 |     --env PLAT_NAME="${PLAT_NAME}" \
19 |     --env USERID="${USERID}" \
20 |     --env GROUPID="${GROUPID}" \
21 |     --volume "$(pwd)/../../:/lmdeploy" \
22 |     --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
23 |     --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
24 |     --entrypoint /entrypoint_build.sh \
25 |     ${DOCKER_IMAGE}
26 | 


--------------------------------------------------------------------------------
/builder/manywheel/entrypoint_build.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -eux
 3 | 
 4 | export PYTHON_VERSION=$PYTHON_VERSION
 5 | export PLAT_NAME=$PLAT_NAME
 6 | export USERID=${USERID}
 7 | export GROUPID=${GROUPID}
 8 | export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
 9 | export NCCL_INCLUDE_DIR=/usr/local/cuda/include
10 | export NCCL_LIB_DIR=/usr/local/cuda/lib64
11 | 
12 | source /opt/conda/bin/activate
13 | conda activate $PYTHON_VERSION
14 | 
15 | cd lmdeploy
16 | rm -rf lmdeploy/lib
17 | mkdir -p build && cd build && rm -rf *
18 | bash ../generate.sh make
19 | make -j$(nproc) && make install
20 | if [ $? != 0 ]; then
21 |     echo "build failed"
22 |     exit 1
23 | fi
24 | cd ..
25 | rm -rf build
26 | python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/
27 | chown ${USERID}:${GROUPID} /tmpbuild/*
28 | mv /tmpbuild/* /lmdeploy_build/
29 | 


--------------------------------------------------------------------------------
/builder/manywheel/scripts/install_conda.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -ex
4 | 
5 | wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
6 | chmod +x  Miniconda3-latest-Linux-x86_64.sh
7 | bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda
8 | rm Miniconda3-latest-Linux-x86_64.sh
9 | 


--------------------------------------------------------------------------------
/builder/manywheel/scripts/install_openmpi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ex
 4 | 
 5 | wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz
 6 | tar xf openmpi-4.1.5.tar.gz
 7 | cd openmpi-4.1.5
 8 | ./configure --prefix=/usr/local/mpi
 9 | make -j$(nproc)
10 | make install
11 | 


--------------------------------------------------------------------------------
/builder/windows/README.md:
--------------------------------------------------------------------------------
 1 | # Build lmdeploy on windows
 2 | 
 3 | ## Requirements
 4 | 
 5 | - [CMake 3.17+](https://github.com/Kitware/CMake/releases)
 6 | - [Visual Studio 2019+](https://visualstudio.microsoft.com/downloads/)
 7 | - [CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
 8 | 
 9 | ## Build lmdeploy wheel
10 | 
11 | ```powershell
12 | mkdir build
13 | cd build
14 | ..\builder\windows\generate.ps1
15 | cmake --build . --config Release -- /m
16 | cmake --install . --config Release
17 | cd ..
18 | rm build -Force -Recurse
19 | python setup.py bdist_wheel -d build\wheel
20 | ```
21 | 


--------------------------------------------------------------------------------
/builder/windows/generate.ps1:
--------------------------------------------------------------------------------
1 | cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" `
2 |     -DCMAKE_BUILD_TYPE=Release `
3 |     -DCMAKE_INSTALL_PREFIX=install `
4 |     -DBUILD_PY_FFI=ON `
5 |     -DBUILD_MULTI_GPU=OFF `
6 |     -DUSE_NVTX=OFF `
7 |     -DBUILD_TEST="$env:BUILD_TEST"
8 | 


--------------------------------------------------------------------------------
/debug.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | builder="-G Ninja"
 4 | 
 5 | if [ "$1" == "make" ]; then
 6 |     builder=""
 7 | fi
 8 | 
 9 | cmake ${builder} .. \
10 |     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
11 |     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
12 |     -DCMAKE_INSTALL_PREFIX=./install \
13 |     -DBUILD_PY_FFI=ON \
14 |     -DBUILD_MULTI_GPU=ON \
15 |     -DCMAKE_CUDA_FLAGS="-lineinfo" \
16 |     -DUSE_NVTX=ON \
17 |     -DPYTHON_EXECUTABLE=$(which python3) \
18 |     -DBUILD_TEST=ON
19 | 


--------------------------------------------------------------------------------
/docker/InternVL_Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG CUDA_VERSION=cu12
 2 | 
 3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12
 4 | ENV CUDA_VERSION_SHORT=cu123
 5 | 
 6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11
 7 | ENV CUDA_VERSION_SHORT=cu118
 8 | 
 9 | FROM ${CUDA_VERSION} AS final
10 | 
11 | RUN python3 -m pip install timm
12 | 
13 | RUN python3 -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+${CUDA_VERSION_SHORT}torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
14 | 


--------------------------------------------------------------------------------
/docker/Qwen2VL_Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG CUDA_VERSION=cu12
 2 | 
 3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12
 4 | ENV CUDA_VERSION_SHORT=cu123
 5 | 
 6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11
 7 | ENV CUDA_VERSION_SHORT=cu118
 8 | 
 9 | FROM ${CUDA_VERSION} AS final
10 | 
11 | # we use transformers to load vision part of qwen2_vl and it needs transformers > v4.44.2
12 | RUN python3 -m pip install git+https://github.com/huggingface/transformers.git
13 | 
14 | RUN python3 -m pip install qwen_vl_utils
15 | 


--------------------------------------------------------------------------------
/docs/en/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | formats: all
 4 | 
 5 | build:
 6 |   os: "ubuntu-22.04"
 7 |   tools:
 8 |     python: "3.10"
 9 | 
10 | 
11 | sphinx:
12 |   configuration: docs/en/conf.py
13 | 
14 | 
15 | python:
16 |   install:
17 |     - requirements: requirements/docs.txt
18 |     - requirements: requirements/readthedocs.txt
19 | 


--------------------------------------------------------------------------------
/docs/en/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = .
 8 | BUILDDIR      = _build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 | 


--------------------------------------------------------------------------------
/docs/en/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
 1 | table.autosummary td {
 2 |   width: 50%
 3 | }
 4 | 
 5 | img.align-center {
 6 |   display: block;
 7 |   margin-left: auto;
 8 |   margin-right: auto;
 9 | }
10 | 


--------------------------------------------------------------------------------
/docs/en/api/pipeline.rst:
--------------------------------------------------------------------------------
 1 | inference pipeline
 2 | ==================
 3 | .. currentmodule:: lmdeploy
 4 | 
 5 | pipeline
 6 | --------
 7 | .. autofunction:: pipeline
 8 | 
 9 | serving
10 | --------
11 | .. autofunction:: serve
12 | .. autofunction:: client
13 | 
14 | 
15 | PytorchEngineConfig
16 | -------------------
17 | .. autoclass:: PytorchEngineConfig
18 | 
19 | 
20 | TurbomindEngineConfig
21 | ---------------------
22 | .. autoclass:: TurbomindEngineConfig
23 | 
24 | 
25 | GenerationConfig
26 | ----------------
27 | .. autoclass:: GenerationConfig
28 | 
29 | 
30 | ChatTemplateConfig
31 | ------------------
32 | .. autoclass:: ChatTemplateConfig
33 | 


--------------------------------------------------------------------------------
/docs/en/get_started/index.rst:
--------------------------------------------------------------------------------
1 | On Other Platforms
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 |    :caption: NPU(Huawei)
7 | 
8 |    ascend/get_started.md
9 | 


--------------------------------------------------------------------------------
/docs/en/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/en/multi_modal/index.rst:
--------------------------------------------------------------------------------
 1 | Vision-Language Models
 2 | =================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: Examples
 7 | 
 8 |    deepseek_vl2.md
 9 |    llava.md
10 |    internvl.md
11 |    xcomposer2d5.md
12 |    cogvlm.md
13 |    minicpmv.md
14 |    phi3.md
15 |    mllama.md
16 |    qwen2_vl.md
17 |    qwen2_5_vl.md
18 |    molmo.md
19 |    gemma3.md
20 | 


--------------------------------------------------------------------------------
/docs/zh_cn/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | formats: all
 4 | 
 5 | build:
 6 |   os: "ubuntu-22.04"
 7 |   tools:
 8 |     python: "3.10"
 9 | 
10 | 
11 | sphinx:
12 |   configuration: docs/zh_cn/conf.py
13 | 
14 | 
15 | python:
16 |   install:
17 |     - requirements: requirements/docs.txt
18 |     - requirements: requirements/readthedocs.txt
19 | 


--------------------------------------------------------------------------------
/docs/zh_cn/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = .
 8 | BUILDDIR      = _build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 | 


--------------------------------------------------------------------------------
/docs/zh_cn/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
 1 | table.autosummary td {
 2 |   width: 50%
 3 | }
 4 | 
 5 | img.align-center {
 6 |   display: block;
 7 |   margin-left: auto;
 8 |   margin-right: auto;
 9 | }
10 | 


--------------------------------------------------------------------------------
/docs/zh_cn/api/pipeline.rst:
--------------------------------------------------------------------------------
 1 | 推理 pipeline
 2 | ==================
 3 | .. currentmodule:: lmdeploy
 4 | 
 5 | pipeline
 6 | --------
 7 | .. autofunction:: pipeline
 8 | 
 9 | serving
10 | --------
11 | .. autofunction:: serve
12 | .. autofunction:: client
13 | 
14 | 
15 | PytorchEngineConfig
16 | -------------------
17 | .. autoclass:: PytorchEngineConfig
18 | 
19 | 
20 | TurbomindEngineConfig
21 | ---------------------
22 | .. autoclass:: TurbomindEngineConfig
23 | 
24 | 
25 | GenerationConfig
26 | ----------------
27 | .. autoclass:: GenerationConfig
28 | 
29 | 
30 | ChatTemplateConfig
31 | ------------------
32 | .. autoclass:: ChatTemplateConfig
33 | 


--------------------------------------------------------------------------------
/docs/zh_cn/get_started/index.rst:
--------------------------------------------------------------------------------
1 | 其他软硬件平台
2 | =================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 |    :caption: NPU(Huawei)
7 | 
8 |    ascend/get_started.md
9 | 


--------------------------------------------------------------------------------
/docs/zh_cn/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/zh_cn/multi_modal/gemma3.md:
--------------------------------------------------------------------------------
 1 | # Gemma3
 2 | 
 3 | ## 简介
 4 | 
 5 | Gemma 是 Google 推出的轻量级、最先进的开放模型系列，采用与创建 Gemini 模型相同的研究和技术构建而成。Gemma3 模型是多模态模型，可处理文本和图像输入并生成文本输出，对预训练和指令微调均具有开源的权重。Gemma3 具有 128K 的大型上下文窗口，支持 140 多种语言，并且比以前的版本提供更多尺寸。Gemma3 模型非常适合各种文本生成和图像理解任务，包括问答、总结和推理。它们的尺寸相对较小，因此可以将其部署在资源有限的环境中，例如笔记本电脑、台式机或您自己的云基础设施，从而让每个人都能轻松访问最先进的 AI 模型，并帮助促进创新。
 6 | 
 7 | ## 快速开始
 8 | 
 9 | 请参考[安装文档](../get_started/installation.md)安装 LMDeploy。
10 | 
11 | ### 准备
12 | 
13 | 在使用 LMDeploy 部署 **Gemma3** 模型时，请安装最新的 transformers。
14 | 
15 | ### 离线推理 pipeline
16 | 
17 | 以下是使用pipeline进行离线推理的示例，更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md)。
18 | 
19 | ```python
20 | from lmdeploy import pipeline
21 | from lmdeploy.vl import load_image
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     pipe = pipeline('google/gemma-3-12b-it')
26 | 
27 |     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
28 |     response = pipe(('describe this image', image))
29 |     print(response)
30 | ```
31 | 


--------------------------------------------------------------------------------
/docs/zh_cn/multi_modal/index.rst:
--------------------------------------------------------------------------------
 1 | 视觉语言模型
 2 | =================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: 示例
 7 | 
 8 |    deepseek_vl2.md
 9 |    llava.md
10 |    internvl.md
11 |    xcomposer2d5.md
12 |    cogvlm.md
13 |    minicpmv.md
14 |    phi3.md
15 |    mllama.md
16 |    qwen2_vl.md
17 |    qwen2_5_vl.md
18 |    molmo.md
19 |    gemma3.md
20 | 


--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | WORKSPACE_PATH=$(dirname "$(readlink -f "$0")")
 3 | 
 4 | builder="-G Ninja"
 5 | 
 6 | if [ "$1" == "make" ]; then
 7 |     builder=""
 8 | fi
 9 | 
10 | cmake ${builder} .. \
11 |     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
12 |     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
13 |     -DCMAKE_INSTALL_PREFIX=${WORKSPACE_PATH}/install \
14 |     -DBUILD_PY_FFI=ON \
15 |     -DBUILD_MULTI_GPU=ON \
16 |     -DCMAKE_CUDA_FLAGS="-lineinfo" \
17 |     -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
18 |     -DUSE_NVTX=ON
19 | 


--------------------------------------------------------------------------------
/k8s/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     app: internlm2-chat-7b
 6 |   name: internlm2-chat-7b-svc
 7 | spec:
 8 |   ports:
 9 |   - name: main
10 |     port: 23333
11 |     protocol: TCP
12 |     targetPort: main
13 |   selector:
14 |     app: internlm2-chat-7b
15 |   type: ClusterIP
16 | 


--------------------------------------------------------------------------------
/lmdeploy/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .api import client, pipeline, serve
 4 | from .messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
 5 | from .model import ChatTemplateConfig
 6 | from .tokenizer import Tokenizer
 7 | from .version import __version__, version_info
 8 | 
 9 | __all__ = [
10 |     'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', '__version__', 'version_info', 'ChatTemplateConfig',
11 |     'PytorchEngineConfig', 'TurbomindEngineConfig', 'VisionConfig'
12 | ]
13 | 


--------------------------------------------------------------------------------
/lmdeploy/__main__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .cli import run
3 | 
4 | if __name__ == '__main__':
5 |     run()
6 | 


--------------------------------------------------------------------------------
/lmdeploy/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .entrypoint import run
3 | 
4 | __all__ = ['run']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .apis import *  # noqa: F401,F403
3 | from .quantization import *  # noqa: F401,F403
4 | from .utils import *  # noqa: F401,F403
5 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/apis/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/defaults.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch import nn
3 | 
4 | OFFLOAD_MOD = (nn.Linear, )
5 | KV_CACHE_SIGNATURE = 'past_key_value'
6 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/internlm2_gptq.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from auto_gptq.modeling import BaseGPTQForCausalLM
 3 | 
 4 | 
 5 | class InternLM2GPTQForCausalLM(BaseGPTQForCausalLM):
 6 |     layer_type = 'InternLM2DecoderLayer'
 7 |     layers_block_name = 'model.layers'
 8 |     outside_layer_modules = ['model.tok_embeddings', 'model.norm']
 9 |     inside_layer_modules = [
10 |         ['attention.wqkv'],
11 |         ['attention.wo'],
12 |         ['feed_forward.w3', 'feed_forward.w1'],
13 |         ['feed_forward.w2'],
14 |     ]
15 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/internlm3_gptq.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from auto_gptq.modeling import BaseGPTQForCausalLM
 3 | 
 4 | 
 5 | class InternLM3GPTQForCausalLM(BaseGPTQForCausalLM):
 6 |     layer_type = 'InternLM3DecoderLayer'
 7 |     layers_block_name = 'model.layers'
 8 |     outside_layer_modules = ['model.embed_tokens', 'model.norm']
 9 |     inside_layer_modules = [
10 |         ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
11 |         ['self_attn.o_proj'],
12 |         ['mlp.up_proj', 'mlp.gate_proj'],
13 |         ['mlp.down_proj'],
14 |     ]
15 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .activation import ActivationObserver, KVCacheObserver
3 | from .calibration import CalibrationContext, CalibrationContextV2
4 | from .weight import WeightQuantizer
5 | 
6 | __all__ = ['WeightQuantizer', 'ActivationObserver', 'KVCacheObserver', 'CalibrationContext', 'CalibrationContextV2']
7 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/activation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .observer import ActivationObserver, KVCacheObserver
3 | 
4 | __all__ = ['ActivationObserver', 'KVCacheObserver']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .linear import WeightOnlyQLinear
3 | 
4 | __all__ = ['WeightOnlyQLinear']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/weight/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .quantizer import WeightQuantizer
3 | 
4 | __all__ = ['WeightQuantizer']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/lite/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
 4 | from .cal_qparams import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax,
 5 |                           cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax,
 6 |                           cal_qparams_per_tensor_minmax, precise_round)
 7 | from .calib_dataloader import get_calib_loaders
 8 | from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
 9 | from .global_avail import GlobalAvailMixin
10 | from .load import load_hf_from_pretrained
11 | 
12 | __all__ = [
13 |     'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax', 'cal_qparams_per_group_absmax',
14 |     'cal_qparams_per_group_minmax', 'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax', 'QParams',
15 |     'get_calib_loaders', 'collect_target_modules', 'precise_round', 'collect_target_weights', 'GlobalAvailMixin',
16 |     'split_decoder_layer_inputs', 'bimap_name_mod', 'concat_decoder_layer_outputs', 'load_hf_from_pretrained'
17 | ]
18 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/adapter/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import OpType  # noqa: F401
3 | from .selector import get_backend  # noqa: F401
4 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/activation.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | 
 5 | class SiluAndMulImpl(ABC):
 6 |     """Silu + multiple residual fused implementation."""
 7 | 
 8 |     @abstractmethod
 9 |     def forward(self, x):
10 |         """forward."""
11 |         raise NotImplementedError
12 | 
13 | 
14 | class SiluAndMulBuilder(ABC):
15 |     """Silu and mul implementation builder."""
16 | 
17 |     @staticmethod
18 |     @abstractmethod
19 |     def build(inplace: bool = False):
20 |         """build."""
21 |         raise NotImplementedError
22 | 
23 | 
24 | class GeluAndMulImpl(ABC):
25 |     """Gelu + multiple residual fused implementation."""
26 | 
27 |     @abstractmethod
28 |     def forward(self, x):
29 |         """forward."""
30 |         raise NotImplementedError
31 | 
32 | 
33 | class GeluAndMulBuilder(ABC):
34 |     """Gelu and mul implementation builder."""
35 | 
36 |     @staticmethod
37 |     @abstractmethod
38 |     def build(approximate: str = 'none'):
39 |         """build."""
40 |         raise NotImplementedError
41 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/apply_rotary_emb.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | from torch import Tensor
 5 | 
 6 | 
 7 | class ApplyRotaryEmbImpl(ABC):
 8 |     """Apply rotary embedding implementation."""
 9 | 
10 |     @abstractmethod
11 |     def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
12 |         """forward."""
13 |         raise NotImplementedError
14 | 
15 | 
16 | class ApplyRotaryEmbBuilder(ABC):
17 |     """Apply rotary embedding implementation builder."""
18 | 
19 |     @staticmethod
20 |     @abstractmethod
21 |     def build():
22 |         """Build implementation."""
23 |         raise NotImplementedError
24 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/awq_modules.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | from typing import Optional
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | class LinearW4A16Impl(ABC):
 9 |     """W4a16 linear implementation."""
10 | 
11 |     def update_weights(self,
12 |                        qweight: torch.Tensor,
13 |                        scales: torch.Tensor,
14 |                        qzeros: torch.Tensor,
15 |                        bias: Optional[torch.Tensor] = None):
16 |         """Update weights."""
17 |         return qweight, scales, qzeros, bias
18 | 
19 |     @abstractmethod
20 |     def forward(self, x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, all_reduce: bool = False):
21 |         """forward."""
22 |         raise NotImplementedError
23 | 
24 | 
25 | class LinearW4A16Builder(ABC):
26 |     """W4a16 linear implementation builder."""
27 | 
28 |     @staticmethod
29 |     @abstractmethod
30 |     def build(in_features: int,
31 |               out_features: int,
32 |               w_bit: int,
33 |               group_size: int,
34 |               bias: bool = False,
35 |               dtype: torch.dtype = None):
36 |         """build."""
37 |         raise NotImplementedError
38 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/blockedf8_modules.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | from typing import List, Optional
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | class LinearBlockedF8Impl(ABC):
 9 |     """Linear BlockedF8 implementation api."""
10 | 
11 |     def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
12 |         """Update weights."""
13 |         return weight, scale, bias
14 | 
15 |     @abstractmethod
16 |     def forward(self,
17 |                 x,
18 |                 weight: torch.Tensor,
19 |                 scale: torch.Tensor,
20 |                 bias: Optional[torch.Tensor] = None,
21 |                 all_reduce: bool = False,
22 |                 rank: int = 0,
23 |                 scatter_size: List[int] = None):
24 |         """forward."""
25 |         raise NotImplementedError
26 | 
27 | 
28 | class LinearBlockedF8Builder(ABC):
29 |     """Linear BlockedF8 implementation builder."""
30 | 
31 |     @staticmethod
32 |     @abstractmethod
33 |     def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None):
34 |         """build."""
35 |         raise NotImplementedError
36 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import CudaOpsBackend  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/activation.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul
 3 | 
 4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl
 5 | 
 6 | 
 7 | class TritonSiluAndMulImpl(SiluAndMulImpl):
 8 |     """Silu + multiple residual fused implementation."""
 9 | 
10 |     def __init__(self, inplace: bool):
11 |         self.inplace = inplace
12 | 
13 |     def forward(self, x):
14 |         """forward."""
15 |         out = None
16 |         x_shape = None
17 |         if x.dim() != 2:
18 |             x_shape = x.shape
19 |             x = x.flatten(0, -2)
20 |         if self.inplace:
21 |             out = x.chunk(2, -1)[0]
22 | 
23 |         out = silu_and_mul(x, out)
24 | 
25 |         if x_shape is not None:
26 |             out = out.unflatten(0, x_shape[:-1])
27 |         return out
28 | 
29 | 
30 | class TritonSiluAndMulBuilder(SiluAndMulBuilder):
31 |     """Silu and mul implementation builder."""
32 | 
33 |     @staticmethod
34 |     def build(inplace: bool = False):
35 |         """build."""
36 |         return TritonSiluAndMulImpl(inplace)
37 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/apply_rotary_emb.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | from torch import Tensor
 4 | 
 5 | from lmdeploy.pytorch.kernels.cuda import apply_rotary_pos_emb
 6 | 
 7 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl
 8 | 
 9 | 
10 | class TritonApplyRotaryEmbImpl(ApplyRotaryEmbImpl):
11 |     """Apply rotary embedding implementation."""
12 | 
13 |     def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
14 |         """forward."""
15 |         if inplace:
16 |             q_embed = query
17 |             k_embed = key
18 |         else:
19 |             q_embed = torch.empty_like(query)
20 |             k_embed = torch.empty_like(key)
21 |         return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed)
22 | 
23 | 
24 | class TritonApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder):
25 |     """Apply rotary embedding implementation builder."""
26 | 
27 |     @staticmethod
28 |     def build():
29 |         """Build implementation."""
30 |         return TritonApplyRotaryEmbImpl()
31 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/multinomial_sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | from lmdeploy.pytorch.kernels.cuda import multinomial_sampling
 6 | 
 7 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl
 8 | 
 9 | 
10 | class TritonMultinomialSamplingImpl(MultinomialSamplingImpl):
11 | 
12 |     def forward(self,
13 |                 scores: torch.Tensor,
14 |                 seeds: torch.LongTensor,
15 |                 offsets: torch.LongTensor,
16 |                 indices: torch.Tensor = None):
17 |         """forward."""
18 |         return multinomial_sampling(scores, seeds, offsets, indices)
19 | 
20 | 
21 | class TritonMultinomialSamplingBuilder(MultinomialSamplingBuilder):
22 |     """Triton multinomial sampling builder."""
23 | 
24 |     def build():
25 |         """build."""
26 |         return TritonMultinomialSamplingImpl()
27 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | from lmdeploy.pytorch.kernels.cuda import rms_norm
 5 | 
 6 | from ..norm import RMSNormBuilder, RMSNormImpl
 7 | 
 8 | 
 9 | class TritonRMSNormImpl(RMSNormImpl):
10 |     """Triton RMS norm implementation."""
11 | 
12 |     def __init__(self, hidden_size: int, eps: float = 1e-6):
13 |         self.hidden_size = hidden_size
14 |         self.eps = eps
15 | 
16 |     def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
17 |         """forward."""
18 |         if residual is None:
19 |             x = rms_norm(x, weight, self.eps)
20 |             return x
21 |         else:
22 |             x, residual = rms_norm(x, weight, self.eps, residual=residual)
23 |             return x, residual
24 | 
25 | 
26 | class TritonRMSNormBuilder(RMSNormBuilder):
27 |     """Triton RMS norm implementation builder."""
28 | 
29 |     @staticmethod
30 |     def build(weight: torch.Tensor, eps: float = 1e-6):
31 |         """build."""
32 |         return TritonRMSNormImpl(weight, eps)
33 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import DefaultOpsBackend  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/moe.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | from ..moe import SoftmaxTopKBuilder, SoftmaxTopKImpl
 5 | 
 6 | 
 7 | class DefaultSoftmaxTopKImpl(SoftmaxTopKImpl):
 8 |     """RMS norm implementation api."""
 9 | 
10 |     def __init__(self, top_k: int, dim: int = -1):
11 |         self.top_k = top_k
12 |         self.dim = dim
13 | 
14 |     def forward(self, x: torch.Tensor):
15 |         """forward."""
16 |         routing_weights = torch.softmax(x, dim=self.dim, dtype=torch.float32)
17 |         topk_weights, topk_ids = torch.topk(routing_weights, self.top_k, dim=self.dim)
18 |         return topk_weights, topk_ids
19 | 
20 | 
21 | class DefaultSoftmaxTopKBuilder(SoftmaxTopKBuilder):
22 |     """RMS norm implementation builder."""
23 | 
24 |     @staticmethod
25 |     def build(top_k: int, dim: int = -1):
26 |         """build."""
27 |         return DefaultSoftmaxTopKImpl(top_k, dim)
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/multinomial_sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | import torch
 4 | 
 5 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl
 6 | 
 7 | 
 8 | class DefaultMultinomialSamplingImpl(MultinomialSamplingImpl):
 9 |     """Multinomial sampling implementation api."""
10 | 
11 |     def forward(self,
12 |                 scores: torch.Tensor,
13 |                 seeds: torch.LongTensor,
14 |                 offsets: torch.LongTensor,
15 |                 indices: torch.Tensor = None):
16 |         """forward."""
17 |         sampled_index = torch.multinomial(scores, num_samples=1, replacement=True)
18 |         outputs = torch.gather(indices, dim=1, index=sampled_index)
19 |         return outputs.view(-1)
20 | 
21 | 
22 | class DefaultMultinomialSamplingBuilder(MultinomialSamplingBuilder):
23 |     """Multinomial sampling implementation builder."""
24 | 
25 |     def build():
26 |         """build."""
27 |         return DefaultMultinomialSamplingImpl()
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .ascend import AscendOpsBackend  # noqa: F401
3 | from .camb import CambOpsBackend  # noqa: F401
4 | from .maca import MacaOpsBackend  # noqa: F401
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/activation.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from lmdeploy.pytorch.kernels.dlinfer.activation import silu_and_mul
 3 | 
 4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl
 5 | 
 6 | 
 7 | class DlinferSiluAndMulImpl(SiluAndMulImpl):
 8 |     """Silu + multiple fused implementation."""
 9 | 
10 |     def forward(self, x):
11 |         """forward."""
12 |         return silu_and_mul(x)
13 | 
14 | 
15 | class DlinferSiluAndMulBuilder(SiluAndMulBuilder):
16 |     """Silu and mul implementation builder."""
17 | 
18 |     @staticmethod
19 |     def build(inplace: bool = False):
20 |         """build."""
21 |         return DlinferSiluAndMulImpl()
22 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from torch import Tensor
 3 | 
 4 | from lmdeploy.pytorch.kernels.dlinfer import apply_rotary_pos_emb
 5 | 
 6 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl
 7 | 
 8 | 
 9 | class DlinferApplyRotaryEmbImpl(ApplyRotaryEmbImpl):
10 |     """Apply rotary embedding implementation."""
11 | 
12 |     def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
13 |         """forward."""
14 |         if inplace:
15 |             q_embed = None
16 |             k_embed = None
17 |         else:
18 |             q_embed = query.new_empty(query.shape)
19 |             k_embed = key.new_empty(key.shape)
20 |         return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed)
21 | 
22 | 
23 | class DlinferApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder):
24 |     """Apply rotary embedding implementation builder."""
25 | 
26 |     @staticmethod
27 |     def build():
28 |         """Build implementation."""
29 |         return DlinferApplyRotaryEmbImpl()
30 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/ascend/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import AscendOpsBackend, SocVersion  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/ascend/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | import torch_npu
 4 | 
 5 | ACL_FORMAT_FRACTAL_NZ = 29
 6 | 
 7 | 
 8 | def nd_to_nz_spec(tensor: torch.Tensor) -> torch.Tensor:
 9 |     '''
10 |     This function is copied from vllm-ascend commit hash: 420e794c35fe887db2be81cf9db0461f5b71da0b
11 |     It converts a tensor in ACL_FORMAT_ND format to ACL_FORMAT_FRACTAL_NZ format for Ascend 310P devices.
12 |     It behaves similarly to the TransdataOperation and it requires the input tensor to be 2D.
13 |     '''
14 |     num_tokens = tensor.shape[0]
15 |     max_seq_len = tensor.shape[1]
16 | 
17 |     tokens_pad = (num_tokens + 15) // 16 * 16
18 |     max_seq_len_pad = (max_seq_len + 15) // 16 * 16
19 | 
20 |     tensor_pad = \
21 |         torch.zeros((1, tokens_pad, max_seq_len_pad), dtype=tensor.dtype, device=tensor.device)
22 | 
23 |     tensor_pad[0][:num_tokens, :max_seq_len] = tensor
24 |     tensor_nz = tensor_pad.reshape((1, tokens_pad, max_seq_len_pad // 16, 16)).permute(0, 2, 1, 3)
25 | 
26 |     tensor_nz = torch_npu.npu_format_cast(tensor_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ)
27 |     return tensor_nz
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/camb/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import CambOpsBackend  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/maca/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import MacaOpsBackend  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | from lmdeploy.pytorch.kernels.dlinfer import rms_norm
 5 | 
 6 | from ..norm import RMSNormBuilder, RMSNormImpl
 7 | 
 8 | 
 9 | class DlinferRMSNormImpl(RMSNormImpl):
10 |     """Dlinfer RMS norm implementation."""
11 | 
12 |     def __init__(self, hidden_size: int, eps: float = 1e-6):
13 |         self.hidden_size = hidden_size
14 |         self.eps = eps
15 | 
16 |     def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
17 |         """forward."""
18 |         if residual is None:
19 |             x = rms_norm(x, weight, self.eps)
20 |             return x
21 |         else:
22 |             x, residual = rms_norm(x, weight, self.eps, residual=residual)
23 |             return x, residual
24 | 
25 | 
26 | class DlinferRMSNormBuilder(RMSNormBuilder):
27 |     """Dlinfer RMS norm implementation builder."""
28 | 
29 |     @staticmethod
30 |     def build(weight: torch.Tensor, eps: float = 1e-6):
31 |         """build."""
32 |         return DlinferRMSNormImpl(weight, eps)
33 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/flash_attention.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | from torch import Tensor
 5 | 
 6 | 
 7 | class FlashAttentionImpl(ABC):
 8 |     """FlashAttention implementation."""
 9 | 
10 |     def forward(self,
11 |                 query: Tensor,
12 |                 key: Tensor,
13 |                 value: Tensor,
14 |                 q_start_loc: Tensor,
15 |                 q_seqlens: Tensor,
16 |                 kv_start_loc: Tensor,
17 |                 kv_seqlens: Tensor,
18 |                 max_q_seqlen: int = None):
19 |         """forward."""
20 |         raise NotImplementedError
21 | 
22 | 
23 | class FlashAttentionBuilder(ABC):
24 |     """FlashAttention implementation builder."""
25 | 
26 |     @staticmethod
27 |     @abstractmethod
28 |     def build(
29 |         num_heads: int,
30 |         head_dim: int,
31 |         scale: float = None,
32 |         num_kv_heads: int = None,
33 |         v_head_dim: int = None,
34 |         causal: bool = True,
35 |         sliding_window: int = None,
36 |         logical_softcapping: float = None,
37 |         **kwargs,
38 |     ) -> FlashAttentionImpl:
39 |         """build."""
40 |         raise NotImplementedError
41 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/linear.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | from typing import List, Optional
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | class LinearImpl(ABC):
 9 |     """Linear implementation api."""
10 | 
11 |     def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
12 |         """Update weights."""
13 |         return weight, bias
14 | 
15 |     @abstractmethod
16 |     def forward(self,
17 |                 x,
18 |                 weight: torch.Tensor,
19 |                 bias: Optional[torch.Tensor] = None,
20 |                 all_reduce: bool = False,
21 |                 rank: int = 0,
22 |                 scatter_size: List[int] = None):
23 |         """forward."""
24 |         raise NotImplementedError
25 | 
26 | 
27 | class LinearBuilder(ABC):
28 |     """Linear implementation builder."""
29 | 
30 |     @staticmethod
31 |     @abstractmethod
32 |     def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None):
33 |         """build."""
34 |         raise NotImplementedError
35 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/multinomial_sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | class MultinomialSamplingImpl(ABC):
 8 |     """Multinomial sampling implementation api."""
 9 | 
10 |     @abstractmethod
11 |     def forward(scores: torch.Tensor, seeds: torch.LongTensor, offsets: torch.LongTensor, indices: torch.Tensor = None):
12 |         """forward."""
13 |         raise NotImplementedError
14 | 
15 | 
16 | class MultinomialSamplingBuilder(ABC):
17 |     """Multinomial sampling implementation builder."""
18 | 
19 |     @staticmethod
20 |     @abstractmethod
21 |     def build():
22 |         """build."""
23 |         raise NotImplementedError
24 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | class RMSNormImpl(ABC):
 8 |     """RMS norm implementation api."""
 9 | 
10 |     @abstractmethod
11 |     def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
12 |         """forward."""
13 |         raise NotImplementedError
14 | 
15 | 
16 | class RMSNormBuilder(ABC):
17 |     """RMS norm implementation builder."""
18 | 
19 |     @staticmethod
20 |     @abstractmethod
21 |     def build(hidden_size: int, eps: float = 1e-6):
22 |         """build."""
23 |         raise NotImplementedError
24 | 
25 | 
26 | class LayerNormImpl(ABC):
27 |     """Layer norm implementation api."""
28 | 
29 |     @abstractmethod
30 |     def forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor = None, residual: torch.Tensor = None):
31 |         """forward."""
32 |         raise NotImplementedError
33 | 
34 | 
35 | class LayerNormBuilder(ABC):
36 |     """Layer norm implementation builder."""
37 | 
38 |     @staticmethod
39 |     @abstractmethod
40 |     def build(normalized_shape: int, eps: float = 1e-6):
41 |         """build."""
42 |         raise NotImplementedError
43 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/adapter.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .base import BaseChecker
 3 | 
 4 | 
 5 | class AdapterChecker(BaseChecker):
 6 |     """Check adapter is available."""
 7 | 
 8 |     def __init__(self, adapter_path: str, logger=None):
 9 |         super().__init__(logger)
10 |         self.adapter_path = adapter_path
11 | 
12 |     def check(self):
13 |         """check."""
14 |         path = self.adapter_path
15 | 
16 |         try:
17 |             import peft  # noqa: F401
18 |         except Exception as e:
19 |             self.log_and_exit(e, 'Adapter', message='Failed to import peft.')
20 | 
21 |         try:
22 |             from peft import PeftConfig
23 |             PeftConfig.from_pretrained(path)
24 |         except Exception as e:
25 |             message = ('Please make sure the adapter can be loaded with '
26 |                        '`peft.PeftConfig.from_pretrained`\n')
27 |             err_msg = '' if len(e.args) == 0 else e.args[0]
28 |             if 'got an unexpected keyword argument' in err_msg:
29 |                 message += ('Or try remove all unexpected keywords '
30 |                             'in `adapter_config.json`.')
31 |             self.log_and_exit(e, 'Adapter', message=message)
32 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/cuda.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .base import BaseChecker
 3 | 
 4 | 
 5 | class CudaChecker(BaseChecker):
 6 |     """Check pytorch is available."""
 7 | 
 8 |     def __init__(self, model_format: str = None, logger=None) -> None:
 9 |         super().__init__(logger=logger)
10 |         self.model_format = model_format
11 | 
12 |     def check(self):
13 |         """check."""
14 |         import torch
15 | 
16 |         if not torch.cuda.is_available():
17 |             self.log_and_exit(mod_name='CUDA', message='cuda is not available.')
18 | 
19 |         if self.model_format == 'fp8':
20 |             props = torch.cuda.get_device_properties(0)
21 |             if props.major < 9:
22 |                 self.log_and_exit(mod_name='CUDA', message='model_format=fp8 requires sm>=9.0.')
23 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/deeplink.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from lmdeploy.utils import try_import_deeplink
 3 | 
 4 | from .base import BaseChecker
 5 | 
 6 | 
 7 | class DeeplinkChecker(BaseChecker):
 8 |     """Check pytorch is available."""
 9 | 
10 |     def __init__(self, device_type: str, logger=None) -> None:
11 |         super().__init__(logger=logger)
12 |         self.device_type = device_type
13 | 
14 |     def check(self):
15 |         """check."""
16 |         try_import_deeplink(self.device_type)
17 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/torch.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .base import BaseChecker
 3 | 
 4 | 
 5 | class TorchChecker(BaseChecker):
 6 |     """Check pytorch is available."""
 7 | 
 8 |     def __init__(self, device: str = 'cuda', logger=None) -> None:
 9 |         super().__init__(logger=logger)
10 |         self.device = device
11 | 
12 |     def check(self):
13 |         """check."""
14 |         try:
15 |             import torch
16 |             a = torch.tensor([1, 2], device=self.device)
17 |             b = a.new_tensor([3, 4], device=self.device)
18 |             c = a + b
19 |             torch.testing.assert_close(c, a.new_tensor([4, 6]))
20 |         except Exception as e:
21 |             self.log_and_exit(e, 'PyTorch', 'PyTorch is not available.')
22 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/transformers.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from packaging import version
 3 | 
 4 | from .base import BaseChecker
 5 | 
 6 | MIN_TRANSFORMERS_VERSION = '4.33.0'
 7 | MAX_TRANSFORMERS_VERSION = '4.51.0'
 8 | 
 9 | 
10 | class TransformersChecker(BaseChecker):
11 |     """Check transformers is available."""
12 | 
13 |     def check(self):
14 |         """check."""
15 |         import transformers
16 |         logger = self.get_logger()
17 |         try:
18 |             trans_version = version.parse(transformers.__version__)
19 |             min_version = version.parse(MIN_TRANSFORMERS_VERSION)
20 |             max_version = version.parse(MAX_TRANSFORMERS_VERSION)
21 |             if trans_version < min_version or trans_version > max_version:
22 |                 logger.warning('LMDeploy requires transformers version: '
23 |                                f'[{MIN_TRANSFORMERS_VERSION} ~ '
24 |                                f'{MAX_TRANSFORMERS_VERSION}], '
25 |                                'but found version: '
26 |                                f'{transformers.__version__}')
27 |         except Exception as e:
28 |             self.log_and_exit(e, 'transformers', 'transformers is not available.')
29 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/triton_custom_add.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | import triton
 4 | import triton.language as tl
 5 | 
 6 | 
 7 | @triton.jit
 8 | def _add_kernel(A, B, C, size, BLOCK: tl.constexpr):
 9 |     """Add kernel."""
10 |     prog_id = tl.program_id(0)
11 |     offs = prog_id * BLOCK + tl.arange(0, BLOCK)
12 |     a = tl.load(A + offs, mask=offs < size)
13 |     b = tl.load(B + offs, mask=offs < size)
14 |     tl.store(C + offs, a + b, mask=offs < size)
15 | 
16 | 
17 | def custom_add(a, b):
18 |     """Custom add one."""
19 |     c = torch.empty_like(a)
20 |     size = c.size(0)
21 |     BLOCK = 16
22 | 
23 |     grid = (triton.cdiv(size, BLOCK), )
24 |     _add_kernel[grid](a, b, c, size, BLOCK=BLOCK)
25 |     return c
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     a = torch.tensor([1, 2], device='cuda')
30 |     b = a.new_tensor([3, 4], device='cuda')
31 |     c = custom_add(a, b)
32 |     torch.testing.assert_close(c, a + b)
33 |     print('Done.')
34 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import importlib
 3 | import pkgutil
 4 | 
 5 | from .builder import AutoModelConfigBuilder
 6 | 
 7 | __all__ = []
 8 | 
 9 | # load all submodule
10 | for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
11 |     __all__.append(module_name)
12 |     _module = importlib.import_module('{}.{}'.format(__name__, module_name))
13 |     globals()[module_name] = _module
14 | 
15 | __all__ += ['AutoModelConfigBuilder']
16 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/cogvlm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class CogVLMModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         model_arch = hf_config.architectures[0] if hf_config.architectures else None
12 |         return model_arch == 'CogVLMForCausalLM'
13 | 
14 |     @classmethod
15 |     def build(cls, hf_config, model_path: str = None, **kwargs):
16 |         """build."""
17 |         from lmdeploy.utils import is_bf16_supported
18 |         if getattr(hf_config, 'num_multi_query_heads', None):
19 |             hf_config.num_key_value_heads = hf_config.num_multi_query_heads
20 |         else:
21 |             hf_config.num_key_value_heads = hf_config.num_attention_heads
22 | 
23 |         cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
24 |         cfg.cogvlm_style = True
25 |         torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
26 |         hf_config.torch_dtype = torch_dtype
27 |         return cfg
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/deepseek_vl2.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class DeepseekVLV2ModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.model_type in ['deepseek_vl_v2']
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build deepseek-vl2."""
16 | 
17 |         if hf_config.language_config.use_mla:
18 |             from .deepseek_v2 import DeepseekV2ModelConfigBuilder
19 |             cfg = DeepseekV2ModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs)
20 |             cfg.hf_config = hf_config
21 |         else:
22 |             # deepseek-vl2-tiny uses MHA, rather than MLA
23 |             # in this case, we use DefaultModelConfigBuilder
24 |             cfg = DefaultModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs)
25 |             cfg.hf_config = hf_config
26 | 
27 |         return cfg
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/internvl.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class InternVLModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.architectures[0] == 'InternVLChatModel'
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build llava hf."""
16 |         cfg = DefaultModelConfigBuilder.build(hf_config.llm_config, model_path, **kwargs)
17 |         cfg.hf_config = hf_config
18 |         return cfg
19 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/internvl3_hf.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class InternVL3ModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.architectures[0] == 'InternVLForConditionalGeneration'
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build config."""
16 |         cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
17 |         cfg.hf_config = hf_config
18 |         return cfg
19 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/llama4.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class Llama4ModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.model_type in ['llama4']
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build llama4."""
16 |         cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
17 |         cfg.hf_config = hf_config
18 | 
19 |         return cfg
20 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/minicpm3.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .builder import AutoModelConfigBuilder
 4 | from .default import DefaultModelConfigBuilder
 5 | 
 6 | 
 7 | class MiniCPM3ModelConfigBuilder(AutoModelConfigBuilder):
 8 | 
 9 |     @classmethod
10 |     def condition(cls, hf_config):
11 |         """config."""
12 |         return hf_config.architectures[0] in ['MiniCPM3ForCausalLM']
13 | 
14 |     @classmethod
15 |     def build(cls, hf_config, model_path: str = None, **kwargs):
16 |         """build."""
17 |         head_dim = (hf_config.qk_nope_head_dim + hf_config.qk_rope_head_dim)
18 | 
19 |         cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
20 |         cfg.head_dim = head_dim
21 |         cfg.k_head_dim = head_dim
22 |         cfg.v_head_dim = head_dim
23 | 
24 |         return cfg
25 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/mllama.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class MLlamaModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.architectures[0] == 'MllamaForConditionalGeneration'
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build llava hf."""
16 |         cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
17 |         cfg.hf_config = hf_config
18 |         return cfg
19 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/qwen.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .builder import AutoModelConfigBuilder
 3 | from .default import DefaultModelConfigBuilder
 4 | 
 5 | 
 6 | class QwenModelConfigBuilder(AutoModelConfigBuilder):
 7 | 
 8 |     @classmethod
 9 |     def condition(cls, hf_config):
10 |         """config."""
11 |         return hf_config.model_type == 'qwen'
12 | 
13 |     @classmethod
14 |     def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """build."""
16 |         from lmdeploy.utils import is_bf16_supported
17 |         cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
18 |         if cfg.bos_token_id is None:
19 |             cfg.bos_token_id = 151644
20 |         if cfg.eos_token_id is None:
21 |             cfg.eos_token_id = 151645
22 | 
23 |         torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
24 |         if hf_config.bf16 and is_bf16_supported():
25 |             torch_dtype = 'bfloat16'
26 |         elif hf_config.fp16:
27 |             torch_dtype = 'float16'
28 |         hf_config.torch_dtype = torch_dtype
29 |         return cfg
30 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | from lmdeploy.utils import get_logger
 5 | 
 6 | logger = get_logger('lmdeploy')
 7 | 
 8 | 
 9 | def flash_mla_available():
10 |     """Check if flash mla is available."""
11 |     # use flash_mla by default if it is installed
12 |     use_flash_mla = False
13 |     try:
14 |         # torch_npu device_properties doesn't have 'major' attribute
15 |         device_properties = torch.cuda.get_device_properties(0)
16 |         if hasattr(device_properties, 'major') and device_properties.major >= 9:
17 |             import flash_mla  # noqa
18 |             use_flash_mla = True
19 |     except ImportError:
20 |         logger.warning('For higher performance, please install flash_mla https://github.com/deepseek-ai/FlashMLA')
21 |     return use_flash_mla
22 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/devices/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .device_manager import DefaultContext, DeviceContext, get_device_manager
3 | 
4 | __all__ = ['DeviceContext', 'DefaultContext', 'get_device_manager']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/backend/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from lmdeploy.logger import get_logger
 3 | 
 4 | logger = get_logger('lmdeploy')
 5 | 
 6 | try:
 7 |     logger.debug('Registering DLSlime Backend')
 8 |     from .dlslime import DLSlimeBackend
 9 | except ImportError:
10 |     logger.warning('Disable DLSlime Backend')
11 | 
12 | try:
13 |     logger.debug('Registering Mooncake Backend')
14 |     from .mooncake import MooncakeBackend
15 | except ImportError:
16 |     logger.warning('Disable Mooncake Backend')
17 | 
18 | try:
19 |     logger.debug('Registering InfiniStoreBackend Backend')
20 |     from .infinistore import InfiniStoreBackend
21 | except ImportError:
22 |     logger.warning('Disable InfiniStoreBackend Backend')
23 | 
24 | __all__ = ['DLSlimeBackend', 'MooncakeBackend', 'InfiniStoreBackend']
25 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/backend/backend.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.registry import Registry
3 | 
4 | MIGRATION_BACKENDS = Registry('migration_backend', locations=['lmdeploy.pytorch.disagg.backend.backend'])
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/request.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import List, Optional
 3 | 
 4 | from pydantic import BaseModel
 5 | 
 6 | from lmdeploy.pytorch.disagg.config import (DistServeEngineConfig, DistServeNVLinkConfig, DistServeRDMAConfig,
 7 |                                             DistServeTCPConfig, MigrationProtocol)
 8 | 
 9 | 
10 | class DistServeConnectionRequest(BaseModel):
11 |     protocol: MigrationProtocol
12 |     remote_engine_id: str
13 |     remote_endpoint_info: str
14 | 
15 | 
16 | class DistServeInitRequest(BaseModel):
17 |     local_engine_id: str
18 |     local_engine_config: DistServeEngineConfig
19 | 
20 |     remote_engine_id: str
21 |     remote_engine_config: DistServeEngineConfig
22 | 
23 |     protocol: MigrationProtocol
24 | 
25 |     rank: Optional[int] = None
26 | 
27 |     tcp_config: Optional[DistServeTCPConfig] = None
28 |     rdma_config: Optional[DistServeRDMAConfig] = None
29 |     nvlink_config: Optional[DistServeNVLinkConfig] = None
30 | 
31 | 
32 | class MigrationRequest(BaseModel):
33 |     protocol: MigrationProtocol
34 | 
35 |     remote_engine_id: str
36 |     remote_session_id: int
37 |     remote_token_id: int
38 |     remote_block_ids: List[int]
39 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .engine import Engine
3 | from .engine_instance import EngineInstance
4 | 
5 | __all__ = ['Engine', 'EngineInstance']
6 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/engine/mp_engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .alibi_pagedattention import alibi_paged_attention_fwd
 4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
 5 | from .fill_kv_cache import fill_kv_cache
 6 | from .fused_moe import fused_moe
 7 | from .fused_rotary_emb import fused_rotary_emb
 8 | from .multinomial_sampling import multinomial_sampling
 9 | from .pagedattention import paged_attention_fwd
10 | from .rms_norm import rms_norm
11 | from .w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8,
12 |                                   rms_norm_dynamic_quant)
13 | 
14 | __all__ = [
15 |     'apply_rotary_pos_emb',
16 |     'fused_moe',
17 |     'fused_rotary_emb',
18 |     'paged_attention_fwd',
19 |     'alibi_paged_attention_fwd',
20 |     'fill_kv_cache',
21 |     'multinomial_sampling',
22 |     'rms_norm',
23 |     'matmul_kernel_dynamic_quant',
24 |     'per_channel_quant',
25 |     'per_token_quant_int8',
26 |     'rms_norm_dynamic_quant',
27 | ]
28 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/alibi_pagedattention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | alibi_paged_attention_fwd = FunctionDispatcher('alibi_paged_attention_fwd').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/apply_rotary_pos_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | apply_rotary_pos_emb = FunctionDispatcher('apply_rotary_pos_emb').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/cuda/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from ..default.w8a8_kernels import per_channel_quant
 3 | from .alibi_pagedattention import alibi_paged_attention_fwd
 4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
 5 | from .fill_kv_cache import fill_kv_cache
 6 | from .flash_mla import flash_mla_fwd
 7 | from .flashattention import flash_attention_fwd
 8 | from .flatten_kv_cache import flatten_kv_cache
 9 | from .fused_moe import fused_moe
10 | from .fused_rotary_emb import fused_rotary_emb
11 | from .multinomial_sampling import multinomial_sampling
12 | from .pagedattention import paged_attention_fwd
13 | from .rms_norm import rms_norm
14 | from .w8a8_fused_moe import fused_moe_w8a8
15 | from .w8a8_triton_kernels import matmul_kernel_dynamic_quant, per_token_quant_int8, rms_norm_dynamic_quant
16 | 
17 | __all__ = [
18 |     'apply_rotary_pos_emb',
19 |     'fused_moe',
20 |     'fused_rotary_emb',
21 |     'paged_attention_fwd',
22 |     'alibi_paged_attention_fwd',
23 |     'fill_kv_cache',
24 |     'multinomial_sampling',
25 |     'rms_norm',
26 |     'matmul_kernel_dynamic_quant',
27 |     'per_channel_quant',
28 |     'per_token_quant_int8',
29 |     'rms_norm_dynamic_quant',
30 |     'flash_attention_fwd',
31 |     'flatten_kv_cache',
32 |     'fused_moe_w8a8',
33 |     'flash_mla_fwd',
34 | ]
35 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .multinomial_sampling import multinomial_sampling
3 | from .w8a8_kernels import per_channel_quant
4 | 
5 | __all__ = [
6 |     'multinomial_sampling',
7 |     'per_channel_quant',
8 | ]
9 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/multinomial_sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | from torch import LongTensor, Tensor
 4 | 
 5 | 
 6 | def multinomial_sampling(scores: Tensor, seeds: LongTensor, offsets: LongTensor, indices: Tensor = None):
 7 |     sampled_index = torch.multinomial(scores, num_samples=1, replacement=True)
 8 |     outputs = torch.gather(indices, dim=1, index=sampled_index)
 9 |     return outputs.view(-1)
10 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/w8a8_kernels.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | 
 5 | def per_channel_quant(x: torch.Tensor, dtype: torch.dtype):
 6 |     """Quantize the input tensor 'x' channel-wise using the given number of
 7 |     bits.
 8 | 
 9 |     Args:
10 |         x (torch.Tensor): The input tensor to be quantized. Must be a
11 |             2-dimensional tensor.
12 |         dtype (torch.dtype): The data type to which the quantized tensor should
13 |             be converted.
14 | 
15 |     Returns:
16 |         tuple: A tuple containing two items -- the quantized tensor and
17 |             the scale used for quantization.
18 |     """
19 |     assert x.ndim == 2
20 |     x = x.to(torch.float32)
21 |     x_absmax = x.view(x.shape[0], -1).abs().max(dim=1, keepdim=True)[0]
22 |     qtype_info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype)
23 |     q_max = qtype_info.max
24 |     q_min = qtype_info.min
25 |     scale = x_absmax / q_max
26 |     x_q = x / scale
27 |     if not dtype.is_floating_point:
28 |         x_q = torch.round(x_q)
29 |     x_q = x_q.clamp(q_min, q_max).to(dtype)
30 |     return x_q, scale
31 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from ..default import multinomial_sampling, per_channel_quant
 3 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
 4 | from .awq_kernels import awq_linear
 5 | from .fill_kv_cache import fill_kv_cache
 6 | from .flash_attention import flash_attention_fwd
 7 | from .fused_moe import fused_moe
 8 | from .linear import linear
 9 | from .moe_gating_topk_softmax import moe_gating_topk_softmax
10 | from .pagedattention import paged_attention_fwd
11 | from .rms_norm import rms_norm
12 | 
13 | __all__ = [
14 |     'rms_norm',
15 |     'apply_rotary_pos_emb',
16 |     'awq_linear',
17 |     'fill_kv_cache',
18 |     'fused_moe',
19 |     'paged_attention_fwd',
20 |     'flash_attention_fwd',
21 |     'linear',
22 |     'moe_gating_topk_softmax',
23 |     'multinomial_sampling',
24 |     'per_channel_quant',
25 | ]
26 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 | 
5 | 
6 | def silu_and_mul(input_tensor: Tensor, ) -> Tensor:
7 |     return ext_ops.silu_and_mul(input_tensor)
8 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Optional, Tuple
 3 | 
 4 | import dlinfer.ops as ext_ops
 5 | from torch import Tensor
 6 | 
 7 | 
 8 | def apply_rotary_pos_emb(
 9 |     query_states: Tensor,
10 |     key_states: Tensor,
11 |     cos: Tensor,
12 |     sin: Tensor,
13 |     q_embed: Optional[Tensor],
14 |     k_embed: Optional[Tensor],
15 | ) -> Tuple[Tensor, Tensor]:
16 |     query_states_embed, key_states_embed = \
17 |         ext_ops.apply_rotary_pos_emb(query_states,
18 |                                      key_states,
19 |                                      cos, sin)
20 |     if q_embed is None:
21 |         q_embed = query_states_embed.view(query_states.shape)
22 |     elif q_embed is not query_states:
23 |         q_embed.copy_(query_states_embed.view(query_states.shape))
24 | 
25 |     if k_embed is None:
26 |         k_embed = key_states_embed.view(key_states.shape)
27 |     elif k_embed is not key_states:
28 |         k_embed.copy_(key_states_embed.view(key_states.shape))
29 | 
30 |     return q_embed, k_embed
31 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Optional
 3 | 
 4 | import dlinfer.ops as ext_ops
 5 | from torch import Tensor
 6 | 
 7 | 
 8 | def awq_linear(x: Tensor,
 9 |                qweight: Tensor,
10 |                scales: Tensor,
11 |                qzeros: Tensor,
12 |                bias: Optional[Tensor] = None,
13 |                all_reduce: bool = False,
14 |                group_size: int = 0):
15 |     return ext_ops.weight_quant_matmul(x.squeeze(0),
16 |                                        qweight,
17 |                                        scales,
18 |                                        offset=qzeros,
19 |                                        bias=bias,
20 |                                        all_reduce=all_reduce,
21 |                                        group_size=group_size).unsqueeze(0)
22 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Optional, Sequence
 3 | 
 4 | import dlinfer.ops as ext_ops
 5 | from torch import Tensor
 6 | 
 7 | 
 8 | def fill_kv_cache(
 9 |     key_states: Tensor,
10 |     value_states: Tensor,
11 |     key_caches: Tensor,
12 |     value_caches: Tensor,
13 |     kv_start_indices: Tensor,
14 |     k_scales_zeros: Sequence[Optional[Tensor]],
15 |     v_scales_zeros: Sequence[Optional[Tensor]],
16 |     quant_bits: int = 0,
17 | ):
18 |     """Fill key/value state to cache for paged attention."""
19 |     return ext_ops.fill_kv_cache(key_states,
20 |                                  value_states,
21 |                                  key_caches,
22 |                                  value_caches,
23 |                                  kv_start_indices,
24 |                                  k_scales_zeros=k_scales_zeros,
25 |                                  v_scales_zeros=v_scales_zeros,
26 |                                  quant_bits=quant_bits)
27 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import dlinfer.ops as ext_ops
 3 | from dlinfer.utils.type_annotation import Tensor
 4 | 
 5 | 
 6 | def flash_attention_fwd(
 7 |     query_states: Tensor,
 8 |     key_states: Tensor,
 9 |     value_states: Tensor,
10 |     attn_output: Tensor,
11 |     q_start_loc: Tensor,
12 |     q_seqlens: Tensor,
13 |     kv_start_loc: Tensor,
14 |     kv_seqlens: Tensor,
15 |     num_heads: int,
16 |     num_kv_heads: int,
17 |     max_q_seqlen: int = None,
18 |     window_size: int = None,
19 |     sm_scale: float = None,
20 |     logit_softcapping: float = None,
21 |     causal: bool = True,
22 | ):
23 |     return ext_ops.prefill_attention(
24 |         query_states,
25 |         key_states,
26 |         value_states,
27 |         None,
28 |         None,
29 |         q_start_loc,
30 |         q_seqlens,
31 |         kv_seqlens,
32 |         max_q_seqlen,
33 |         num_heads,
34 |         num_kv_heads,
35 |         attn_mask=[],
36 |         softmax_scale=sm_scale,
37 |         attn_output=attn_output,
38 |     )
39 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import dlinfer.ops as ext_ops
 3 | from torch import Tensor
 4 | 
 5 | 
 6 | def fused_moe(
 7 |     hidden_states: Tensor,
 8 |     gate_up_weights: Tensor,
 9 |     down_weights: Tensor,
10 |     topk_weights: Tensor,
11 |     topk_ids: Tensor,
12 |     topk: int,
13 |     renormalize: bool,
14 | ):
15 |     """Dlinfer fused moe."""
16 |     return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize)
17 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/linear.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Optional
 3 | 
 4 | import dlinfer.ops as ext_ops
 5 | from torch import Tensor
 6 | 
 7 | 
 8 | def linear(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, all_reduce: bool = False, group: str = ''):
 9 |     return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce, group=group)
10 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 | 
5 | 
6 | def moe_gating_topk_softmax(router_logits: Tensor, topk: int):
7 |     routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk)
8 |     return routing_weights, selected_experts
9 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/rms_norm.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import dlinfer.ops as ext_ops
 3 | from torch import Tensor
 4 | 
 5 | 
 6 | def rms_norm(hidden_states: Tensor, weight: Tensor, epsilon: float = 1e-6, residual: Tensor = None, out: Tensor = None):
 7 |     if residual is None:
 8 |         rms_norm_out = ext_ops.rms_norm(hidden_states, weight, epsilon)
 9 |         if out is None:
10 |             out = rms_norm_out
11 |         else:
12 |             out.copy_(rms_norm_out)
13 |         return out
14 |     else:
15 |         return ext_ops.add_rms_norm(hidden_states, residual, weight, epsilon)
16 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fill_kv_cache.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | fill_kv_cache = FunctionDispatcher('fill_kv_cache').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/flash_mla.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | flash_mla_fwd = FunctionDispatcher('flash_mla_fwd').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fused_moe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | fused_moe = FunctionDispatcher('fused_moe').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fused_rotary_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | fused_rotary_emb = FunctionDispatcher('fused_rotary_emb').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | moe_gating_topk_softmax = FunctionDispatcher('moe_gating_topk_softmax').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | multinomial_sampling = FunctionDispatcher('multinomial_sampling').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/pagedattention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | paged_attention_fwd = FunctionDispatcher('paged_attention_fwd').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/rms_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 | 
4 | rms_norm = FunctionDispatcher('rms_norm').make_caller()
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/w8a8_triton_kernels.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .dispatcher import FunctionDispatcher
 3 | 
 4 | per_channel_quant = FunctionDispatcher('per_channel_quant').make_caller()
 5 | 
 6 | matmul_kernel_dynamic_quant = FunctionDispatcher('matmul_kernel_dynamic_quant').make_caller()
 7 | 
 8 | per_token_quant_int8 = FunctionDispatcher('per_token_quant_int8').make_caller()
 9 | 
10 | rms_norm_dynamic_quant = FunctionDispatcher('rms_norm_dynamic_quant').make_caller()
11 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .q_modules import QLinear, QRMSNorm
3 | 
4 | __all__ = ['QLinear', 'QRMSNorm']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/utils/multimodal.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import List, Tuple
 3 | 
 4 | from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs
 5 | 
 6 | PreparedInputs = Tuple[List[int], MultiModalInputs]
 7 | 
 8 | 
 9 | class MultiModalMixin:
10 | 
11 |     def prepare_multimodal_input(self, input_ids, input_multimodals, **kwargs) -> PreparedInputs:
12 |         """Prepare multimodals inputs."""
13 |         raise NotImplementedError('prepare input not implemented.')
14 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .data_type import MultiModalData, MultiModalTensor
3 | 
4 | __all__ = ['MultiModalData', 'MultiModalTensor']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/multimodal/image_type.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from dataclasses import dataclass
 3 | from typing import Any, ClassVar, Dict
 4 | 
 5 | from PIL import Image
 6 | 
 7 | from .data_type import MultiModalData
 8 | 
 9 | 
10 | @dataclass
11 | class ImageData(MultiModalData):
12 |     data: Image
13 |     loc: int
14 |     meta: Dict[str, Any] = None
15 |     type: ClassVar[str] = 'image'
16 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | # attention module is modified from:
 3 | # https://github.com/vllm-project/vllm/blob/main/vllm/attention/
 4 | from .activation import GeluAndMul, SiluAndMul  # noqa: F401
 5 | from .attention import Attention, FlashAttention  # noqa: F401
 6 | from .norm import LayerNorm, RMSNorm  # noqa: F401
 7 | from .rotary_embedding import ApplyRotaryEmb  # noqa: F401
 8 | from .rotary_embedding import RopeType  # noqa: F401
 9 | from .rotary_embedding import YarnParameters  # noqa: F401
10 | from .rotary_embedding import build_rotary_embedding  # noqa: F401
11 | from .rotary_embedding import build_rotary_params  # noqa: F401
12 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/activation.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from torch import Tensor, nn
 3 | 
 4 | from ..backends import OpType, get_backend
 5 | 
 6 | 
 7 | class SiluAndMul(nn.Module):
 8 |     """Silu and elementwise multiple."""
 9 | 
10 |     def __init__(self, inplace: bool = True):
11 |         super().__init__()
12 |         backend = get_backend()
13 |         builder = backend.get_layer_impl_builder(OpType.SiluAndMul)
14 |         self.impl = builder.build(inplace)
15 | 
16 |     def forward(self, x: Tensor):
17 |         """forward."""
18 |         return self.impl.forward(x)
19 | 
20 | 
21 | class GeluAndMul(nn.Module):
22 |     """Gelu and elementwise multiple."""
23 | 
24 |     def __init__(self, approximate: str = 'none'):
25 |         super().__init__()
26 |         backend = get_backend()
27 |         builder = backend.get_layer_impl_builder(OpType.GeluAndMul)
28 |         self.impl = builder.build(approximate)
29 | 
30 |     def forward(self, x: Tensor):
31 |         """forward."""
32 |         return self.impl.forward(x)
33 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/multinomial_sampling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | from ..backends import OpType, get_backend
 5 | 
 6 | 
 7 | def multinomial_sampling(scores: torch.Tensor,
 8 |                          seeds: torch.LongTensor,
 9 |                          offsets: torch.LongTensor,
10 |                          indices: torch.Tensor = None):
11 |     """Multinomial sampling op."""
12 |     impl_builder = get_backend().get_layer_impl_builder(OpType.MultinomialSampling)
13 |     return impl_builder.build().forward(scores, seeds, offsets, indices)
14 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/quant_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from lmdeploy.lite.quantization.weight.quant_utils import quant_blocked_fp8  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import torch
 3 | 
 4 | 
 5 | def div_up(a: int, b: int):
 6 |     """Div up."""
 7 |     return (a + b - 1) // b
 8 | 
 9 | 
10 | def get_distribute_size(feature_size: int, world_size: int, rank: int, align: int = 1):
11 |     """Update feature size."""
12 |     assert feature_size % align == 0
13 |     aligned_size = feature_size // align
14 |     # try to make every rank has same amount of feats
15 |     updated_aligned_size = aligned_size // world_size
16 |     # if there are still some remain, given them to
17 |     # each rank
18 |     if rank < aligned_size % world_size:
19 |         updated_aligned_size += 1
20 |     return updated_aligned_size * align
21 | 
22 | 
23 | def chunk_aligned(weight: torch.Tensor, chunks: int, dim: int, align: int):
24 |     """Chunk aligned."""
25 |     if align == 1:
26 |         return weight.chunk(chunks, dim=dim)
27 |     size = weight.size(dim)
28 |     assert size % align == 0
29 |     aligned_size = size // align
30 | 
31 |     # try best to evenly split chunks
32 |     align_per_chunk = aligned_size // chunks
33 |     remain = aligned_size % chunks
34 |     sections = [align_per_chunk + int(c < remain) for c in range(chunks)]
35 |     sections = [sec * align for sec in sections]
36 |     return weight.split(sections, dim=dim)
37 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .scheduler import Scheduler
3 | 
4 | __all__ = ['Scheduler']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/block_manager/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from ...config import CacheConfig
 3 | from .base_block_manager import BaseBlockManager
 4 | from .default_block_manager import DefaultBlockManager
 5 | from .window_block_manager import WindowBlockManager
 6 | 
 7 | 
 8 | def build_block_manager(cache_config: CacheConfig) -> BaseBlockManager:
 9 |     """Build block manager.
10 | 
11 |     Args:
12 |         cache_config (CacheConfig):  cache_config.
13 |     """
14 | 
15 |     num_cpu_blocks = cache_config.num_cpu_blocks
16 |     num_gpu_blocks = cache_config.num_gpu_blocks
17 |     window_size = cache_config.window_size
18 | 
19 |     if window_size < 0:
20 |         return DefaultBlockManager(num_gpu_blocks, num_cpu_blocks)
21 |     else:
22 |         return WindowBlockManager(num_gpu_blocks, num_cpu_blocks, window_size=window_size)
23 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/eviction_helper/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .recompute_eviction_helper import RecomputeEvictionHelper
3 | 
4 | __all__ = ['RecomputeEvictionHelper']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import List
 3 | 
 4 | from ...messages import SchedulerSequence
 5 | from ..scheduler import Scheduler
 6 | 
 7 | SeqList = List[SchedulerSequence]
 8 | 
 9 | 
10 | class BaseEvictionHelper:
11 |     """Base eviction helper."""
12 | 
13 |     def __init__(self, scheduler: Scheduler):
14 |         self.scheduler = scheduler
15 |         self.block_manager = scheduler.block_manager
16 |         self.block_trie = scheduler.block_trie
17 | 
18 |     def need_swap_in(self, seq: SchedulerSequence):
19 |         """Sequence need swap in."""
20 |         raise NotImplementedError('Not implemented.')
21 | 
22 |     def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int):
23 |         """Evict seqs."""
24 |         raise NotImplementedError('Not implemented.')
25 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .utils import Timer  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | # modify from: https://github.com/vllm-project/vllm
 3 | import inspect
 4 | from inspect import Parameter, Signature
 5 | from typing import Dict, Sequence
 6 | 
 7 | import psutil
 8 | 
 9 | 
10 | def get_gpu_memory(device_id: int = None) -> int:
11 |     """Returns the free and total physical memory of the GPU in bytes."""
12 |     import torch
13 |     if device_id is None:
14 |         device_id = torch.cuda.current_device()
15 |     return torch.cuda.mem_get_info(device_id)
16 | 
17 | 
18 | def get_cpu_memory() -> int:
19 |     """Returns the total CPU memory of the node in bytes."""
20 |     return psutil.virtual_memory().total
21 | 
22 | 
23 | def bind_sigature(input_names: str, args: Sequence, kwargs: Dict):
24 |     """Bind args and kwargs to given input names."""
25 |     kind = inspect._ParameterKind.POSITIONAL_OR_KEYWORD
26 | 
27 |     sig = Signature([Parameter(name, kind) for name in input_names])
28 |     bind = sig.bind(*args, **kwargs)
29 |     return bind.arguments
30 | 


--------------------------------------------------------------------------------
/lmdeploy/pytorch/weight_loader/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/gradio/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/gradio/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | import gradio as gr
 4 | 
 5 | CSS = """
 6 | #container {
 7 |     width: 95%;
 8 |     margin-left: auto;
 9 |     margin-right: auto;
10 | }
11 | 
12 | #chatbot {
13 |     height: 500px;
14 |     overflow: auto;
15 | }
16 | 
17 | .chat_wrap_space {
18 |     margin-left: 0.5em
19 | }
20 | """
21 | 
22 | THEME = gr.themes.Soft(primary_hue=gr.themes.colors.blue,
23 |                        secondary_hue=gr.themes.colors.sky,
24 |                        font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
25 | 
26 | enable_btn = gr.update(interactive=True)
27 | disable_btn = gr.update(interactive=False)
28 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/openai/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/openai/reasoning_parser/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
3 | from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser
4 | from .reasoning_parser import ReasoningParser, ReasoningParserManager
5 | 
6 | __all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser']
7 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/openai/tool_parser/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .internlm2_parser import Internlm2ToolParser
 3 | from .llama3_parser import Llama3JsonToolParser
 4 | from .qwen2d5_parser import Qwen2d5ToolParser
 5 | from .qwen3_parser import Qwen3ToolParser
 6 | from .tool_parser import ToolParser, ToolParserManager
 7 | 
 8 | __all__ = [
 9 |     'Internlm2ToolParser',
10 |     'Qwen2d5ToolParser',
11 |     'Qwen3ToolParser',
12 |     'ToolParser',
13 |     'ToolParserManager',
14 |     'Llama3JsonToolParser',
15 | ]
16 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/serve/turbomind/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | 
 4 | def bootstrap():
 5 |     import os
 6 |     import sys
 7 | 
 8 |     has_turbomind = False
 9 |     pwd = os.path.dirname(__file__)
10 |     if os.path.exists(os.path.join(pwd, '..', 'lib')):
11 |         has_turbomind = True
12 |     if os.name == 'nt' and has_turbomind:
13 |         if sys.version_info[:2] >= (3, 8):
14 |             CUDA_PATH = os.getenv('CUDA_PATH')
15 |             assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
16 |             dll_path = os.path.join(CUDA_PATH, 'bin')
17 |             print(f'Add dll path {dll_path}, please note cuda version '
18 |                   'should >= 11.3 when compiled with cuda 11')
19 |             os.add_dll_directory(dll_path)
20 | 
21 | 
22 | bootstrap()
23 | 
24 | from .turbomind import TurboMind, update_parallel_config  # noqa: E402
25 | 
26 | __all__ = ['TurboMind', 'update_parallel_config']
27 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from .baichuan import Baichuan2Model, BaichuanModel  # noqa: F401
 3 | from .deepseek2 import DeepSeek2Model  # noqa: F401
 4 | from .deepseek_vl import DeepSeekVLModel  # noqa: F401
 5 | from .glm4 import Glm4Model  # noqa: F401
 6 | from .internlm2 import InternLM2Model  # noqa: F401
 7 | from .internvl import InternVLModel  # noqa: F401
 8 | from .llama import LlamaModel  # noqa: F401
 9 | from .llava import LlavaModel  # noqa: F401
10 | from .minicpmv import MiniCPMVModel  # noqa: F401
11 | from .mixtral import MixtralModel  # noqa: F401
12 | from .molmo import MolmoModel  # noqa: F401
13 | from .qwen import QwenModel  # noqa: F401
14 | from .xcomposer2 import Xcomposer2Model  # noqa: F401
15 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/minicpmv.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | import json
 4 | import os.path as osp
 5 | 
 6 | from .base import INPUT_MODELS
 7 | from .llama import LlamaModel, LlamaReader
 8 | 
 9 | 
10 | class MiniCPMVReader(LlamaReader):
11 |     """MiniCPMVReader for llama model."""
12 | 
13 |     attn_layer_prefix = 'llm.model.layers'
14 |     attn_layer_patten = r'llm.model.layers.([0-9]+).'
15 |     tok_embeddings_key = 'llm.model.embed_tokens.weight'
16 |     norm_weight_key = 'llm.model.norm.weight'
17 |     output_weight_key = 'llm.lm_head.weight'
18 | 
19 | 
20 | @INPUT_MODELS.register_module(name='minicpmv')
21 | class MiniCPMVModel(LlamaModel):
22 |     """MiniCPMV model in hf format."""
23 |     Reader = MiniCPMVReader
24 | 
25 |     def model_info(self):
26 |         info = super().model_info()
27 |         with open(osp.join(self.model_path, 'config.json')) as f:
28 |             config = json.load(f)
29 |             if str(config.get('version')) == '2.6':
30 |                 info['attn_bias'] = True
31 |         return info
32 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/mixtral.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .base import INPUT_MODELS
 4 | from .llama import LlamaModel, LlamaReader
 5 | 
 6 | 
 7 | class MixtralReader(LlamaReader):
 8 | 
 9 |     def moe_ffn_expert(self, e=None, i=None, kind=None):
10 |         if not kind:
11 |             return self.filter(r'experts')
12 |         result = []
13 |         for x in ['w1', 'w2', 'w3']:
14 |             name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}'
15 |             tensor = self.params.get(name)
16 |             tensor = self.transform(tensor, kind)
17 |             result.append(tensor)
18 |         return (*result, )
19 | 
20 |     def moe_ffn_gate(self, i):
21 |         return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.weight')
22 | 
23 | 
24 | @INPUT_MODELS.register_module(name='mixtral')
25 | class MixtralModel(LlamaModel):
26 | 
27 |     Reader = MixtralReader
28 | 
29 |     def model_info(self):
30 |         cfg = self.model_config
31 |         info = super().model_info()
32 |         info['expert_num'] = cfg['num_local_experts']
33 |         info['expert_inter_size'] = cfg['intermediate_size']
34 |         info['experts_per_token'] = cfg['num_experts_per_tok']
35 |         info['norm_topk_prob'] = True
36 |         info['inter_size'] = 0
37 |         return info
38 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/target_model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .fp import TurbomindModel  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/target_model/fp.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | from .base import OUTPUT_MODELS, BaseOutputModel
 4 | 
 5 | 
 6 | @OUTPUT_MODELS.register_module(name='tm')
 7 | class TurbomindModel(BaseOutputModel):
 8 |     """Export to turbomind fp16 format."""
 9 |     pass
10 | 


--------------------------------------------------------------------------------
/lmdeploy/turbomind/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | import os
 3 | 
 4 | from transformers.utils import ExplicitEnum
 5 | 
 6 | from lmdeploy.utils import get_logger
 7 | 
 8 | logger = get_logger('lmdeploy')
 9 | 
10 | 
11 | class ModelSource(ExplicitEnum):
12 |     """Turbomind model source."""
13 |     WORKSPACE = 'workspace'
14 |     HF_MODEL = 'hf_model'
15 | 
16 | 
17 | def get_model_source(pretrained_model_name_or_path: str, **kwargs) -> ModelSource:
18 |     """Get model source."""
19 |     triton_model_path = os.path.join(pretrained_model_name_or_path, 'triton_models')
20 |     if os.path.exists(triton_model_path):
21 |         return ModelSource.WORKSPACE
22 |     return ModelSource.HF_MODEL
23 | 


--------------------------------------------------------------------------------
/lmdeploy/version.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | from typing import Tuple
 3 | 
 4 | __version__ = '0.9.0'
 5 | short_version = __version__
 6 | 
 7 | 
 8 | def parse_version_info(version_str: str) -> Tuple:
 9 |     """Parse version from a string.
10 | 
11 |     Args:
12 |         version_str (str): A string represents a version info.
13 | 
14 |     Returns:
15 |         tuple: A sequence of integer and string represents version.
16 |     """
17 |     _version_info = []
18 |     for x in version_str.split('.'):
19 |         if x.isdigit():
20 |             _version_info.append(int(x))
21 |         elif x.find('rc') != -1:
22 |             patch_version = x.split('rc')
23 |             _version_info.append(int(patch_version[0]))
24 |             _version_info.append(f'rc{patch_version[1]}')
25 |     return tuple(_version_info)
26 | 
27 | 
28 | version_info = parse_version_info(__version__)
29 | 


--------------------------------------------------------------------------------
/lmdeploy/vl/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .utils import load_image
3 | 
4 | __all__ = ['load_image']
5 | 


--------------------------------------------------------------------------------
/lmdeploy/vl/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | IMAGE_DUMMY_TOKEN_INDEX = 0
3 | IMAGE_TOKEN = '<IMAGE_TOKEN>'
4 | 


--------------------------------------------------------------------------------
/lmdeploy/vl/model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/lmdeploy/vl/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 


--------------------------------------------------------------------------------
/requirements/build.txt:
--------------------------------------------------------------------------------
1 | pybind11<=2.13.1
2 | setuptools
3 | 


--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | markdown>=3.4.0
2 | myst-parser
3 | sphinx==8.0.2
4 | sphinx-book-theme
5 | sphinx-copybutton
6 | sphinx-tabs
7 | sphinxcontrib-mermaid
8 | 


--------------------------------------------------------------------------------
/requirements/lite.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | datasets
3 | transformers_stream_generator
4 | 


--------------------------------------------------------------------------------
/requirements/readthedocs.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | mmengine-lite
3 | pillow
4 | pydantic
5 | torch
6 | transformers
7 | urllib3<2.0.0
8 | 


--------------------------------------------------------------------------------
/requirements/runtime_ascend.txt:
--------------------------------------------------------------------------------
 1 | accelerate>=0.29.3
 2 | dlinfer-ascend>=0.1.3
 3 | einops
 4 | fastapi
 5 | fire
 6 | mmengine-lite
 7 | numpy<2.0.0
 8 | openai
 9 | outlines<0.1.0
10 | partial_json_parser
11 | peft<=0.11.1
12 | pillow
13 | protobuf
14 | pydantic>2.0.0
15 | pyzmq
16 | safetensors
17 | sentencepiece
18 | shortuuid
19 | tiktoken
20 | torch<=2.4.0,>=2.3.1
21 | torch-npu==2.3.1
22 | torchvision<=0.19.0,>=0.18.1
23 | transformers
24 | uvicorn
25 | 


--------------------------------------------------------------------------------
/requirements/runtime_camb.txt:
--------------------------------------------------------------------------------
 1 | accelerate==1.2.0
 2 | einops
 3 | fastapi
 4 | fire
 5 | mmengine-lite
 6 | numpy<2.0.0
 7 | openai
 8 | outlines<0.1.0
 9 | partial_json_parser
10 | peft<=0.11.1
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | pyzmq
15 | safetensors
16 | sentencepiece
17 | shortuuid
18 | tiktoken
19 | torch==2.4.0
20 | torchvision<=0.19.0,>=0.15.0
21 | transformers
22 | uvicorn
23 | 


--------------------------------------------------------------------------------
/requirements/runtime_cuda.txt:
--------------------------------------------------------------------------------
 1 | accelerate>=0.29.3
 2 | einops
 3 | fastapi
 4 | fire
 5 | mmengine-lite
 6 | numpy<2.0.0
 7 | openai
 8 | outlines
 9 | partial_json_parser
10 | peft<=0.14.0
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | pynvml
15 | pyzmq
16 | ray
17 | safetensors
18 | sentencepiece
19 | shortuuid
20 | tiktoken
21 | torch<=2.7.1,>=2.0.0
22 | torchvision<=0.22.1,>=0.15.0
23 | transformers
24 | triton<=3.3.1,>=3.0.0; sys_platform == "linux"
25 | uvicorn
26 | 


--------------------------------------------------------------------------------
/requirements/runtime_maca.txt:
--------------------------------------------------------------------------------
 1 | accelerate==0.32.1
 2 | einops
 3 | fastapi
 4 | fire
 5 | mmengine-lite
 6 | numpy<2.0.0
 7 | openai
 8 | outlines<0.1.0
 9 | partial_json_parser
10 | peft<=0.11.1
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | pyzmq
15 | safetensors
16 | sentencepiece
17 | shortuuid
18 | tiktoken
19 | torch<=2.4.0,>=2.0.0
20 | torchvision<=0.19.0,>=0.15.0
21 | transformers
22 | triton>=2.1.0; sys_platform == "linux"
23 | uvicorn
24 | 


--------------------------------------------------------------------------------
/requirements/serve.txt:
--------------------------------------------------------------------------------
1 | gradio
2 | protobuf
3 | tritonclient[grpc]
4 | 


--------------------------------------------------------------------------------
/requirements/test.txt:
--------------------------------------------------------------------------------
 1 | allure-pytest
 2 | coverage
 3 | nvidia-ml-py
 4 | pytest
 5 | pytest-assume
 6 | pytest-cov
 7 | pytest-order
 8 | pytest-rerunfailures
 9 | pytest-sugar
10 | pytest-xdist
11 | pyyaml
12 | 


--------------------------------------------------------------------------------
/requirements_ascend.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_ascend.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 | 


--------------------------------------------------------------------------------
/requirements_camb.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_camb.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 | 


--------------------------------------------------------------------------------
/requirements_cuda.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_cuda.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 | 


--------------------------------------------------------------------------------
/requirements_maca.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_maca.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 | 


--------------------------------------------------------------------------------
/resources/batch_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InternLM/lmdeploy/a7146ed10ae8cc44b2dbfa2a506ec13f08111ab6/resources/batch_memory.png


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | add_subdirectory(turbomind)
16 | 


--------------------------------------------------------------------------------
/src/turbomind/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | add_subdirectory(utils)
16 | add_subdirectory(core)
17 | add_subdirectory(kernels)
18 | add_subdirectory(layers)
19 | add_subdirectory(comm)
20 | add_subdirectory(models)
21 | add_subdirectory(engine)
22 | if(BUILD_PYT)
23 |     add_subdirectory(th_op)
24 | endif()
25 | if(BUILD_PY_FFI)
26 |     add_subdirectory(python)
27 | endif()
28 | add_subdirectory(triton_backend)
29 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | 
 5 | add_library(host_comm STATIC host_comm.cc thread_comm.cc)
 6 | target_link_libraries(host_comm PRIVATE core logger)
 7 | set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
 8 | 
 9 | add_library(device_comm STATIC device_comm.cc)
10 | target_link_libraries(device_comm PRIVATE core logger)
11 | set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
12 | set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
13 | 
14 | if (BUILD_MULTI_GPU)
15 |     add_subdirectory(cuda_ipc)
16 |     target_link_libraries(device_comm INTERFACE cuda_ipc_comm)
17 | 
18 |     if (USE_NCCL)
19 |         add_subdirectory(nccl)
20 |         target_link_libraries(device_comm INTERFACE nccl_comm)
21 |     endif ()
22 | 
23 |     if (BUILD_TEST)
24 |         add_executable(test_comm test_comm.cu)
25 |         target_link_libraries(test_comm PRIVATE device_comm host_comm core pthread nvtx_utils)
26 |         target_compile_options(test_comm PRIVATE -O3 -march=native -mtune=native)
27 |     endif ()
28 | endif ()
29 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/barrier.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <condition_variable>
 6 | #include <cstdint>
 7 | #include <mutex>
 8 | 
 9 | namespace turbomind::comm {
10 | 
11 | class Barrier {
12 | public:
13 |     explicit Barrier(int count): threshold_{count}, count_{count} {}
14 | 
15 |     void arrive_and_wait()
16 |     {
17 |         std::unique_lock lock{mutex_};
18 |         auto             phase = phase_;
19 |         if (--count_ == 0) {
20 |             ++phase_;
21 |             count_ = threshold_;
22 |             cv_.notify_all();
23 |         }
24 |         else {
25 |             cv_.wait(lock, [this, phase] { return phase_ != phase; });
26 |         }
27 |     }
28 | 
29 | private:
30 |     std::mutex              mutex_;
31 |     std::condition_variable cv_;
32 | 
33 |     int threshold_;
34 |     int count_;
35 | 
36 |     uint32_t phase_{};
37 | };
38 | 
39 | }  // namespace turbomind::comm
40 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/cuda_ipc/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | 
 5 | add_library(cuda_ipc_comm STATIC
 6 |         cuda_ipc_comm.cu
 7 |         allreduce.cu
 8 |         allgather.cu
 9 |         fused_allreduce.cu
10 |         fused_allreduce_ex.cu)
11 | 
12 | target_link_libraries(cuda_ipc_comm PRIVATE
13 |         rms_norm
14 |         host_comm
15 |         core
16 |         cuda_utils
17 |         CUDA::cuda_driver
18 |         logger)
19 | 
20 | set_property(TARGET cuda_ipc_comm PROPERTY POSITION_INDEPENDENT_CODE  ON)
21 | set_property(TARGET cuda_ipc_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
22 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/cuda_ipc/group_sum.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/core/common.h"
 6 | 
 7 | namespace turbomind::comm {
 8 | 
 9 | namespace detail {
10 | 
11 | template<class Syncgroup>
12 | __device__ float GroupSum(const float val, int warps, Syncgroup syncgroup)
13 | {
14 |     const int warp_id = threadIdx.x / WARP_SIZE;
15 |     const int lane_id = threadIdx.x % WARP_SIZE;
16 |     float     sum     = val;
17 |     PRAGMA_UNROLL
18 |     for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
19 |         sum += __shfl_xor_sync((uint32_t)-1, sum, mask);
20 |     }
21 |     __shared__ float smem[32];
22 |     // syncgroup();
23 |     if (lane_id == 0) {
24 |         smem[warp_id] = sum;
25 |     }
26 |     syncgroup();
27 |     for (int i = 1; i < warps; ++i) {
28 |         sum += smem[warp_id / warps * warps + i];
29 |     }
30 |     // sum = {};
31 |     // for (int i = 0; i < warps; ++i) {
32 |     //     sum += smem[warp_id / warps * warps + i];
33 |     // }
34 |     return sum;
35 | }
36 | 
37 | }  // namespace detail
38 | 
39 | }  // namespace turbomind::comm
40 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/device_comm.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/comm/device_comm.h"
 4 | #include "src/turbomind/utils/cuda_utils.h"
 5 | 
 6 | namespace turbomind::comm {
 7 | 
 8 | DeviceCommImpl::~DeviceCommImpl() = default;
 9 | 
10 | DeviceComm CreateNcclCommunicator(int n_ranks, int rank, HostComm h_comm);
11 | 
12 | DeviceComm CreateCudaIpcCommunicator(int n_ranks, int rank, HostComm h_comm);
13 | 
14 | DeviceComm CreateDeviceCommunicator(const std::string& backend, int n_ranks, int rank, HostComm h_comm)
15 | {
16 | #if BUILD_MULTI_GPU && USE_NCCL
17 |     if (backend == "nccl") {
18 |         return CreateNcclCommunicator(n_ranks, rank, h_comm);
19 |     }
20 | #endif
21 | 
22 | #if BUILD_MULTI_GPU
23 |     if (backend == "native" || backend == "cuda-ipc") {
24 |         return CreateCudaIpcCommunicator(n_ranks, rank, h_comm);
25 |     }
26 | #endif
27 | 
28 |     TM_CHECK(0) << "Unknown communication backend: " << backend;
29 |     return {};
30 | }
31 | 
32 | }  // namespace turbomind::comm
33 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/host_comm.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/comm/host_comm.h"
 4 | 
 5 | namespace turbomind::comm {
 6 | 
 7 | HostCommImpl::~HostCommImpl() = default;
 8 | 
 9 | std::unique_ptr<HostGroupId> CreateThreadGroupId();
10 | 
11 | std::unique_ptr<HostGroupId> CreateHostGroupId(const std::string& backend)
12 | {
13 |     return CreateThreadGroupId();
14 | }
15 | 
16 | }  // namespace turbomind::comm
17 | 


--------------------------------------------------------------------------------
/src/turbomind/comm/nccl/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | 
 5 | add_library(nccl_comm STATIC nccl.cu)
 6 | target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger)
 7 | target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS})
 8 | 
 9 | set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE  ON)
10 | set_property(TARGET nccl_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
11 | 


--------------------------------------------------------------------------------
/src/turbomind/core/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | 
 5 | add_library(core STATIC
 6 |         check.cc
 7 |         allocator.cc
 8 |         stream.cc
 9 |         context.cc
10 |         buffer.cc
11 |         layout.cc
12 |         tensor.cc
13 |         tensor.cu
14 |         module.cc)
15 | 
16 | target_link_libraries(core PUBLIC cuda_utils logger CUDA::cudart CUDA::cuda_driver)
17 | 
18 | set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON)
19 | set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
20 | 
21 | target_compile_options(core PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-v>)
22 | 
23 | if (BUILD_TEST)
24 |     add_executable(test_core test_core.cc)
25 |     target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain)
26 | endif ()
27 | 


--------------------------------------------------------------------------------
/src/turbomind/core/common.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #pragma once
 3 | 
 4 | #include <cstddef>
 5 | #include <memory>
 6 | #include <vector>
 7 | 
 8 | /// TODO: remove this dependency
 9 | #include "src/turbomind/utils/cuda_utils.h"
10 | 
11 | namespace turbomind::core {
12 | 
13 | class Allocator;
14 | class Buffer;
15 | class Stream;
16 | class Event;
17 | class Context;
18 | 
19 | using std::shared_ptr;
20 | using std::vector;
21 | 
22 | using ssize_t = std::ptrdiff_t;
23 | 
24 | }  // namespace turbomind::core
25 | 


--------------------------------------------------------------------------------
/src/turbomind/core/context.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "src/turbomind/core/allocator.h"
 4 | #include "src/turbomind/core/common.h"
 5 | #include "src/turbomind/core/stream.h"
 6 | 
 7 | namespace turbomind::core {
 8 | 
 9 | class Context {
10 | public:
11 |     static Stream&    stream();
12 |     static Allocator& host_alloc();
13 |     static Allocator& device_alloc();
14 |     static Allocator& pinned_alloc();
15 |     static Allocator& alloc(Device device);
16 | 
17 | private:
18 |     friend class ContextGuard;
19 |     static void push(const Stream& stream);
20 |     static void push(const Allocator& alloc);
21 |     static void pop();
22 | };
23 | 
24 | class ContextGuard {
25 | public:
26 |     template<class... Args>
27 |     explicit ContextGuard(Args&&... args): n_{}
28 |     {
29 |         (Context::push((Args &&) args), ...);
30 |         n_ = sizeof...(Args);
31 |     }
32 |     ~ContextGuard()
33 |     {
34 |         for (int i = 0; i < n_; ++i) {
35 |             Context::pop();
36 |         }
37 |     }
38 | 
39 | private:
40 |     int n_;
41 | };
42 | 
43 | }  // namespace turbomind::core
44 | 


--------------------------------------------------------------------------------
/src/turbomind/core/core.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "src/turbomind/core/allocator.h"
 4 | #include "src/turbomind/core/buffer.h"
 5 | #include "src/turbomind/core/check.h"
 6 | #include "src/turbomind/core/context.h"
 7 | #include "src/turbomind/core/data_type.h"
 8 | #include "src/turbomind/core/layout.h"
 9 | #include "src/turbomind/core/stream.h"
10 | #include "src/turbomind/core/tensor.h"
11 | 
12 | namespace turbomind {
13 | 
14 | using core::ssize_t;
15 | using core::Buffer;
16 | using core::Buffer_;
17 | using core::Tensor;
18 | using core::Tensor_;
19 | using core::TensorMap;
20 | using core::Ref;
21 | using core::Layout;
22 | using core::Allocator;
23 | using core::Stream;
24 | using core::Event;
25 | 
26 | }  // namespace turbomind
27 | 


--------------------------------------------------------------------------------
/src/turbomind/core/module.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "src/turbomind/core/tensor.h"
 3 | 
 4 | namespace turbomind::core {
 5 | 
 6 | class Module {
 7 | public:
 8 |     virtual ~Module();
 9 | 
10 |     Module();
11 | 
12 |     Module(const Module&) = delete;
13 |     Module& operator=(const Module&) = delete;
14 | 
15 |     Module(Module&&) noexcept = delete;
16 |     Module& operator=(Module&&) noexcept = delete;
17 | 
18 |     void register_module(std::string name, Module& module, std::optional<int> index = {});
19 |     void register_parameter(std::string name, Tensor& param);
20 | 
21 |     void remove_module(Module& module);
22 |     void remove_parameter(Tensor& param);
23 | 
24 |     TensorMap get_parameters() const;
25 | 
26 | private:
27 |     void get_parameters_impl(std::string prefix, TensorMap& m) const;
28 | 
29 | protected:
30 |     Module* parent_;
31 | 
32 |     std::vector<std::pair<std::string, Module*>> modules_;
33 |     std::vector<std::pair<std::string, Tensor*>> params_;
34 | };
35 | 
36 | }  // namespace turbomind::core
37 | 


--------------------------------------------------------------------------------
/src/turbomind/core/stream.cc:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "src/turbomind/core/stream.h"
 3 | #include <memory>
 4 | 
 5 | namespace turbomind::core {
 6 | 
 7 | Stream Stream::create(int priority)
 8 | {
 9 |     Stream stream;
10 |     stream.impl_ = std::make_shared<StreamImpl>(priority);
11 |     return stream;
12 | }
13 | 
14 | void StreamImpl::Wait(const Event& event)
15 | {
16 |     check_cuda_error(cudaStreamWaitEvent(stream_, event));
17 | }
18 | 
19 | }  // namespace turbomind::core
20 | 


--------------------------------------------------------------------------------
/src/turbomind/engine/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 
3 | cmake_minimum_required(VERSION 3.8)
4 | 
5 | add_library(engine STATIC gateway.cc request_queue.cc model_request.cc)
6 | target_link_libraries(engine PRIVATE core)
7 | set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE  ON)
8 | set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
9 | 


--------------------------------------------------------------------------------
/src/turbomind/engine/request_queue.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/engine/request_queue.h"
 4 | #include "src/turbomind/engine/gateway.h"
 5 | 
 6 | #include "src/turbomind/engine/request.h"
 7 | 
 8 | namespace turbomind {
 9 | 
10 | }  // namespace turbomind
11 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/activation_kernels.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cuda_runtime.h>
20 | 
21 | #include "src/turbomind/core/core.h"
22 | 
23 | namespace turbomind {
24 | 
25 | // clang-format off
26 | template<typename T> struct GeluActivation;
27 | template<typename T> struct ReluActivation;
28 | template<typename T> struct SiluActivation;
29 | template<typename T> struct IdentityActivation;
30 | // clang-format on
31 | 
32 | template<template<typename T> class Activation>
33 | void invokeGenericActivation_v3(Ref<Tensor> inter_, const Tensor& gate, cudaStream_t stream);
34 | 
35 | }  // namespace turbomind
36 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/arch.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::arch {
 6 | 
 7 | // tags for dispatching & conditional codegen
 8 | 
 9 | template<int Begin, int End = -1>
10 | struct Arch {
11 |     static constexpr bool is_compatible(int arch)
12 |     {
13 |         return Begin <= arch && (End == -1 || arch < End);
14 |     }
15 | };
16 | 
17 | struct Sm70: Arch<700, 750> {
18 | };
19 | 
20 | struct Sm75: Arch<750, 800> {
21 | };
22 | 
23 | struct Sm80: Arch<800> {
24 | };
25 | 
26 | }  // namespace turbomind::arch
27 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/attention.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "attention_params.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | constexpr int MAX_CTA_S = 64;
10 | 
11 | template<typename T>
12 | void dispatchAttention(const AttentionParams<T>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm70_128_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm70, half, 128, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm70, half, 128, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm70, half, 64, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm70, half, 64, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm75_128_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm75, half, 128, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | // ! register spill
14 | // template void invokeAttention<typename AttentionConfig<arch::Sm75, half, 128, CacheType::kBlock>::Kernel>(
15 | //     const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm75, half, 64, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | // ! register spill
14 | // template void invokeAttention<typename AttentionConfig<arch::Sm75, half, 64, CacheType::kBlock>::Kernel>(
15 | //     const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_128_bf16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm80, nv_bfloat16, 128, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm80, nv_bfloat16, 128, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<nv_bfloat16>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_128_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm80, half, 128, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm80, half, 128, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm80, nv_bfloat16, 192, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm80, half, 192, CacheType::kLinear>::Kernel>(
14 |     const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm80, nv_bfloat16, 64, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm80, nv_bfloat16, 64, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<nv_bfloat16>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_config.h"
 4 | #include "../attention_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template void invokeAttention<typename AttentionConfig<arch::Sm80, half, 64, CacheType::kLinear>::Kernel>(
11 |     const AttentionParams<half>& params);
12 | 
13 | template void invokeAttention<typename AttentionConfig<arch::Sm80, half, 64, CacheType::kBlock>::Kernel>(
14 |     const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 1, 128>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 2, 128>>(const AttentionParams<half>& params);
13 | 
14 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 3, 128>>(const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_params.h"
 4 | #include "../decoding_config.h"
 5 | #include "../decoding_template.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | using namespace attention;
10 | 
11 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 1, 128>>(const AttentionParams<half>& params);
12 | 
13 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 2, 128>>(const AttentionParams<half>& params);
14 | 
15 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 3, 128>>(const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_params.h"
 4 | #include "../decoding_config.h"
 5 | #include "../decoding_template.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | using namespace attention;
10 | 
11 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 1, 128>>(const AttentionParams<half>& params);
12 | 
13 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 2, 128>>(const AttentionParams<half>& params);
14 | 
15 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 3, 128>>(const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 1, 64>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 2, 64>>(const AttentionParams<half>& params);
13 | 
14 | template bool invokeDecoding<Decoding<arch::Sm70, half, half, 3, 64>>(const AttentionParams<half>& params);
15 | 
16 | }  // namespace turbomind
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_params.h"
 4 | #include "../decoding_config.h"
 5 | #include "../decoding_template.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | using namespace attention;
10 | 
11 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 1, 64>>(const AttentionParams<half>& params);
12 | 
13 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 2, 64>>(const AttentionParams<half>& params);
14 | 
15 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint4_t, 3, 64>>(const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../attention_params.h"
 4 | #include "../decoding_config.h"
 5 | #include "../decoding_template.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | using namespace attention;
10 | 
11 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 1, 64>>(const AttentionParams<half>& params);
12 | 
13 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 2, 64>>(const AttentionParams<half>& params);
14 | 
15 | template bool invokeDecoding<Decoding<arch::Sm70, half, uint8_t, 3, 64>>(const AttentionParams<half>& params);
16 | 
17 | }  // namespace turbomind
18 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, half, 8, 128>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, half, 16, 128>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint4_t, 8, 128>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint4_t, 16, 128>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint8_t, 8, 128>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint8_t, 16, 128>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, half, 8, 64>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, half, 16, 64>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint4_t, 8, 64>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint4_t, 16, 64>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint8_t, 8, 64>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm75, half, uint8_t, 16, 64>>(const AttentionParams<half>& params);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_bf16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool
11 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 1, 128>>(const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template bool
14 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 2, 128>>(const AttentionParams<nv_bfloat16>& params);
15 | 
16 | template bool
17 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 8, 128>>(const AttentionParams<nv_bfloat16>& params);
18 | 
19 | template bool
20 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 16, 128>>(const AttentionParams<nv_bfloat16>& params);
21 | 
22 | }  // namespace turbomind
23 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint4_t, 8, 128>>(const AttentionParams<nv_bfloat16>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint4_t, 16, 128>>(const AttentionParams<nv_bfloat16>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint8_t, 8, 128>>(const AttentionParams<nv_bfloat16>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint8_t, 16, 128>>(const AttentionParams<nv_bfloat16>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 1, 128>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 2, 128>>(const AttentionParams<half>& params);
13 | 
14 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 8, 128>>(const AttentionParams<half>& params);
15 | 
16 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 16, 128>>(const AttentionParams<half>& params);
17 | 
18 | }  // namespace turbomind
19 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint4_t, 8, 128>>(const AttentionParams<half>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint4_t, 16, 128>>(const AttentionParams<half>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint8_t, 8, 128>>(const AttentionParams<half>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint8_t, 16, 128>>(const AttentionParams<half>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool
11 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 1, 192>>(const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 1, 192>>(const AttentionParams<half>& params);
14 | 
15 | template bool
16 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint8_t, 1, 192>>(const AttentionParams<nv_bfloat16>& params);
17 | 
18 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint8_t, 1, 192>>(const AttentionParams<half>& params);
19 | 
20 | }  // namespace turbomind
21 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool
11 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 1, 64>>(const AttentionParams<nv_bfloat16>& params);
12 | 
13 | template bool
14 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 2, 64>>(const AttentionParams<nv_bfloat16>& params);
15 | 
16 | template bool
17 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 8, 64>>(const AttentionParams<nv_bfloat16>& params);
18 | 
19 | template bool
20 | invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, nv_bfloat16, 16, 64>>(const AttentionParams<nv_bfloat16>& params);
21 | 
22 | }  // namespace turbomind
23 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint4_t, 8, 64>>(const AttentionParams<nv_bfloat16>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint4_t, 16, 64>>(const AttentionParams<nv_bfloat16>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint8_t, 8, 64>>(const AttentionParams<nv_bfloat16>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, nv_bfloat16, uint8_t, 16, 64>>(const AttentionParams<nv_bfloat16>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 1, 64>>(const AttentionParams<half>& params);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 2, 64>>(const AttentionParams<half>& params);
13 | 
14 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 8, 64>>(const AttentionParams<half>& params);
15 | 
16 | template bool invokeDecoding<Decoding<arch::Sm80, half, half, 16, 64>>(const AttentionParams<half>& params);
17 | 
18 | }  // namespace turbomind
19 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint4_t, 8, 64>>(const AttentionParams<half>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint4_t, 16, 64>>(const AttentionParams<half>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "../decoding_config.h"
 4 | #include "../decoding_template.h"
 5 | 
 6 | namespace turbomind {
 7 | 
 8 | using namespace attention;
 9 | 
10 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint8_t, 8, 64>>(const AttentionParams<half>&);
11 | 
12 | template bool invokeDecoding<Decoding<arch::Sm80, half, uint8_t, 16, 64>>(const AttentionParams<half>&);
13 | 
14 | }  // namespace turbomind
15 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/decoding.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "attention_params.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | template<class T>
10 | void dispatchDecoding(const AttentionParams<T>& params);
11 | 
12 | }
13 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/impl.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | namespace attention {
 8 | 
 9 | struct MMA_16816 {
10 | };
11 | 
12 | struct MMA_81616 {
13 | };  // MMA_16816 transposed
14 | 
15 | struct MMA_1688 {
16 | };
17 | 
18 | struct MMA_884 {
19 | };
20 | 
21 | struct MMA_SIMT {
22 | };
23 | 
24 | template<class Tag,
25 |          class T,
26 |          class Tkv,
27 |          int CTA_H,
28 |          int CTA_Q,
29 |          int CTA_S,
30 |          int WARP_H,
31 |          int WARP_Q,
32 |          int WARP_S,
33 |          int HeadDim,
34 |          int Stages = 2>
35 | struct Impl {
36 | };
37 | 
38 | }  // namespace attention
39 | 
40 | }  // namespace turbomind
41 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/mainloop.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::attention {
 6 | 
 7 | template<class Tag, class Attention>
 8 | struct Mainloop {
 9 | };
10 | 
11 | }  // namespace turbomind::attention
12 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/reduce.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "cta_map.h"
 6 | #include "src/turbomind/kernels/core/array_ops.h"
 7 | #include "src/turbomind/kernels/core/thread_map.h"
 8 | #include <cstddef>
 9 | #include <cuda_runtime.h>
10 | #include <type_traits>
11 | 
12 | namespace turbomind::attention {
13 | 
14 | template<int HeadDim, class T>
15 | void invokeReduce(T*           out,
16 |                   float*       partial_M,
17 |                   float*       partial_L,
18 |                   float*       partial_O,
19 |                   const int*   split_cnt,
20 |                   int          partial_len,
21 |                   int          max_split_cnt,
22 |                   int          query_num,
23 |                   int          head_num,
24 |                   float        exp_scale,
25 |                   cudaStream_t stream);
26 | 
27 | }  // namespace turbomind::attention
28 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | int GetSplitCount(int   max_split_cnt,
 8 |                   int   grid_size,
 9 |                   int   max_active_ctas,
10 |                   int   sm_count,
11 |                   int   max_wave_cnt,
12 |                   float alpha = 1,
13 |                   float beta  = 1e-3);
14 | 
15 | }
16 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/data_type.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cuda_fp16.h>
 6 | #if ENABLE_BF16
 7 | #include <cuda_bf16.h>
 8 | #endif
 9 | 
10 | #include <cstdint>
11 | 
12 | #include "src/turbomind/core/data_type.h"
13 | 
14 | namespace turbomind {
15 | 
16 | namespace detail {
17 | 
18 | struct __uint4_t {
19 |     uint32_t x;
20 | };
21 | 
22 | }  // namespace detail
23 | 
24 | template<class T, class SFINAE = void>
25 | struct get_pointer_type_t {
26 |     using type = T*;
27 | };
28 | 
29 | template<class T>
30 | using get_pointer_type = typename get_pointer_type_t<T>::type;
31 | 
32 | }  // namespace turbomind
33 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/meta.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | template<class T>
 8 | struct basic_type {
 9 |     using type = T;
10 | };
11 | 
12 | template<class T>
13 | constexpr basic_type<T> type_c{};
14 | 
15 | template<auto v>
16 | struct constant {
17 |     using type       = constant;
18 |     using value_type = decltype(v);
19 | 
20 |     static constexpr value_type value = v;
21 | 
22 |     constexpr value_type operator()() const noexcept
23 |     {
24 |         return v;
25 |     }
26 |     constexpr operator value_type() const noexcept
27 |     {
28 |         return v;
29 |     }
30 | };
31 | 
32 | template<auto u, auto v>
33 | struct pair {
34 | };
35 | 
36 | template<auto u, auto v>
37 | constexpr auto first(pair<u, v>)
38 | {
39 |     return u;
40 | }
41 | 
42 | template<auto u, auto v>
43 | constexpr auto second(pair<u, v>)
44 | {
45 |     return v;
46 | }
47 | 
48 | template<auto u, auto v, auto w>
49 | struct triplet {
50 | };
51 | 
52 | }  // namespace turbomind
53 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/core/pipe_iter.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | template<int Stages, int Step = 1>
 8 | struct PipeIter {
 9 |     static constexpr int kMaxStep = Stages * Step;
10 | 
11 |     int r = 0;
12 |     int w = kMaxStep - Step;
13 | 
14 |     __inline__ __device__ PipeIter& operator++()
15 |     {
16 |         w = r;
17 |         r += Step;
18 |         if (r == kMaxStep) {
19 |             r -= kMaxStep;
20 |         }
21 |         return *this;
22 |     }
23 | };
24 | 
25 | }  // namespace turbomind
26 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.8)
 2 | 
 3 | add_subdirectory(fused_multi_head_attention)
 4 | 
 5 | add_library(flash_attention STATIC flash_attention.cu)
 6 | set_property(TARGET flash_attention PROPERTY POSITION_INDEPENDENT_CODE  ON)
 7 | set_property(TARGET flash_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
 8 | target_link_libraries(flash_attention PRIVATE llama_fmha)
 9 | 
10 | if (NOT MSVC)
11 |     add_subdirectory(flash_attention2)
12 |     target_link_libraries(flash_attention PRIVATE flash_attention2)
13 | endif()
14 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | cmake_minimum_required(VERSION 3.8)
 3 | project(flash_attention2)
 4 | 
 5 | add_library(${PROJECT_NAME} STATIC
 6 |     flash_api.cpp
 7 |     # flash_fwd_hdim32_fp16_sm80.cu
 8 |     # flash_fwd_hdim64_fp16_sm80.cu
 9 |     flash_fwd_hdim128_fp16_sm80.cu
10 |     flash_fwd_hdim128_bf16_sm80.cu
11 |     flash_fwd_hdim256_bf16_sm80.cu
12 |     flash_fwd_hdim256_fp16_sm80.cu
13 |     )
14 | target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include)
15 | target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
16 | 
17 | set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON)
18 | set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
19 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/README.md:
--------------------------------------------------------------------------------
1 | #Flash Attention 2
2 | 
3 | This is flash attention2 implementation modified from https://github.com/Dao-AILab/flash-attention
4 | 
5 | - remove dropout
6 | - remove backward
7 | - cutlass 3.1.0
8 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | #ifdef ENABLE_BF16
 8 | template<>
 9 | void run_mha_fwd_<cutlass::bfloat16_t, 128>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 |     run_mha_fwd_hdim128<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_fp16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | template<>
 8 | void run_mha_fwd_<cutlass::half_t, 128>(Flash_fwd_params& params, cudaStream_t stream)
 9 | {
10 |     run_mha_fwd_hdim128<cutlass::half_t>(params, stream);
11 | }
12 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | #ifdef ENABLE_BF16
 8 | template<>
 9 | void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 |     run_mha_fwd_hdim256<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_fp16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | template<>
 8 | void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params& params, cudaStream_t stream)
 9 | {
10 |     run_mha_fwd_hdim256<cutlass::half_t>(params, stream);
11 | }
12 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | #ifdef ENABLE_BF16
 8 | template<>
 9 | void run_mha_fwd_<cutlass::bfloat16_t, 32>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 |     run_mha_fwd_hdim32<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_fp16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | template<>
 8 | void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params& params, cudaStream_t stream)
 9 | {
10 |     run_mha_fwd_hdim32<cutlass::half_t>(params, stream);
11 | }
12 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | #ifdef ENABLE_BF16
 8 | template<>
 9 | void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 |     run_mha_fwd_hdim64<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_fp16_sm80.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2023, Tri Dao.
 2 | 
 3 | // Splitting the different head dimensions to different files to speed up compilation.
 4 | 
 5 | #include "flash_fwd_launch_template.h"
 6 | 
 7 | template<>
 8 | void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params& params, cudaStream_t stream)
 9 | {
10 |     run_mha_fwd_hdim64<cutlass::half_t>(params, stream);
11 | }
12 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/fused_multi_head_attention/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | cmake_minimum_required(VERSION 3.8)
3 | 
4 | add_library(llama_fmha STATIC llama_flash_attention_kernel.cu)
5 | target_include_directories(llama_fmha PRIVATE ${CUTLASS_DIR}/examples)
6 | target_link_libraries(llama_fmha PRIVATE nvidia::cutlass::cutlass)
7 | set_property(TARGET llama_fmha PROPERTY POSITION_INDEPENDENT_CODE  ON)
8 | set_property(TARGET llama_fmha PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
9 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | // tags for dispatching & conditional codegen
 8 | 
 9 | template<int Begin, int End = -1>
10 | struct Arch {
11 |     static constexpr bool is_compatible(int arch)
12 |     {
13 |         return Begin <= arch && (End == -1 || arch < End);
14 |     }
15 | };
16 | 
17 | struct Sm70: Arch<700, 750> {
18 |     static constexpr int value = 700;
19 | };
20 | 
21 | struct Sm75: Arch<750, 800> {
22 |     static constexpr int value = 750;
23 | };
24 | 
25 | struct Sm80: Arch<800, 900> {
26 |     static constexpr int value = 800;
27 | };
28 | 
29 | struct Sm90: Arch<900> {
30 |     static constexpr int value = 900;
31 | };
32 | 
33 | inline bool is_arch_compatible(int karch, int darch)
34 | {
35 |     switch (karch) {
36 |         case 700:
37 |             return Sm70::is_compatible(darch);
38 |         case 750:
39 |             return Sm75::is_compatible(darch);
40 |         case 800:
41 |             return Sm80::is_compatible(darch);
42 |         case 900:
43 |             return Sm90::is_compatible(darch);
44 |         default:
45 |             return false;
46 |     }
47 | }
48 | 
49 | }  // namespace turbomind::gemm
50 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/dispatch_cache.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/desc.h"
 4 | 
 5 | #include <memory>
 6 | #include <optional>
 7 | #include <vector>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | class DispatchCache {
12 | public:
13 |     DispatchCache(std::vector<Kernel*> kernels);
14 | 
15 |     ~DispatchCache();
16 | 
17 |     std::optional<LaunchSpec> LowerBound(const GemmDesc& desc) const;
18 | 
19 |     std::optional<LaunchSpec> Find(const GemmDesc& desc) const;
20 | 
21 |     bool Insert(const GemmDesc& desc, const LaunchSpec& spec);
22 | 
23 |     int Export(std::ostream& os) const;
24 | 
25 |     int Import(std::istream& is);
26 | 
27 | private:
28 |     struct Impl;
29 |     std::unique_ptr<Impl> impl_;
30 | };
31 | 
32 | }  // namespace turbomind::gemm
33 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/gpu_metric.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | 
 7 | namespace turbomind::gemm {
 8 | 
 9 | // bytes / second
10 | float MeasureL2CacheThroughput();
11 | 
12 | // fused multiply-add / second
13 | float MeasureMmaThroughput(int proble_size = 16384);
14 | 
15 | }  // namespace turbomind::gemm
16 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/iterator_sm90.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cute/arch/copy_sm90_desc.hpp>
 4 | #include <cute/arch/copy_sm90_tma.hpp>
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | template<int multicast>
 9 | struct GmemIteratorSm90 {
10 | 
11 |     const CUtensorMap* desc_ptr_;
12 |     int2               offset_;
13 |     int2               step_;
14 | 
15 |     __device__ GmemIteratorSm90(const CUtensorMap* desc_ptr, int2 offset, int2 step)
16 |     {
17 |         desc_ptr_ = desc_ptr;
18 |         offset_   = offset;
19 |         step_     = step;
20 |     }
21 | 
22 |     __device__ void Step(uint64_t* mbar_ptr, void* smem_ptr, uint16_t mask, uint64_t cache_hint = 0)
23 |     {
24 |         if constexpr (multicast > 1) {
25 |             cute::SM90_TMA_LOAD_MULTICAST_2D::copy(
26 |                 desc_ptr_, mbar_ptr, mask, cache_hint, smem_ptr, offset_.x, offset_.y);
27 |         }
28 |         else {
29 |             cute::SM90_TMA_LOAD_2D::copy(desc_ptr_, mbar_ptr, cache_hint, smem_ptr, offset_.x, offset_.y);
30 |         }
31 |         offset_.x += step_.x;
32 |         offset_.y += step_.y;
33 |     }
34 | };
35 | 
36 | }  // namespace turbomind::gemm
37 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/predicate.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | #include <type_traits>
 7 | 
 8 | namespace turbomind::gemm {
 9 | 
10 | template<int S, int C, bool AlignedS, bool AlignedC>
11 | struct Predicate {
12 | 
13 |     static constexpr int kSizeC = AlignedC ? 1 : C;
14 | 
15 |     static_assert(S * kSizeC <= 32);
16 | 
17 |     static constexpr bool is_active = true;
18 | 
19 |     uint32_t pred_{};
20 | 
21 |     __device__ int operator()(int s, int c) const
22 |     {
23 |         return (pred_ & (1 << (s * kSizeC + c))) != 0;
24 |     }
25 | 
26 |     __device__ void set(int s, int c)
27 |     {
28 |         pred_ |= (1 << (s * kSizeC + c));
29 |     }
30 | 
31 |     __device__ void clear()
32 |     {
33 |         pred_ = 0;
34 |     }
35 | };
36 | 
37 | template<int S, int C>
38 | struct Predicate<S, C, true, true> {
39 | 
40 |     static constexpr bool is_active = false;
41 | 
42 |     __device__ constexpr std::integral_constant<int, 1> operator()(int, int) const
43 |     {
44 |         return {};
45 |     }
46 | 
47 |     __device__ void set(int, int) {}
48 | 
49 |     __device__ void clear()
50 |     {
51 |         // pred_ = 0;
52 |     }
53 | };
54 | 
55 | }  // namespace turbomind::gemm
56 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/simt.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind::gemm::simt {
 6 | 
 7 | // constexpr int OP_M = 2;
 8 | // constexpr int OP_N = 16;
 9 | // constexpr int OP_K = 4;
10 | 
11 | // constexpr int OP_M = 4;
12 | // constexpr int OP_N = 8;
13 | // constexpr int OP_K = 8;
14 | 
15 | constexpr int OP_M = 1;
16 | constexpr int OP_N = 32;
17 | constexpr int OP_K = 8;
18 | 
19 | }  // namespace turbomind::gemm::simt
20 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/quantization.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/types.h"
 4 | #include <thrust/device_vector.h>
 5 | #include <thrust/universal_vector.h>
 6 | 
 7 | #pragma once
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | template<class D, class S>
12 | void Quantize(const thrust::universal_vector<S>&  x,
13 |               int                                 m,
14 |               int                                 k,
15 |               Order                               order,
16 |               int                                 group_size,
17 |               thrust::universal_vector<S>&        x_p,  // pseudo-quantized
18 |               thrust::universal_vector<uint16_t>& x_q,  // quantized ushort
19 |               thrust::universal_vector<S>&        x_u,  // scales & zeros (always m-major)
20 |               cudaStream_t                        stream);
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/reference.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/types.h"
 6 | 
 7 | #include <cublas_v2.h>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | class Reference {
12 | public:
13 |     Reference();
14 |     ~Reference();
15 | 
16 |     void set_stream(cudaStream_t stream);
17 | 
18 |     void gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc);
19 | 
20 | private:
21 |     cublasHandle_t handle_;
22 | };
23 | 
24 | }  // namespace turbomind::gemm
25 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/cache_utils.cu:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h"
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | CacheFlushing::CacheFlushing()
 8 | {
 9 |     cudaDeviceProp props{};
10 |     cudaGetDeviceProperties(&props, 0);
11 | 
12 |     size_ = props.l2CacheSize;
13 | 
14 |     cudaMalloc(&buffer_, size_);
15 | }
16 | 
17 | void CacheFlushing::flush(cudaStream_t stream)
18 | {
19 |     thread_local CacheFlushing inst{};
20 |     inst(stream);
21 | }
22 | 
23 | void CacheFlushing::operator()(cudaStream_t stream) const
24 | {
25 |     cudaMemsetAsync(buffer_, 0, size_, stream);
26 | }
27 | 
28 | }  // namespace turbomind::gemm
29 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/cache_utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <cstdint>
 6 | 
 7 | namespace turbomind::gemm {
 8 | 
 9 | class CacheFlushing {
10 | public:
11 |     static void flush(cudaStream_t stream = {});
12 | 
13 | private:
14 |     CacheFlushing();
15 |     void operator()(cudaStream_t stream) const;
16 | 
17 |     uint32_t* buffer_;
18 |     size_t    size_;
19 | };
20 | 
21 | }  // namespace turbomind::gemm
22 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/measurer.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/desc.h"
 4 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h"
 5 | #include <climits>
 6 | #include <functional>
 7 | #include <memory>
 8 | #include <vector>
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | struct Measurement {
13 |     cudaError_t status;
14 |     int         sample_count;
15 |     float       mean;
16 |     float       variance;
17 | };
18 | 
19 | using Launcher = std::function<int(LaunchSpec, cudaStream_t)>;
20 | 
21 | class Measurer {
22 | public:
23 |     Measurer(std::unique_ptr<StoppingCriterion> stop_criterion);
24 | 
25 |     ~Measurer();
26 | 
27 |     std::vector<Measurement>
28 |     Measure(const std::vector<LaunchSpec>& specs, const Launcher& launcher, cudaStream_t stream);
29 | 
30 | private:
31 |     Measurement MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream);
32 | 
33 |     std::pair<float, cudaError_t> ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream);
34 | 
35 | private:
36 |     cudaEvent_t                        ev_beg_;
37 |     cudaEvent_t                        ev_end_;
38 |     std::unique_ptr<StoppingCriterion> stop_criterion_;
39 | };
40 | 
41 | }  // namespace turbomind::gemm
42 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/params.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <array>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | namespace turbomind::gemm {
10 | 
11 | struct TuningParams {
12 |     // Split-k params
13 |     int max_splits = 8;
14 |     int max_waves  = 10;
15 | 
16 |     // Swizzling params
17 |     std::vector<int> swizzle{3};
18 | 
19 |     // Sampling params
20 |     float top_k    = 0;
21 |     int   clusters = 5;
22 |     int   min_iter = 1;
23 |     int   max_iter = 10;
24 |     float max_time = 1.f;
25 | 
26 |     std::vector<int> seq;
27 | };
28 | 
29 | // example
30 | //   max_splits=8,top_splits=5,max_waves=16,top_k=10,swizzle=[2,3,4],clusters=5,max_iter=10,min_iter=1,max_time=10.0
31 | void ParseTuningParams(TuningParams& params, const std::string& str);
32 | 
33 | // example
34 | //   16-16-128,256-128-1024,8192
35 | std::vector<int> ParseTuningSequence(const std::string& str);
36 | 
37 | std::vector<int> GenerateTuningSequence(const std::vector<std::array<int, 3>>& generators);
38 | 
39 | std::vector<std::array<int, 3>> GetDefaultTuningGenerators();
40 | 
41 | }  // namespace turbomind::gemm
42 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/sampler.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/kernels/gemm/desc.h"
 6 | #include "src/turbomind/kernels/gemm/tuner/measurer.h"
 7 | 
 8 | #include <vector>
 9 | 
10 | namespace turbomind::gemm {
11 | 
12 | class Sampler {
13 | public:
14 |     explicit Sampler(Measurer& measurer, int k_clusters): measurer_{measurer}, k_clusters_{k_clusters} {}
15 | 
16 |     std::vector<LaunchSpec> Run(std::vector<LaunchSpec> specs, const Launcher& launcher, cudaStream_t stream);
17 | 
18 | private:
19 |     Measurer& measurer_;
20 |     int       k_clusters_;
21 | };
22 | 
23 | }  // namespace turbomind::gemm
24 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stats.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <limits>
 4 | 
 5 | namespace turbomind::gemm {
 6 | 
 7 | class Stats {
 8 | public:
 9 |     Stats(): count_{}, mean_{}, m2_{} {}
10 | 
11 |     float mean() const noexcept
12 |     {
13 |         return mean_;
14 |     }
15 | 
16 |     float sum() const noexcept
17 |     {
18 |         return mean_ * count_;
19 |     }
20 | 
21 |     int count() const noexcept
22 |     {
23 |         return count_;
24 |     }
25 | 
26 |     float get_variance() const noexcept
27 |     {
28 |         return count_ < 2 ? std::numeric_limits<float>::quiet_NaN() : m2_ / count_;
29 |     }
30 | 
31 |     void add_sample(float x) noexcept
32 |     {
33 |         ++count_;
34 |         float delta = x - mean_;
35 |         mean_ += delta / count_;
36 |         float delta2 = x - mean_;
37 |         m2_ += delta * delta2;
38 |     }
39 | 
40 | private:
41 |     int   count_;
42 |     float mean_;
43 |     float m2_;
44 | };
45 | 
46 | }  // namespace turbomind::gemm
47 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stopping_criterion.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h"
 4 | #include <memory>
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | namespace stopping_criterions {
 9 | 
10 | class Optimistic: public StoppingCriterion {
11 | public:
12 |     Optimistic(int min_iter, int max_iter, float max_ms)
13 |     {
14 |         min_iter_ = std::max(min_iter, 1);
15 |         max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits<int>::max();
16 |         max_ms_   = max_ms > 0 ? max_ms : std::numeric_limits<float>::infinity();
17 |     }
18 |     bool should_stop(const Stats& stats) override
19 |     {
20 |         return stats.count() >= min_iter_ && (stats.count() >= max_iter_ || stats.sum() >= max_ms_);
21 |     }
22 | 
23 | private:
24 |     int   min_iter_;
25 |     int   max_iter_;
26 |     float max_ms_;
27 | };
28 | 
29 | }  // namespace stopping_criterions
30 | 
31 | std::unique_ptr<StoppingCriterion> CreateStoppingCriterion(int min_iter, int max_iter, float max_ms)
32 | {
33 |     return std::make_unique<stopping_criterions::Optimistic>(min_iter, max_iter, max_ms);
34 | }
35 | 
36 | }  // namespace turbomind::gemm
37 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/tuner/stopping_criterion.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include "src/turbomind/kernels/gemm/tuner/stats.h"
 4 | #include <memory>
 5 | 
 6 | namespace turbomind::gemm {
 7 | 
 8 | class StoppingCriterion {
 9 | public:
10 |     virtual ~StoppingCriterion()                 = default;
11 |     virtual bool should_stop(const Stats& stats) = 0;
12 | };
13 | 
14 | std::unique_ptr<StoppingCriterion> CreateStoppingCriterion(int min_iter, int max_iter, float max_ms);
15 | 
16 | }  // namespace turbomind::gemm
17 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/norm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 
3 | add_library(rms_norm rms_norm.cu)
4 | set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON)
5 | set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
6 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/norm/rms_norm.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <cuda_runtime.h>
 4 | 
 5 | #include "src/turbomind/core/core.h"
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st);
10 | 
11 | void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st);
12 | 
13 | template<class T>
14 | void invokeBiasResidualRMSNorm(
15 |     T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st);
16 | 
17 | void invokeResidualBiasRMSNorm(void*        hidden_states,
18 |                                void*        residual,
19 |                                const void*  weights,
20 |                                const void*  bias,
21 |                                DataType     dtype,
22 |                                int          dims,
23 |                                int          num,
24 |                                float        eps,
25 |                                cudaStream_t st);
26 | 
27 | }  // namespace turbomind
28 | 


--------------------------------------------------------------------------------
/src/turbomind/kernels/quantization.h:
--------------------------------------------------------------------------------
 1 | #include "src/turbomind/core/core.h"
 2 | 
 3 | namespace turbomind {
 4 | 
 5 | void QuantizeSymm(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st);
 6 | 
 7 | void DequantizeSymm(Tensor& out, const Tensor& src, const Tensor& scale, cudaStream_t st);
 8 | 
 9 | void QuantizeSymmBlock(Tensor& out, Tensor& scale, const Tensor& src, cudaStream_t st);
10 | 
11 | void DequantizeSymmBlock(Tensor& out, const Tensor& src, const Tensor& scale, cudaStream_t st);
12 | 
13 | }  // namespace turbomind
14 | 


--------------------------------------------------------------------------------
/src/turbomind/layers/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | cmake_minimum_required(VERSION 3.8)
16 | 
17 | add_subdirectory(sampling_layers)
18 | 
19 | find_package(CUDAToolkit REQUIRED)
20 | add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
21 | set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE  ON)
22 | set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
23 | target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart
24 |         LogitsProcessorLayer SamplingLayer StopCriteriaLayer
25 |         gpt_kernels nvtx_utils)
26 | 


--------------------------------------------------------------------------------
/src/turbomind/macro.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
 4 | 
 5 | #define __PRETTY_FUNCTION__ __FUNCSIG__
 6 | 
 7 | #endif
 8 | 
 9 | typedef unsigned int uint;
10 | 


--------------------------------------------------------------------------------
/src/turbomind/models/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | add_subdirectory(llama)
16 | 


--------------------------------------------------------------------------------
/src/turbomind/models/llama/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | 
 5 | 
 6 | find_package(CUDAToolkit REQUIRED)
 7 | 
 8 | add_library(Llama STATIC
 9 |         LlamaV2.cc
10 |         LlamaBatch.cc
11 |         LlamaLinear.cu
12 |         BlockManager.cc
13 |         BlockTrie.cc
14 |         SequenceManager.cc
15 |         LlamaWeight.cc
16 |         LlamaDenseWeight.cc
17 |         LlamaDecoderLayerWeight.cc
18 |         LlamaFfnLayer.cc
19 |         moe_ffn_layer.cc
20 |         unified_decoder.cc
21 |         unified_attention_layer.cc
22 |         llama_kernels.cu
23 |         llama_utils.cu
24 |         mla_utils.cu)
25 | set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE  ON)
26 | set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
27 | target_link_libraries(Llama PUBLIC CUDA::cudart
28 |         engine
29 |         core
30 |         gemm2
31 |         CUDA::cublas
32 |         rms_norm
33 |         DynamicDecodeLayer
34 |         activation_kernels
35 |         attention
36 |         decoding_kernels
37 |         quantization_kernels
38 |         unfused_attention_kernels
39 |         gpt_kernels
40 |         memory_utils
41 |         cuda_utils
42 |         logger
43 |         anomaly_handler)
44 | 


--------------------------------------------------------------------------------
/src/turbomind/models/llama/copy.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include "src/turbomind/models/llama/llama_kernels.h"
 6 | #include "src/turbomind/utils/cuda_utils.h"
 7 | 
 8 | namespace turbomind {
 9 | 
10 | class BatchedCopy {
11 | public:
12 |     template<class T, std::enable_if_t<alignof(T) <= alignof(uint32_t), int> = 0>
13 |     T* Add(const T* src, int size, T* dst)
14 |     {
15 |         src_.push_back((void*)src);
16 |         dst_.push_back((void*)dst);
17 |         size_.push_back(sizeof(T) * size);
18 |         return dst + size;
19 |     }
20 | 
21 |     void Submit(cudaStream_t stream)
22 |     {
23 |         if (size_.empty()) {
24 |             return;
25 |         }
26 | 
27 |         invokeBatchedCopy(src_.data(), dst_.data(), size_.data(), size_.size(), stream);
28 |         sync_check_cuda_error();
29 | 
30 |         src_.clear();
31 |         dst_.clear();
32 |         size_.clear();
33 |     }
34 | 
35 | private:
36 |     std::vector<void*> src_;
37 |     std::vector<void*> dst_;
38 |     std::vector<int>   size_;
39 | };
40 | 
41 | }  // namespace turbomind
42 | 


--------------------------------------------------------------------------------
/src/turbomind/models/llama/mla_utils.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | #pragma once
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #include "src/turbomind/core/data_type.h"
 7 | 
 8 | namespace turbomind {
 9 | 
10 | void MLACopyQKV(DataType     dtype,
11 |                 void*        qkv,
12 |                 const void*  q,
13 |                 const void*  kv_a,
14 |                 const void*  kv_b,
15 |                 int          token_num,
16 |                 int          head_num,
17 |                 int          nope_dim,
18 |                 int          rope_dim,
19 |                 int          kv_lora_rank,
20 |                 int          v_head_dim,
21 |                 cudaStream_t stream);
22 | 
23 | }  // namespace turbomind
24 | 


--------------------------------------------------------------------------------
/src/turbomind/python/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | cmake_minimum_required(VERSION 3.8)
 4 | project(_turbomind)
 5 | 
 6 | find_package(pybind11 CONFIG)
 7 | if(NOT pybind11_FOUND)
 8 |     execute_process(COMMAND "pybind11-config" "--cmakedir"
 9 |                     RESULT_VARIABLE _COMMAND_SUCCESS
10 |                     OUTPUT_VARIABLE pybind11_DIR
11 |                     OUTPUT_STRIP_TRAILING_WHITESPACE)
12 |     find_package(pybind11 CONFIG)
13 | endif()
14 | 
15 | pybind11_add_module(${PROJECT_NAME} bind.cpp)
16 | target_link_libraries(${PROJECT_NAME} PRIVATE LlamaTritonBackend)
17 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
18 | 
19 | set(_INSTALL_CUDA_RPATH
20 |     "\$ORIGIN"
21 |     "\$ORIGIN/../../nvidia/nccl/lib/"
22 |     "\$ORIGIN/../../nvidia/cuda_runtime/lib/"
23 |     "\$ORIGIN/../../nvidia/cublas/lib/"
24 |     "\$ORIGIN/../../nvidia/curand/lib/"
25 | )
26 | set_target_properties(${PROJECT_NAME} PROPERTIES
27 |     BUILD_RPATH "\$ORIGIN"
28 |     INSTALL_RPATH "${_INSTALL_CUDA_RPATH}"
29 | )
30 | 


--------------------------------------------------------------------------------
/src/turbomind/triton_backend/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | 
2 | add_subdirectory(llama)
3 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/constant.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | namespace turbomind {
 6 | 
 7 | const int kMaxLogProb = 1024;
 8 | 
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/cuda_bf16_wrapper.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #ifdef ENABLE_BF16
20 | #include <cuda_bf16.h>
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/debug_utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #if __has_include("3rdparty/dbg.h")
4 | #include "3rdparty/dbg.h"
5 | #else
6 | #define dbg(...)
7 | #endif
8 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/dispatch.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #pragma once
 4 | 
 5 | #include <utility>
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | namespace detail {
10 | 
11 | template<int X>
12 | inline constexpr std::integral_constant<int, X> _Int{};
13 | 
14 | template<class F, class P, class G, int... Xs, std::size_t... Is>
15 | bool dispatch_impl(F&& f, P&& p, G g, std::integer_sequence<int, Xs...>, std::index_sequence<Is...>)
16 | {
17 |     constexpr int N = sizeof...(Xs);
18 |     return (((((P &&) p)(_Int<Xs>) || (g && Is == N - 1)) && (((F &&) f)(_Int<Xs>), 1)) || ...);
19 | }
20 | 
21 | }  // namespace detail
22 | 
23 | template<class F, class P, int... Is, class G = std::true_type>
24 | bool dispatch(std::integer_sequence<int, Is...> seq, P&& p, F&& f, G g = {})
25 | {
26 |     return detail::dispatch_impl((F &&) f, (P &&) p, g, seq, std::make_index_sequence<sizeof...(Is)>{});
27 | }
28 | 
29 | template<class F, int... Is, class G = std::true_type>
30 | bool dispatch(std::integer_sequence<int, Is...> seq, F&& f)
31 | {
32 |     return (((F &&) f)(detail::_Int<Is>) || ...);
33 | }
34 | 
35 | }  // namespace turbomind
36 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/memory_utils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cuda_runtime.h>
20 | 
21 | namespace turbomind {
22 | 
23 | template<typename T>
24 | void invokeInPlaceTranspose102(
25 |     T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0);
26 | 
27 | }  // namespace turbomind
28 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/monotonic.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdint>
 4 | #include <cstdlib>
 5 | #include <utility>
 6 | 
 7 | namespace turbomind {
 8 | 
 9 | class Monotonic {
10 | public:
11 |     Monotonic(void* base, size_t alignment = 256): ptr_{base}, alignment_{alignment}
12 |     {
13 |         ptr_ = align(ptr_);
14 |     }
15 | 
16 |     template<class T>
17 |     void operator()(T** ptr, size_t numel) noexcept
18 |     {
19 |         *ptr = (T*)std::exchange(ptr_, align((T*)ptr_ + numel));
20 |     }
21 | 
22 |     void* ptr() const noexcept
23 |     {
24 |         return ptr_;
25 |     }
26 | 
27 | private:
28 |     template<class T>
29 |     void* align(T* p)
30 |     {
31 |         static_assert(sizeof(T*) == sizeof(uintptr_t));
32 |         auto x = reinterpret_cast<uintptr_t>(p);
33 |         if (auto remainder = x % alignment_) {
34 |             x += alignment_ - remainder;
35 |         }
36 |         return reinterpret_cast<void*>(x);
37 |     }
38 | 
39 |     void*  ptr_;
40 |     size_t alignment_;
41 | };
42 | 
43 | }  // namespace turbomind
44 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/parser.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) OpenMMLab. All rights reserved.
 2 | 
 3 | #include <iostream>
 4 | #include <regex>
 5 | #include <string>
 6 | #include <vector>
 7 | 
 8 | namespace turbomind {
 9 | 
10 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str)
11 | {
12 |     const std::regex regex(R"((\w+)=([^,\[\(]+|\[.*\]|\(.*\)))");
13 | 
14 |     std::sregex_iterator beg(str.begin(), str.end(), regex);
15 |     std::sregex_iterator end{};
16 | 
17 |     std::vector<std::pair<std::string, std::string>> ret;
18 |     for (auto it = beg; it != end; ++it) {
19 |         std::smatch match = *it;
20 |         ret.emplace_back(match[1], match[2]);
21 |     }
22 | 
23 |     return ret;
24 | }
25 | 
26 | std::vector<std::string> ParseListOrTuple(const std::string& str)
27 | {
28 |     const std::regex regex(R"([,\[\]\(\)]+)");
29 | 
30 |     std::vector<std::string> ret;
31 |     std::copy_if(std::sregex_token_iterator(str.begin(), str.end(), regex, -1),
32 |                  std::sregex_token_iterator{},
33 |                  std::back_inserter(ret),
34 |                  [](const std::string& s) { return !s.empty(); });
35 | 
36 |     return ret;
37 | }
38 | 
39 | }  // namespace turbomind
40 | 


--------------------------------------------------------------------------------
/src/turbomind/utils/parser.h:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | #include <vector>
 3 | 
 4 | namespace turbomind {
 5 | 
 6 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str);
 7 | 
 8 | std::vector<std::string> ParseListOrTuple(const std::string& str);
 9 | 
10 | inline void Parse(int& value, const std::string& str)
11 | {
12 |     value = std::stoi(str);
13 | }
14 | 
15 | inline void Parse(float& value, const std::string& str)
16 | {
17 |     value = std::stof(str);
18 | }
19 | 
20 | template<class T>
21 | void Parse(std::vector<T>& xs, const std::string& str)
22 | {
23 |     const auto ss = ParseListOrTuple(str);
24 |     for (const auto& s : ss) {
25 |         xs.emplace_back();
26 |         Parse(xs.back(), s);
27 |     }
28 | }
29 | 
30 | }  // namespace turbomind
31 | 


--------------------------------------------------------------------------------
/tests/csrc/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | add_subdirectory(unittests)
16 | 


--------------------------------------------------------------------------------
/tests/pytorch/kernel/test_activation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | 
 4 | 
 5 | class TestSiluAndMul:
 6 | 
 7 |     @pytest.fixture
 8 |     def seqlen(self, request):
 9 |         yield request.param
10 | 
11 |     @pytest.fixture
12 |     def feat_size(self, request):
13 |         yield request.param
14 | 
15 |     @pytest.fixture
16 |     def x(self, seqlen, feat_size):
17 |         yield torch.rand(seqlen, feat_size, dtype=torch.float16, device='cuda')
18 | 
19 |     @pytest.fixture
20 |     def gt(self, x):
21 |         gate, up = x.chunk(2, -1)
22 |         gate = torch.nn.functional.silu(gate)
23 |         yield gate * up
24 | 
25 |     @pytest.mark.parametrize('seqlen', [65536, 256], indirect=True)
26 |     @pytest.mark.parametrize('feat_size', [4096, 768], indirect=True)
27 |     def test_silu_and_mul(self, x, gt):
28 |         from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul
29 | 
30 |         out = silu_and_mul(x)
31 |         torch.testing.assert_close(out, gt)
32 | 


--------------------------------------------------------------------------------
/tests/pytorch/tools/test_layout_convert.py:
--------------------------------------------------------------------------------
 1 | # yapf: disable
 2 | import pytest
 3 | import torch
 4 | 
 5 | from lmdeploy.pytorch.tools.layout_convert import batch_tensor, continuous_tensor
 6 | 
 7 | # yapf: enable
 8 | 
 9 | 
10 | class TestContinuous:
11 | 
12 |     @pytest.fixture
13 |     def batched_tensor(self):
14 |         yield torch.tensor([[1, 2, 3, 0, 0], [4, 5, 6, 7, 8], [9, 10, 0, 0, 0]])
15 | 
16 |     @pytest.fixture
17 |     def seq_len(self):
18 |         yield torch.tensor([3, 5, 2])
19 | 
20 |     @pytest.fixture
21 |     def conti_tensor(self):
22 |         yield torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
23 | 
24 |     def test_conti_tensor(self, batched_tensor, seq_len, conti_tensor):
25 |         conti_out = continuous_tensor(batched_tensor, seq_len)
26 |         torch.testing.assert_close(conti_out, conti_tensor)
27 | 
28 |         batched_out = batch_tensor(conti_tensor, seq_len)
29 |         torch.testing.assert_close(batched_out, batched_tensor)
30 | 


--------------------------------------------------------------------------------
/tests/test_lmdeploy/test_async_engine.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | from lmdeploy.serve.async_engine import get_names_from_model
 5 | 
 6 | 
 7 | def test_get_names_from_hf_model():
 8 |     cases = [
 9 |         # model repo_id from huggingface hub, model_name, chat_template_name
10 |         ('InternLM/internlm2_5-7b-chat', 'internlm2.5-7b-chat', 'internlm2'),
11 |         ('InternLM/internlm2_5-7b-chat', None, 'internlm2'),
12 |     ]
13 |     for model_path, model_name, chat_template in cases:
14 |         _model_name, _chat_template = get_names_from_model(model_path=model_path, model_name=model_name)
15 |         assert _chat_template == chat_template
16 |         assert _model_name == model_name if model_name else model_path
17 | 
18 | 
19 | def test_get_names_from_turbomind_model():
20 |     workspace = tempfile.TemporaryDirectory('internlm2_5-7b-chat').name
21 |     os.makedirs(os.path.join(workspace, 'triton_models', 'weights'), exist_ok=True)
22 | 
23 |     import yaml
24 | 
25 |     expected_chat_template = 'internlm2'
26 |     config = dict(model_config=dict(chat_template=expected_chat_template))
27 |     with open(f'{workspace}/triton_models/weights/config.yaml', 'w') as f:
28 |         yaml.safe_dump(config, f)
29 | 
30 |     _, chat_template = get_names_from_model(workspace)
31 |     assert chat_template == expected_chat_template
32 | 


--------------------------------------------------------------------------------
/tests/test_lmdeploy/test_messages.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | import pytest
 4 | 
 5 | from lmdeploy import GenerationConfig, Tokenizer
 6 | from lmdeploy.utils import get_hf_gen_cfg
 7 | 
 8 | 
 9 | def test_engine_generation_config():
10 |     tokenizer = Tokenizer('internlm/internlm-chat-7b')
11 |     config = GenerationConfig(n=3, stop_words=['<eoa>'])
12 |     stop_token_ids = tokenizer.encode('<eoa>', add_bos=False)
13 |     config.convert_stop_bad_words_to_ids(tokenizer)
14 |     assert stop_token_ids == config.stop_token_ids
15 |     assert isinstance(config.stop_token_ids, List) and \
16 |         isinstance(config.stop_token_ids[0], int)
17 | 
18 | 
19 | @pytest.mark.parametrize('model_path', [
20 |     'deepseek-ai/DeepSeek-V3',
21 |     'Qwen/Qwen2.5-32B-Instruct',
22 |     'internlm/internlm3-8b-instruct',
23 | ])
24 | def test_update_from_hf_gen_cfg(model_path):
25 |     tokenizer = Tokenizer(model_path)
26 |     model_cfg = get_hf_gen_cfg(model_path)
27 | 
28 |     generation_config = GenerationConfig()
29 |     generation_config.update_from_hf_gen_cfg(model_cfg, tokenizer.eos_token_id)
30 |     assert generation_config.stop_token_ids is not None
31 | 


--------------------------------------------------------------------------------
/tests/test_lmdeploy/test_utils.py:
--------------------------------------------------------------------------------
 1 | # yapf: disable
 2 | from transformers import AutoConfig
 3 | 
 4 | from lmdeploy.turbomind.deploy.config import ModelConfig, TurbomindModelConfig, config_from_dict
 5 | from lmdeploy.utils import _get_and_verify_max_len
 6 | 
 7 | # yapf: enable
 8 | 
 9 | 
10 | def test_get_and_verify_max_len():
11 |     # with PretrainedConfig
12 |     config = AutoConfig.from_pretrained('OpenGVLab/InternVL-Chat-V1-5-AWQ', trust_remote_code=True)
13 |     assert (_get_and_verify_max_len(config, None) == 32768)
14 |     assert (_get_and_verify_max_len(config, 1024) == 1024)
15 |     assert (_get_and_verify_max_len(config, 102400) == 102400)
16 | 
17 |     # with PretrainedConfig
18 |     config = AutoConfig.from_pretrained('internlm/internlm2-chat-7b', trust_remote_code=True)
19 |     assert (_get_and_verify_max_len(config, None) == 32768)
20 |     assert (_get_and_verify_max_len(config, 1024) == 1024)
21 |     assert (_get_and_verify_max_len(config, 102400) == 102400)
22 | 
23 |     # with TurbomindModelConfig
24 |     config = config_from_dict(TurbomindModelConfig, {})
25 |     config.model_config = config_from_dict(ModelConfig, dict(session_len=4096))
26 |     assert (_get_and_verify_max_len(config, None) == config.session_len)
27 |     assert (_get_and_verify_max_len(config, 1024) == 1024)
28 | 


--------------------------------------------------------------------------------
/tests/test_lmdeploy/test_vl/test_vl_encode.py:
--------------------------------------------------------------------------------
 1 | # yapf: disable
 2 | from lmdeploy.vl.utils import encode_image_base64, load_image, load_image_from_base64
 3 | 
 4 | # yapf: enable
 5 | 
 6 | 
 7 | def test_encode_image_base64():
 8 |     url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg'  # noqa E501
 9 |     im1 = load_image(url)
10 |     base64 = encode_image_base64(url)
11 |     im2 = load_image_from_base64(base64)
12 |     assert im1 == im2.convert('RGB')
13 | 
14 | 
15 | def test_load_truncated_image():
16 |     url = 'https://github.com/irexyc/lmdeploy/releases/download/v0.0.1/tr.jpeg'
17 |     im = load_image(url)
18 |     assert im.width == 1638
19 |     assert im.height == 2048
20 | 
21 | 
22 | def test_load_invalid_url():
23 |     url = ('https://raw.githubusercontent.com/open-mmlab/'
24 |            'mmdeploy/main/tests/data/tiger.jpeg')
25 |     # invalid
26 |     im1 = load_image(url[:-1])
27 |     assert im1.width == 32
28 |     assert im1.height == 32
29 |     # valid
30 |     im2 = load_image(url)
31 |     assert im2.height == 182
32 |     assert im2.width == 278
33 | 
34 | 
35 | def test_load_invalid_base64():
36 |     base64 = 'data:image/jpeg;base64,xxx'
37 |     im = load_image(base64)
38 |     assert im.width == 32
39 |     assert im.height == 32
40 | 


--------------------------------------------------------------------------------