├── .clang-format
├── .github
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE
│ ├── 1-bug-report.yml
│ ├── 2-feature-request.yml
│ └── 3-documentation.yml
├── md-link-config.json
├── pull_request_template.md
├── release.yml
├── scripts
│ ├── action_tools.py
│ ├── check_lmdeploy.py
│ ├── doc_link_checker.py
│ ├── eval_base_config.py
│ ├── eval_chat_config.py
│ ├── eval_regression_base_models.py
│ ├── eval_regression_chat_models.py
│ ├── eval_stable_object_config.py
│ └── eval_stable_subject_config.py
└── workflows
│ ├── benchmark.yml
│ ├── cuda11.8-whl-release.yml
│ ├── daily_ete_test.yml
│ ├── daily_ete_test_3090.yml
│ ├── docker.yml
│ ├── evaluate.yml
│ ├── evaluate_remote.yml
│ ├── lint.yml
│ ├── linux-x64-gpu.yml
│ ├── pr_ete_test.yml
│ ├── pr_full_test.yml
│ ├── pypi.yml
│ ├── stable.yml
│ ├── stale.yml
│ ├── unit-test.yml
│ └── windows-x64-gpu.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_ja.md
├── README_zh-CN.md
├── autotest
├── benchmark
│ ├── test_apiserver_performance.py
│ ├── test_generation_performance.py
│ └── test_throughput_performance.py
├── chat_prompt_case.yaml
├── config-3090.yaml
├── config-pr.yaml
├── config.yaml
├── conftest.py
├── interface
│ ├── pipeline
│ │ ├── test_pipeline_func.py
│ │ └── test_pipeline_longtext_func.py
│ └── restful
│ │ ├── test_restful_chat_func.py
│ │ └── test_restful_completions_v1.py
├── prompt_case.yaml
├── pytest.ini
├── template.json
├── toolchain
│ └── test_lagent.py
├── tools
│ ├── chat
│ │ ├── test_command_chat_hf_pytorch.py
│ │ └── test_command_chat_hf_turbomind.py
│ ├── pipeline
│ │ ├── llm_case.py
│ │ ├── mllm_case.py
│ │ ├── test_pipeline_chat_pytorch_llm.py
│ │ ├── test_pipeline_chat_pytorch_mllm.py
│ │ ├── test_pipeline_chat_turbomind_llm.py
│ │ └── test_pipeline_chat_turbomind_mllm.py
│ ├── quantization
│ │ ├── test_quantization_awq.py
│ │ └── test_quantization_w8a8.py
│ └── restful
│ │ ├── test_restful_chat_hf_pytorch_llm.py
│ │ ├── test_restful_chat_hf_pytorch_mllm.py
│ │ ├── test_restful_chat_hf_turbomind_llm.py
│ │ └── test_restful_chat_hf_turbomind_mllm.py
└── utils
│ ├── benchmark_utils.py
│ ├── config_utils.py
│ ├── get_run_config.py
│ ├── mp_log_utils.py
│ ├── pipeline_chat.py
│ ├── quantization_utils.py
│ ├── restful_return_check.py
│ ├── rule_condition_assert.py
│ ├── run_client_chat.py
│ └── run_restful_chat.py
├── benchmark
├── README.md
├── benchmark_decode.py
├── benchmark_serving.py
├── lmdeploy.yml
├── profile_generation.py
├── profile_pipeline_api.py
├── profile_restful_api.py
└── profile_throughput.py
├── builder
├── manywheel
│ ├── Dockerfile_2014
│ ├── README.md
│ ├── build_all_docker.sh
│ ├── build_all_wheel.sh
│ ├── build_docker.sh
│ ├── build_wheel.sh
│ ├── entrypoint_build.sh
│ └── scripts
│ │ ├── install_conda.sh
│ │ ├── install_cuda.sh
│ │ └── install_openmpi.sh
└── windows
│ ├── README.md
│ ├── generate.ps1
│ └── setup_cuda.ps1
├── cmake
├── Modules
│ ├── FindCUDNN.cmake
│ └── FindNCCL.cmake
├── TritonTurboMindBackendConfig.cmake.in
└── TurboMindConfig.cmake.in
├── debug.sh
├── docker
├── Dockerfile
├── Dockerfile_Hopper
├── Dockerfile_aarch64_ascend
├── InternVL_Dockerfile
└── Qwen2VL_Dockerfile
├── docs
├── en
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── _static
│ │ ├── css
│ │ │ └── readthedocs.css
│ │ └── image
│ │ │ └── lmdeploy-logo.svg
│ ├── advance
│ │ ├── chat_template.md
│ │ ├── debug_turbomind.md
│ │ ├── long_context.md
│ │ ├── pytorch_multinodes.md
│ │ ├── pytorch_multithread.md
│ │ ├── pytorch_new_model.md
│ │ ├── pytorch_profiling.md
│ │ └── structed_output.md
│ ├── api
│ │ └── pipeline.rst
│ ├── benchmark
│ │ ├── a100_fp16.md
│ │ ├── benchmark.md
│ │ └── evaluate_with_opencompass.md
│ ├── conf.py
│ ├── faq.md
│ ├── get_started
│ │ ├── ascend
│ │ │ └── get_started.md
│ │ ├── get_started.md
│ │ ├── index.rst
│ │ └── installation.md
│ ├── index.rst
│ ├── inference
│ │ ├── load_hf.md
│ │ ├── pytorch.md
│ │ ├── turbomind.md
│ │ └── turbomind_config.md
│ ├── llm
│ │ ├── api_server.md
│ │ ├── api_server_lora.md
│ │ ├── api_server_reasoning.md
│ │ ├── api_server_tools.md
│ │ ├── codellama.md
│ │ ├── gradio.md
│ │ ├── pipeline.md
│ │ └── proxy_server.md
│ ├── make.bat
│ ├── multi_modal
│ │ ├── api_server_vl.md
│ │ ├── cogvlm.md
│ │ ├── deepseek_vl2.md
│ │ ├── gemma3.md
│ │ ├── index.rst
│ │ ├── internvl.md
│ │ ├── llava.md
│ │ ├── minicpmv.md
│ │ ├── mllama.md
│ │ ├── molmo.md
│ │ ├── phi3.md
│ │ ├── qwen2_5_vl.md
│ │ ├── qwen2_vl.md
│ │ ├── vl_pipeline.md
│ │ └── xcomposer2d5.md
│ ├── quantization
│ │ ├── kv_quant.md
│ │ ├── w4a16.md
│ │ └── w8a8.md
│ └── supported_models
│ │ └── supported_models.md
└── zh_cn
│ ├── .readthedocs.yaml
│ ├── Makefile
│ ├── _static
│ ├── css
│ │ └── readthedocs.css
│ └── image
│ │ └── lmdeploy-logo.svg
│ ├── advance
│ ├── chat_template.md
│ ├── debug_turbomind.md
│ ├── long_context.md
│ ├── pytorch_multinodes.md
│ ├── pytorch_multithread.md
│ ├── pytorch_new_model.md
│ ├── pytorch_profiling.md
│ └── structed_output.md
│ ├── api
│ └── pipeline.rst
│ ├── benchmark
│ ├── benchmark.md
│ └── evaluate_with_opencompass.md
│ ├── conf.py
│ ├── faq.md
│ ├── get_started
│ ├── ascend
│ │ └── get_started.md
│ ├── get_started.md
│ ├── index.rst
│ └── installation.md
│ ├── index.rst
│ ├── inference
│ ├── load_hf.md
│ ├── pytorch.md
│ ├── turbomind.md
│ └── turbomind_config.md
│ ├── llm
│ ├── api_server.md
│ ├── api_server_lora.md
│ ├── api_server_reasoning.md
│ ├── api_server_tools.md
│ ├── codellama.md
│ ├── gradio.md
│ ├── pipeline.md
│ └── proxy_server.md
│ ├── make.bat
│ ├── multi_modal
│ ├── api_server_vl.md
│ ├── cogvlm.md
│ ├── deepseek_vl2.md
│ ├── gemma3.md
│ ├── index.rst
│ ├── internvl.md
│ ├── llava.md
│ ├── minicpmv.md
│ ├── mllama.md
│ ├── molmo.md
│ ├── phi3.md
│ ├── qwen2_5_vl.md
│ ├── qwen2_vl.md
│ ├── vl_pipeline.md
│ └── xcomposer2d5.md
│ ├── quantization
│ ├── kv_quant.md
│ ├── w4a16.md
│ └── w8a8.md
│ └── supported_models
│ └── supported_models.md
├── generate.sh
├── k8s
├── deployment.yaml
└── service.yaml
├── lmdeploy
├── __init__.py
├── __main__.py
├── api.py
├── archs.py
├── cli
│ ├── __init__.py
│ ├── cli.py
│ ├── entrypoint.py
│ ├── lite.py
│ ├── serve.py
│ └── utils.py
├── lite
│ ├── __init__.py
│ ├── apis
│ │ ├── __init__.py
│ │ ├── auto_awq.py
│ │ ├── calibrate.py
│ │ ├── get_small_sharded_hf.py
│ │ ├── gptq.py
│ │ ├── kv_qparams.py
│ │ └── smooth_quant.py
│ ├── defaults.py
│ ├── modeling
│ │ ├── __init__.py
│ │ ├── internlm2_gptq.py
│ │ └── internlm3_gptq.py
│ ├── quantization
│ │ ├── __init__.py
│ │ ├── activation
│ │ │ ├── __init__.py
│ │ │ └── observer.py
│ │ ├── awq.py
│ │ ├── calibration.py
│ │ ├── modules
│ │ │ ├── __init__.py
│ │ │ └── linear.py
│ │ └── weight
│ │ │ ├── __init__.py
│ │ │ └── quantizer.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── batch_split.py
│ │ ├── cal_qparams.py
│ │ ├── calib_dataloader.py
│ │ ├── collect.py
│ │ ├── global_avail.py
│ │ ├── load.py
│ │ └── memory_efficient.py
├── logger.py
├── messages.py
├── model.py
├── profiler.py
├── pytorch
│ ├── __init__.py
│ ├── accel.py
│ ├── adapter
│ │ ├── __init__.py
│ │ └── adapter.py
│ ├── backends
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── apply_rotary_emb.py
│ │ ├── attention.py
│ │ ├── awq_modules.py
│ │ ├── base.py
│ │ ├── blockedf8_modules.py
│ │ ├── cuda
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── apply_rotary_emb.py
│ │ │ ├── attention.py
│ │ │ ├── awq_modules.py
│ │ │ ├── blockedf8_modules.py
│ │ │ ├── flash_attention.py
│ │ │ ├── graph_runner.py
│ │ │ ├── lora.py
│ │ │ ├── moe.py
│ │ │ ├── multinomial_sampling.py
│ │ │ ├── norm.py
│ │ │ ├── op_backend.py
│ │ │ ├── qmodules.py
│ │ │ ├── token_dispatcher.py
│ │ │ └── warmup_manager.py
│ │ ├── default
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── apply_rotary_emb.py
│ │ │ ├── awq_modules.py
│ │ │ ├── linear.py
│ │ │ ├── moe.py
│ │ │ ├── multinomial_sampling.py
│ │ │ ├── norm.py
│ │ │ ├── op_backend.py
│ │ │ ├── rotary_embedding.py
│ │ │ └── token_dispatcher.py
│ │ ├── dlinfer
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── apply_rotary_emb.py
│ │ │ ├── ascend
│ │ │ │ ├── __init__.py
│ │ │ │ ├── graph_runner.py
│ │ │ │ └── op_backend.py
│ │ │ ├── attention.py
│ │ │ ├── awq_modules.py
│ │ │ ├── camb
│ │ │ │ ├── __init__.py
│ │ │ │ └── op_backend.py
│ │ │ ├── flash_attention.py
│ │ │ ├── linear.py
│ │ │ ├── maca
│ │ │ │ ├── __init__.py
│ │ │ │ └── op_backend.py
│ │ │ ├── moe.py
│ │ │ ├── norm.py
│ │ │ ├── op_backend.py
│ │ │ ├── qmodules.py
│ │ │ └── rotary_embedding.py
│ │ ├── flash_attention.py
│ │ ├── graph_runner.py
│ │ ├── linear.py
│ │ ├── lora.py
│ │ ├── moe.py
│ │ ├── multinomial_sampling.py
│ │ ├── norm.py
│ │ ├── qmodules.py
│ │ ├── rotary_embedding.py
│ │ ├── selector.py
│ │ └── token_dispatcher.py
│ ├── block.py
│ ├── chat.py
│ ├── check_env
│ │ ├── __init__.py
│ │ ├── adapter.py
│ │ ├── base.py
│ │ ├── deeplink.py
│ │ ├── dist.py
│ │ ├── model.py
│ │ ├── torch.py
│ │ ├── transformers.py
│ │ ├── triton.py
│ │ └── triton_custom_add.py
│ ├── config.py
│ ├── configurations
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── chatglm.py
│ │ ├── cogvlm.py
│ │ ├── deepseek_v2.py
│ │ ├── deepseek_vl2.py
│ │ ├── default.py
│ │ ├── gemma.py
│ │ ├── internvl.py
│ │ ├── llama4.py
│ │ ├── llava_hf.py
│ │ ├── minicpm3.py
│ │ ├── mllama.py
│ │ ├── qwen.py
│ │ └── utils.py
│ ├── devices
│ │ ├── __init__.py
│ │ └── device_manager.py
│ ├── disagg
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── backend
│ │ │ ├── __init__.py
│ │ │ ├── backend.py
│ │ │ ├── base.py
│ │ │ ├── dlslime.py
│ │ │ ├── infinistore.py
│ │ │ └── mooncake.py
│ │ ├── config.py
│ │ ├── conn.py
│ │ ├── messages.py
│ │ └── request.py
│ ├── distributed.py
│ ├── engine
│ │ ├── __init__.py
│ │ ├── cache_engine.py
│ │ ├── engine.py
│ │ ├── engine_checker.py
│ │ ├── engine_instance.py
│ │ ├── executor
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── base_worker.py
│ │ │ ├── dist_utils.py
│ │ │ ├── mp_executor.py
│ │ │ ├── ray_executor.py
│ │ │ └── uni_executor.py
│ │ ├── guided_process.py
│ │ ├── input_process.py
│ │ ├── logits_process.py
│ │ ├── model_agent.py
│ │ └── request.py
│ ├── envs.py
│ ├── kernels
│ │ ├── __init__.py
│ │ ├── alibi_pagedattention.py
│ │ ├── apply_rotary_pos_emb.py
│ │ ├── cuda
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── alibi_pagedattention.py
│ │ │ ├── apply_rotary_pos_emb.py
│ │ │ ├── awq_kernels.py
│ │ │ ├── blocked_fp8_fused_moe.py
│ │ │ ├── blocked_gemm_fp8.py
│ │ │ ├── ep_moe.py
│ │ │ ├── fill_kv_cache.py
│ │ │ ├── flash_mla.py
│ │ │ ├── flashattention.py
│ │ │ ├── flatten_kv_cache.py
│ │ │ ├── fused_lora.py
│ │ │ ├── fused_moe.py
│ │ │ ├── fused_rotary_emb.py
│ │ │ ├── multinomial_sampling.py
│ │ │ ├── pagedattention.py
│ │ │ ├── rms_norm.py
│ │ │ ├── triton_utils.py
│ │ │ ├── utils.py
│ │ │ ├── w8a8_fused_moe.py
│ │ │ └── w8a8_triton_kernels.py
│ │ ├── default
│ │ │ ├── __init__.py
│ │ │ ├── multinomial_sampling.py
│ │ │ └── w8a8_kernels.py
│ │ ├── dispatcher.py
│ │ ├── dlinfer
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── apply_rotary_pos_emb.py
│ │ │ ├── awq_kernels.py
│ │ │ ├── fill_kv_cache.py
│ │ │ ├── flash_attention.py
│ │ │ ├── fused_moe.py
│ │ │ ├── fused_rotary_emb.py
│ │ │ ├── linear.py
│ │ │ ├── moe_gating_topk_softmax.py
│ │ │ ├── pagedattention.py
│ │ │ ├── rms_norm.py
│ │ │ └── w8a8_kernels.py
│ │ ├── fill_kv_cache.py
│ │ ├── flash_mla.py
│ │ ├── fused_moe.py
│ │ ├── fused_rotary_emb.py
│ │ ├── moe_gating_topk_softmax.py
│ │ ├── multinomial_sampling.py
│ │ ├── pagedattention.py
│ │ ├── rms_norm.py
│ │ └── w8a8_triton_kernels.py
│ ├── messages.py
│ ├── model_inputs.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── baichuan.py
│ │ ├── chatglm2.py
│ │ ├── cogvlm.py
│ │ ├── deepseek.py
│ │ ├── deepseek_v2.py
│ │ ├── deepseek_vl2.py
│ │ ├── gemma.py
│ │ ├── gemma3_vl.py
│ │ ├── internlm.py
│ │ ├── internlm2.py
│ │ ├── internlm2_reward.py
│ │ ├── internlm2_ve.py
│ │ ├── internlm3.py
│ │ ├── internvl.py
│ │ ├── internvl_patch.py
│ │ ├── llama.py
│ │ ├── llama4.py
│ │ ├── llava.py
│ │ ├── minicpm3.py
│ │ ├── minicpmv26.py
│ │ ├── mistral.py
│ │ ├── mixtral.py
│ │ ├── mllama.py
│ │ ├── module_map.py
│ │ ├── patch.py
│ │ ├── phi3.py
│ │ ├── phi3_moe.py
│ │ ├── phi3_v.py
│ │ ├── q_modules.py
│ │ ├── qwen.py
│ │ ├── qwen2.py
│ │ ├── qwen2_5_vl.py
│ │ ├── qwen2_moe.py
│ │ ├── qwen2_reward.py
│ │ ├── qwen2_vl.py
│ │ ├── qwen3.py
│ │ ├── qwen3_moe.py
│ │ ├── siglip.py
│ │ ├── starcoder2.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── cudagraph.py
│ │ │ ├── micro_batch.py
│ │ │ ├── model.py
│ │ │ └── multimodal.py
│ ├── multimodal
│ │ ├── __init__.py
│ │ ├── data_type.py
│ │ └── image_type.py
│ ├── nn
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── attention.py
│ │ ├── linear.py
│ │ ├── moe.py
│ │ ├── multinomial_sampling.py
│ │ ├── norm.py
│ │ ├── rotary_embedding.py
│ │ └── utils.py
│ ├── paging
│ │ ├── __init__.py
│ │ ├── block_manager
│ │ │ ├── __init__.py
│ │ │ ├── base_block_manager.py
│ │ │ ├── default_block_manager.py
│ │ │ └── window_block_manager.py
│ │ ├── block_trie.py
│ │ ├── eviction_helper
│ │ │ ├── __init__.py
│ │ │ ├── base_eviction_helper.py
│ │ │ └── recompute_eviction_helper.py
│ │ └── scheduler.py
│ ├── supported_models.py
│ ├── tools
│ │ ├── __init__.py
│ │ ├── layout_convert.py
│ │ ├── make_inputs.py
│ │ └── utils.py
│ ├── utils.py
│ └── weight_loader
│ │ ├── __init__.py
│ │ └── model_weight_loader.py
├── serve
│ ├── __init__.py
│ ├── async_engine.py
│ ├── gradio
│ │ ├── __init__.py
│ │ ├── api_server_backend.py
│ │ ├── app.py
│ │ ├── constants.py
│ │ ├── turbomind_coupled.py
│ │ └── vl.py
│ ├── openai
│ │ ├── __init__.py
│ │ ├── api_client.py
│ │ ├── api_server.py
│ │ ├── launch_server.py
│ │ ├── protocol.py
│ │ ├── reasoning_parser
│ │ │ ├── __init__.py
│ │ │ ├── deepseek_r1_reasoning_parser.py
│ │ │ ├── qwen_qwq_reasoning_parser.py
│ │ │ └── reasoning_parser.py
│ │ └── tool_parser
│ │ │ ├── __init__.py
│ │ │ ├── internlm2_parser.py
│ │ │ ├── llama3_parser.py
│ │ │ ├── qwen2d5_parser.py
│ │ │ ├── tool_parser.py
│ │ │ └── utils.py
│ ├── proxy
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ └── proxy.py
│ ├── turbomind
│ │ ├── __init__.py
│ │ └── triton_python_backend
│ │ │ ├── README.md
│ │ │ ├── client.py
│ │ │ ├── config.pbtxt
│ │ │ └── model.py
│ ├── utils.py
│ └── vl_async_engine.py
├── tokenizer.py
├── turbomind
│ ├── __init__.py
│ ├── chat.py
│ ├── deploy
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── converter.py
│ │ ├── loader.py
│ │ ├── module.py
│ │ ├── parameter.py
│ │ ├── policy.py
│ │ ├── source_model
│ │ │ ├── __init__.py
│ │ │ ├── baichuan.py
│ │ │ ├── base.py
│ │ │ ├── deepseek2.py
│ │ │ ├── deepseek_vl.py
│ │ │ ├── glm4.py
│ │ │ ├── internlm2.py
│ │ │ ├── internvl.py
│ │ │ ├── llama.py
│ │ │ ├── llava.py
│ │ │ ├── minicpmv.py
│ │ │ ├── mixtral.py
│ │ │ ├── molmo.py
│ │ │ ├── qwen.py
│ │ │ └── xcomposer2.py
│ │ └── target_model
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── fp.py
│ ├── generate_gemm_config.py
│ ├── supported_models.py
│ ├── turbomind.py
│ └── utils.py
├── utils.py
├── version.py
└── vl
│ ├── __init__.py
│ ├── constants.py
│ ├── engine.py
│ ├── model
│ ├── __init__.py
│ ├── base.py
│ ├── builder.py
│ ├── cogvlm.py
│ ├── deepseek.py
│ ├── deepseek_vl2.py
│ ├── gemma3_vl.py
│ ├── glm_4v.py
│ ├── internvl.py
│ ├── internvl_llava.py
│ ├── llama4.py
│ ├── llava.py
│ ├── llava_hf.py
│ ├── llava_next.py
│ ├── minicpmv.py
│ ├── mllama.py
│ ├── molmo.py
│ ├── phi3_vision.py
│ ├── qwen.py
│ ├── qwen2.py
│ ├── utils.py
│ ├── xcomposer2.py
│ └── yi.py
│ ├── tools
│ ├── __init__.py
│ └── merge_xcomposer2d5_task.py
│ └── utils.py
├── requirements
├── build.txt
├── docs.txt
├── lite.txt
├── readthedocs.txt
├── runtime_ascend.txt
├── runtime_camb.txt
├── runtime_cuda.txt
├── runtime_maca.txt
├── serve.txt
└── test.txt
├── requirements_ascend.txt
├── requirements_camb.txt
├── requirements_cuda.txt
├── requirements_maca.txt
├── resources
└── batch_memory.png
├── setup.py
├── src
├── CMakeLists.txt
└── turbomind
│ ├── CMakeLists.txt
│ ├── comm
│ ├── CMakeLists.txt
│ ├── barrier.h
│ ├── cuda_ipc
│ │ ├── CMakeLists.txt
│ │ ├── allgather.cu
│ │ ├── allreduce.cu
│ │ ├── bootstrap.h
│ │ ├── cuda_ipc_comm.cu
│ │ ├── cuda_ipc_comm.h
│ │ ├── device_semaphore.h
│ │ ├── fused_allreduce.cu
│ │ ├── fused_allreduce_ex.cu
│ │ ├── group_sum.h
│ │ └── mscclpp.h
│ ├── device_comm.cc
│ ├── device_comm.h
│ ├── host_comm.cc
│ ├── host_comm.h
│ ├── nccl
│ │ ├── CMakeLists.txt
│ │ └── nccl.cu
│ ├── test_comm.cu
│ └── thread_comm.cc
│ ├── core
│ ├── CMakeLists.txt
│ ├── allocator.cc
│ ├── allocator.h
│ ├── buffer.cc
│ ├── buffer.h
│ ├── check.cc
│ ├── check.h
│ ├── common.h
│ ├── context.cc
│ ├── context.h
│ ├── core.h
│ ├── cuda_data_type.h
│ ├── data_type.h
│ ├── layout.cc
│ ├── layout.h
│ ├── module.cc
│ ├── module.h
│ ├── stream.cc
│ ├── stream.h
│ ├── tensor.cc
│ ├── tensor.cu
│ ├── tensor.h
│ └── test_core.cc
│ ├── engine
│ ├── CMakeLists.txt
│ ├── gateway.cc
│ ├── gateway.h
│ ├── model_request.cc
│ ├── model_request.h
│ ├── request.h
│ ├── request_queue.cc
│ ├── request_queue.h
│ └── signal_buffer.h
│ ├── kernels
│ ├── CMakeLists.txt
│ ├── activation_kernels.cu
│ ├── activation_kernels.h
│ ├── attention
│ │ ├── CMakeLists.txt
│ │ ├── arch.h
│ │ ├── attention.cu
│ │ ├── attention.h
│ │ ├── attention_config.h
│ │ ├── attention_params.h
│ │ ├── attention_template.h
│ │ ├── attention_universal.h
│ │ ├── block.h
│ │ ├── block_iterator.h
│ │ ├── codegen
│ │ │ ├── attention_sm70_128_f16.cu
│ │ │ ├── attention_sm70_64_f16.cu
│ │ │ ├── attention_sm75_128_f16.cu
│ │ │ ├── attention_sm75_64_f16.cu
│ │ │ ├── attention_sm80_128_bf16.cu
│ │ │ ├── attention_sm80_128_f16.cu
│ │ │ ├── attention_sm80_192.cu
│ │ │ ├── attention_sm80_64_bf16.cu
│ │ │ ├── attention_sm80_64_f16.cu
│ │ │ ├── decoding_sm70_128_f16_f16.cu
│ │ │ ├── decoding_sm70_128_f16_u4.cu
│ │ │ ├── decoding_sm70_128_f16_u8.cu
│ │ │ ├── decoding_sm70_64_f16_f16.cu
│ │ │ ├── decoding_sm70_64_f16_u4.cu
│ │ │ ├── decoding_sm70_64_f16_u8.cu
│ │ │ ├── decoding_sm75_128_f16_f16.cu
│ │ │ ├── decoding_sm75_128_f16_u4.cu
│ │ │ ├── decoding_sm75_128_f16_u8.cu
│ │ │ ├── decoding_sm75_64_f16_f16.cu
│ │ │ ├── decoding_sm75_64_f16_u4.cu
│ │ │ ├── decoding_sm75_64_f16_u8.cu
│ │ │ ├── decoding_sm80_128_bf16_bf16.cu
│ │ │ ├── decoding_sm80_128_bf16_u4.cu
│ │ │ ├── decoding_sm80_128_bf16_u8.cu
│ │ │ ├── decoding_sm80_128_f16_f16.cu
│ │ │ ├── decoding_sm80_128_f16_u4.cu
│ │ │ ├── decoding_sm80_128_f16_u8.cu
│ │ │ ├── decoding_sm80_192.cu
│ │ │ ├── decoding_sm80_64_bf16_bf16.cu
│ │ │ ├── decoding_sm80_64_bf16_u4.cu
│ │ │ ├── decoding_sm80_64_bf16_u8.cu
│ │ │ ├── decoding_sm80_64_f16_f16.cu
│ │ │ ├── decoding_sm80_64_f16_u4.cu
│ │ │ └── decoding_sm80_64_f16_u8.cu
│ │ ├── cta_map.h
│ │ ├── decoding.cu
│ │ ├── decoding.h
│ │ ├── decoding_config.h
│ │ ├── decoding_template.h
│ │ ├── impl.h
│ │ ├── impl_16816.h
│ │ ├── impl_1688.h
│ │ ├── impl_81616.h
│ │ ├── impl_884.h
│ │ ├── impl_m16n8.h
│ │ ├── impl_simt.h
│ │ ├── iterator.h
│ │ ├── iterator_sm70.h
│ │ ├── iterator_sm80.h
│ │ ├── kv_cache_utils_v2.cu
│ │ ├── kv_cache_utils_v2.h
│ │ ├── linear_iterator.h
│ │ ├── mainloop.h
│ │ ├── mainloop_sm70.h
│ │ ├── mainloop_sm80.h
│ │ ├── quantization.h
│ │ ├── reduce.cu
│ │ ├── reduce.h
│ │ ├── reduce_kernel.h
│ │ ├── reference.cu
│ │ ├── reference.h
│ │ ├── rotary_embedding.h
│ │ ├── test_attention.cu
│ │ ├── test_quant.cu
│ │ ├── test_utils.cu
│ │ ├── test_utils.h
│ │ ├── utils.cc
│ │ └── utils.h
│ ├── ban_bad_words.cu
│ ├── ban_bad_words.h
│ ├── core
│ │ ├── array.h
│ │ ├── array_ops.h
│ │ ├── common.h
│ │ ├── data_type.h
│ │ ├── layout.h
│ │ ├── math.h
│ │ ├── meta.h
│ │ ├── mma.h
│ │ ├── pipe_iter.h
│ │ ├── smem.h
│ │ ├── sub_byte_ptr.h
│ │ ├── sync.h
│ │ └── thread_map.h
│ ├── decoding_kernels.cu
│ ├── decoding_kernels.h
│ ├── flash_attention
│ │ ├── CMakeLists.txt
│ │ ├── flash_attention.cu
│ │ ├── flash_attention.h
│ │ ├── flash_attention2
│ │ │ ├── CMakeLists.txt
│ │ │ ├── README.md
│ │ │ ├── block_info.h
│ │ │ ├── flash.h
│ │ │ ├── flash_api.cpp
│ │ │ ├── flash_fwd_hdim128_bf16_sm80.cu
│ │ │ ├── flash_fwd_hdim128_fp16_sm80.cu
│ │ │ ├── flash_fwd_hdim256_bf16_sm80.cu
│ │ │ ├── flash_fwd_hdim256_fp16_sm80.cu
│ │ │ ├── flash_fwd_hdim32_bf16_sm80.cu
│ │ │ ├── flash_fwd_hdim32_fp16_sm80.cu
│ │ │ ├── flash_fwd_hdim64_bf16_sm80.cu
│ │ │ ├── flash_fwd_hdim64_fp16_sm80.cu
│ │ │ ├── flash_fwd_kernel.h
│ │ │ ├── flash_fwd_launch_template.h
│ │ │ ├── kernel_traits.h
│ │ │ ├── softmax.h
│ │ │ ├── static_switch.h
│ │ │ └── utils.h
│ │ └── fused_multi_head_attention
│ │ │ ├── CMakeLists.txt
│ │ │ ├── llama_flash_attention_kernel.cu
│ │ │ ├── mma_accum_lambda_iterator.h
│ │ │ └── tile_smem_loader.h
│ ├── gemm
│ │ ├── CMakeLists.txt
│ │ ├── arch.h
│ │ ├── arch
│ │ │ ├── config_simt.h
│ │ │ ├── config_sm70_s884.h
│ │ │ ├── config_sm75_s16816.h
│ │ │ ├── config_sm80_s16816.h
│ │ │ ├── mma_simt.h
│ │ │ ├── mma_sm70.h
│ │ │ ├── mma_sm80.h
│ │ │ ├── operand_simt.h
│ │ │ ├── operand_sm70_s884.h
│ │ │ ├── operand_sm80_s16816.h
│ │ │ ├── smem_copy_simt.h
│ │ │ ├── smem_copy_sm70.h
│ │ │ └── smem_copy_sm80.h
│ │ ├── cast.cu
│ │ ├── cast.h
│ │ ├── context.cu
│ │ ├── context.h
│ │ ├── convert_v2.cu
│ │ ├── convert_v2.h
│ │ ├── cp_async.h
│ │ ├── cta_map.h
│ │ ├── desc.h
│ │ ├── dispatch_cache.cu
│ │ ├── dispatch_cache.h
│ │ ├── epilogue.h
│ │ ├── format.h
│ │ ├── gemm.cu
│ │ ├── gemm.h
│ │ ├── gemm_universal.h
│ │ ├── gpu_metric.cu
│ │ ├── gpu_metric.h
│ │ ├── iterator.h
│ │ ├── iterator_sm70.h
│ │ ├── iterator_sm80.h
│ │ ├── kernel.cu
│ │ ├── kernel.h
│ │ ├── kernel
│ │ │ ├── f16_u4g128_f16_tnt_sm70_s884.cu
│ │ │ ├── f16_u4g128_f16_tnt_sm75_s16816.cu
│ │ │ ├── f16_u4g128_f16_tnt_sm75_simt.cu
│ │ │ ├── f16_u4g128_f16_tnt_sm80_s16816.cu
│ │ │ ├── f16_u4g128_f16_tnt_sm90_s16816.cu
│ │ │ ├── sm70_s884_dynamic.cu
│ │ │ ├── sm75_s16816_dynamic.cu
│ │ │ ├── sm80_s16816_dynamic.cu
│ │ │ ├── sm90_s16816_dynamic.cu
│ │ │ └── u4g128_f16_f16_nnn_sm80_s16816.cu
│ │ ├── kernel_impl.h
│ │ ├── mainloop_sm70.h
│ │ ├── mainloop_sm80_v2.h
│ │ ├── matrix_ptr.h
│ │ ├── moe_utils_v2.cu
│ │ ├── moe_utils_v2.h
│ │ ├── operand.h
│ │ ├── predicate.h
│ │ ├── registry.cu
│ │ ├── registry.h
│ │ ├── simt.h
│ │ ├── smem_copy.h
│ │ ├── test
│ │ │ ├── gemm_bench.cu
│ │ │ ├── gemm_test.cu
│ │ │ ├── models.h
│ │ │ ├── quantization.cu
│ │ │ ├── quantization.h
│ │ │ ├── quantization_impl.h
│ │ │ ├── reference.cu
│ │ │ ├── reference.h
│ │ │ ├── test_moe_utils.cu
│ │ │ ├── test_utils.cu
│ │ │ ├── test_utils.h
│ │ │ └── testbed.h
│ │ ├── thread_group_map.h
│ │ ├── thread_map.h
│ │ ├── tiled_mma.h
│ │ ├── transform.h
│ │ ├── tuner
│ │ │ ├── cache_utils.cu
│ │ │ ├── cache_utils.h
│ │ │ ├── measurer.cu
│ │ │ ├── measurer.h
│ │ │ ├── params.cc
│ │ │ ├── params.h
│ │ │ ├── sampler.cu
│ │ │ ├── sampler.h
│ │ │ ├── stats.h
│ │ │ ├── stopping_criterion.cc
│ │ │ └── stopping_criterion.h
│ │ ├── types.h
│ │ ├── unpack.cu
│ │ └── utils.h
│ ├── gpt_kernels.cu
│ ├── gpt_kernels.h
│ ├── logprob_kernels.cu
│ ├── logprob_kernels.h
│ ├── norm
│ │ ├── CMakeLists.txt
│ │ ├── rms_norm.cu
│ │ └── rms_norm.h
│ ├── penalty_types.h
│ ├── reduce_kernel_utils.cuh
│ ├── sampling_kernels.cu
│ ├── sampling_kernels.h
│ ├── sampling_penalty_kernels.cu
│ ├── sampling_penalty_kernels.h
│ ├── sampling_topk_kernels.cu
│ ├── sampling_topk_kernels.h
│ ├── sampling_topp_kernels.cu
│ ├── sampling_topp_kernels.h
│ ├── stop_criteria_kernels.cu
│ ├── stop_criteria_kernels.h
│ ├── unfused_attention_kernels.cu
│ └── unfused_attention_kernels.h
│ ├── layers
│ ├── BaseDynamicDecodeLayer.h
│ ├── CMakeLists.txt
│ ├── DynamicDecodeLayer.cc
│ ├── DynamicDecodeLayer.h
│ └── sampling_layers
│ │ ├── CMakeLists.txt
│ │ ├── LogitsProcessorLayer.cc
│ │ ├── LogitsProcessorLayer.h
│ │ ├── SamplingLayer.cc
│ │ ├── SamplingLayer.h
│ │ ├── StopCriteriaLayer.cc
│ │ ├── StopCriteriaLayer.h
│ │ └── utils.h
│ ├── macro.h
│ ├── models
│ ├── CMakeLists.txt
│ └── llama
│ │ ├── Barrier.h
│ │ ├── BlockManager.cc
│ │ ├── BlockManager.h
│ │ ├── BlockTrie.cc
│ │ ├── BlockTrie.h
│ │ ├── CMakeLists.txt
│ │ ├── LlamaBatch.cc
│ │ ├── LlamaBatch.h
│ │ ├── LlamaDecoderLayerWeight.cc
│ │ ├── LlamaDecoderLayerWeight.h
│ │ ├── LlamaDenseWeight.cc
│ │ ├── LlamaDenseWeight.h
│ │ ├── LlamaFfnLayer.cc
│ │ ├── LlamaFfnLayer.h
│ │ ├── LlamaLinear.cu
│ │ ├── LlamaLinear.h
│ │ ├── LlamaV2.cc
│ │ ├── LlamaV2.h
│ │ ├── LlamaWeight.cc
│ │ ├── LlamaWeight.h
│ │ ├── SequenceManager.cc
│ │ ├── SequenceManager.h
│ │ ├── context.h
│ │ ├── copy.h
│ │ ├── llama_kernels.cu
│ │ ├── llama_kernels.h
│ │ ├── llama_params.h
│ │ ├── llama_rope.h
│ │ ├── llama_utils.cu
│ │ ├── llama_utils.h
│ │ ├── mla_utils.cu
│ │ ├── mla_utils.h
│ │ ├── moe_ffn_layer.cc
│ │ ├── moe_ffn_layer.h
│ │ ├── test_cache_manager.cc
│ │ ├── unified_attention_layer.cc
│ │ ├── unified_attention_layer.h
│ │ ├── unified_decoder.cc
│ │ └── unified_decoder.h
│ ├── python
│ ├── CMakeLists.txt
│ ├── bind.cpp
│ └── dlpack.h
│ ├── triton_backend
│ ├── CMakeLists.txt
│ └── llama
│ │ ├── CMakeLists.txt
│ │ ├── LlamaTritonModel.cc
│ │ └── LlamaTritonModel.h
│ └── utils
│ ├── CMakeLists.txt
│ ├── anomaly_handler.cu
│ ├── anomaly_handler.h
│ ├── constant.h
│ ├── cuda_bf16_fallbacks.cuh
│ ├── cuda_bf16_wrapper.h
│ ├── cuda_type_utils.cuh
│ ├── cuda_utils.cc
│ ├── cuda_utils.h
│ ├── debug_utils.h
│ ├── dispatch.h
│ ├── logger.cc
│ ├── logger.h
│ ├── memory_utils.cu
│ ├── memory_utils.h
│ ├── monotonic.h
│ ├── nvtx_utils.cc
│ ├── nvtx_utils.h
│ ├── parser.cc
│ ├── parser.h
│ ├── string_utils.h
│ └── test_utils.h
└── tests
├── csrc
├── CMakeLists.txt
└── unittests
│ ├── CMakeLists.txt
│ ├── gtest_utils.h
│ ├── test_logprob_kernels.cu
│ ├── test_penalty_kernels.cu
│ ├── test_sampling_kernels.cu
│ ├── test_sampling_layer.cu
│ └── unittest_utils.h
├── pytorch
├── engine
│ ├── test_logits_process.py
│ └── test_request.py
├── kernel
│ ├── test_activation.py
│ ├── test_apply_rotary.py
│ ├── test_fill_kv_cache.py
│ ├── test_flash_attention.py
│ ├── test_flatten_kv_cache.py
│ ├── test_fuse_moe_blocked_fp8.py
│ ├── test_fused_lora.py
│ ├── test_fused_moe.py
│ ├── test_fused_rotary_emb.py
│ ├── test_gemm_fp8.py
│ ├── test_multinomial_sampling.py
│ ├── test_paged_attention.py
│ └── test_rms_norm.py
├── paging
│ ├── test_block_manager.py
│ ├── test_block_trie.py
│ └── test_scheduler.py
└── tools
│ ├── test_layout_convert.py
│ └── test_make_inputs.py
└── test_lmdeploy
├── test_async_engine.py
├── test_auto_backend.py
├── test_lite
└── test_quantization
│ └── test_utils
│ └── test_cal_qparams.py
├── test_messages.py
├── test_model.py
├── test_tokenizer.py
├── test_turbomind
└── test_converter.py
├── test_utils.py
└── test_vl
└── test_vl_encode.py
/.github/ISSUE_TEMPLATE/2-feature-request.yml:
--------------------------------------------------------------------------------
1 | name: 🚀 Feature request
2 | description: Suggest an idea for this project
3 | title: "[Feature] "
4 |
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 |       We would strongly appreciate it if you created a PR to implement this feature [here](https://github.com/InternLM/lmdeploy/pulls)!
10 | If you need our help, please fill in as much of the following form as you're able to.
11 |
12 | **The less clear the description, the longer it will take to solve it.**
13 | - type: textarea
14 | attributes:
15 | label: Motivation
16 | description: |
17 | A clear and concise description of the motivation of the feature.
18 | Ex1. It is inconvenient when \[....\].
19 | validations:
20 | required: true
21 | - type: textarea
22 | attributes:
23 | label: Related resources
24 | description: |
25 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
26 | - type: textarea
27 | attributes:
28 | label: Additional context
29 | description: |
30 | Add any other context or screenshots about the feature request here.
31 | If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.
32 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to the documentation.
3 | labels: "kind/doc,status/unconfirmed"
4 | title: "[Docs] "
5 |
6 | body:
7 | - type: textarea
8 | attributes:
9 | label: 📚 The doc issue
10 | description: >
11 |         A clear and concise description of the issue.
12 | validations:
13 | required: true
14 |
15 | - type: textarea
16 | attributes:
17 | label: Suggest a potential alternative/fix
18 | description: >
19 | Tell us how we could improve the documentation in this regard.
20 | - type: markdown
21 | attributes:
22 | value: >
23 | Thanks for contributing 🎉!
24 |
--------------------------------------------------------------------------------
/.github/md-link-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "ignorePatterns": [
3 | {
4 | "pattern": "^https://www.reddit.com/"
5 | },
6 | {
7 | "pattern": "^https://developer.nvidia.com/"
8 | },
9 | {
10 | "pattern": "^https://docs.openvino.ai/"
11 | },
12 | {
13 | "pattern": "^https://developer.android.com/"
14 | },
15 | {
16 | "pattern": "^https://developer.qualcomm.com/"
17 | },
18 | {
19 | "pattern": "^http://localhost"
20 | },
21 | {
22 | "pattern": "^https://twitter.com"
23 | },
24 | {
25 | "pattern": "^https://platform.openai.com"
26 | },
27 | {
28 | "pattern": "^http://0.0.0.0"
29 | }
30 | ],
31 | "httpHeaders": [
32 | {
33 | "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"],
34 | "headers": {
35 | "Accept-Encoding": "zstd, br, gzip, deflate"
36 | }
37 | }
38 | ],
39 | "timeout": "20s",
40 | "retryOn429": true,
41 | "retryCount": 5,
42 | "fallbackRetryDelay": "30s",
43 | "aliveStatusCodes": [200, 206, 429]
44 | }
45 |
--------------------------------------------------------------------------------
/.github/release.yml:
--------------------------------------------------------------------------------
1 | changelog:
2 | categories:
3 | - title: 🚀 Features
4 | labels:
5 | - feature
6 | - enhancement
7 | - title: 💥 Improvements
8 | labels:
9 | - improvement
10 | - title: 🐞 Bug fixes
11 | labels:
12 | - bug
13 | - Bug:P0
14 | - Bug:P1
15 | - Bug:P2
16 | - Bug:P3
17 | - title: 📚 Documentations
18 | labels:
19 | - documentation
20 | - title: 🌐 Other
21 | labels:
22 | - '*'
23 | exclude:
24 | labels:
25 | - feature
26 | - enhancement
27 | - improvement
28 | - bug
29 | - documentation
30 | - Bug:P0
31 | - Bug:P1
32 | - Bug:P2
33 | - Bug:P3
34 |
--------------------------------------------------------------------------------
/.github/scripts/check_lmdeploy.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) MegFlow. All rights reserved.
2 | import glob
3 | import os
4 |
5 | import fire
6 |
7 |
8 | def check_module_init(root: str):
9 |     """Check that every package directory under root has an __init__.py file."""
10 | all_files = glob.glob(os.path.join(root, '**/*'), recursive=True)
11 | not_exist = []
12 | for d in all_files:
13 | if not os.path.isdir(d):
14 | continue
15 | if '__pycache__' in d:
16 | continue
17 | elif d.startswith('lmdeploy/bin'):
18 | continue
19 | elif d.startswith('lmdeploy/lib'):
20 | continue
21 | elif d.startswith('lmdeploy/serve/turbomind/triton_models'):
22 | continue
23 | elif d.startswith('lmdeploy/serve/turbomind/triton_python_backend'):
24 | continue
25 | init_file = os.path.join(d, '__init__.py')
26 | if not os.path.exists(init_file):
27 | not_exist.append(init_file)
28 |
29 | assert len(not_exist) == 0, f'Missing files: {not_exist}'
30 |
31 |
32 | if __name__ == '__main__':
33 | fire.Fire()
34 |
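
This helper is what the CI calls to make sure every Python package ships its `__init__.py`. Below is a minimal in-process sketch of how it can be driven, assuming the repository root as the working directory (the command-line form via `fire` is shown in the comment).

```python
# Hypothetical in-process usage; CI normally runs it via fire on the command line,
# e.g. `python .github/scripts/check_lmdeploy.py check_module_init lmdeploy`.
import sys

sys.path.insert(0, '.github/scripts')  # assumes the repository root as the working directory
from check_lmdeploy import check_module_init

# Raises AssertionError listing every package directory that lacks __init__.py.
check_module_init('lmdeploy')
```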
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .vscode/
6 | .idea/
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | triton-rerope/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | *build*/
46 | !builder/
47 | lmdeploy/lib/
48 | lmdeploy/bin/
49 | dist/
50 | examples/cpp/llama/*.csv
51 | *.npy
52 | *.weight
53 | install/
54 |
55 | # LMDeploy
56 | workspace/
57 | work_dir*/
58 |
59 | # Huggingface
60 | *.bin
61 | *config.json
62 | *generate_config.json
63 | !lmdeploy/turbomind/hf_repo/config.json
64 |
65 | # Pytorch
66 | *.pt
67 | *.pth
68 | *.py~
69 | *.sh~
70 | *.pyc
71 | **/src/pytorch-sphinx-theme/
72 |
73 | # Outputs and logs
74 | *.txt
75 | *.log
76 | *.out
77 | *.csv
78 | !start_ids.csv
79 | *.pkl
80 |
81 | !CMakeLists.txt
82 | proxy_config.yml
83 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 |
2 | include lmdeploy/lib/*.so
3 | include lmdeploy/lib/*.so*
4 | include lmdeploy/lib/*.dll
5 | include lmdeploy/lib/*.pyd
6 | include lmdeploy/bin/*
7 |
--------------------------------------------------------------------------------
/autotest/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 | import yaml
5 |
6 | cli_prompt_case_file = 'autotest/chat_prompt_case.yaml'
7 | common_prompt_case_file = 'autotest/prompt_case.yaml'
8 | config_file = 'autotest/config.yaml'
9 |
10 |
11 | @pytest.fixture(scope='session')
12 | def config():
13 | config_path = os.path.join(config_file)
14 | with open(config_path) as f:
15 | env_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
16 | return env_config
17 |
18 |
19 | @pytest.fixture(scope='session')
20 | def cli_case_config():
21 | case_path = os.path.join(cli_prompt_case_file)
22 | with open(case_path) as f:
23 | case_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
24 | return case_config
25 |
26 |
27 | @pytest.fixture(scope='class', autouse=True)
28 | def common_case_config():
29 | case_path = os.path.join(common_prompt_case_file)
30 | with open(case_path) as f:
31 | case_config = yaml.load(f.read(), Loader=yaml.SafeLoader)
32 | return case_config
33 |
34 |
35 | def pytest_addoption(parser):
36 | parser.addoption('--run_id', action='store', default='', help='github run_id')
37 |
38 |
39 | @pytest.fixture(scope='session')
40 | def run_id(request):
41 | return request.config.getoption('--run_id')
42 |
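
The fixtures above hand every test the parsed `config.yaml` contents and the `--run_id` option. A minimal sketch of a hypothetical test consuming them follows; the test name is illustrative, and the `model_path` key is assumed only because other tests in the suite read it.

```python
# Hypothetical test module under autotest/: pytest injects the conftest fixtures by name.
def test_environment_config(config, run_id):
    # 'model_path' is assumed here only because other tests in the suite read this key.
    assert 'model_path' in config
    print(f"run_id={run_id!r}, model_path={config['model_path']}")
```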
--------------------------------------------------------------------------------
/autotest/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files = test*_*.py # test file
3 | python_classes = Test* # test class
4 | python_functions = test_* # test function
5 | pytest_runtest_call.tryfirst = True
6 | filterwarnings = ignore::UserWarning
7 | reruns = 2
8 | reruns_delay = 1
9 |
--------------------------------------------------------------------------------
/autotest/template.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_name": "base",
3 | "capability": "completion"
4 | }
5 |
--------------------------------------------------------------------------------
/autotest/toolchain/test_lagent.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | @pytest.mark.order(10)
5 | @pytest.mark.lagent
6 | @pytest.mark.flaky(reruns=2)
7 | @pytest.mark.parametrize('model', ['internlm/internlm2_5-7b-chat'])
8 | def test_repeat(config, model):
9 | from lagent.llms import INTERNLM2_META, LMDeployPipeline
10 |
11 | model = LMDeployPipeline(
12 | path='/'.join([config.get('model_path'), model]),
13 | meta_template=INTERNLM2_META,
14 | tp=1,
15 | top_k=40,
16 | top_p=0.8,
17 | temperature=1.2,
18 | stop_words=['<|im_end|>'],
19 | max_new_tokens=4096,
20 | )
21 | response_list = []
22 | for i in range(3):
23 | print(f'run_{i}:')
24 | response = model.chat([{
25 | 'role':
26 | 'user',
27 | 'content':
28 | '已知$$z_{1}=1$$,$$z_{2}=\\text{i}$$,$$z_{3}=-1$$,$$z_{4}=-\\text{i}$$,顺次连结它们所表示的点,则所得图形围成的面积为( )\nA. $$\\dfrac{1}{4}$$\n B. $$\\dfrac{1}{2}$$\n C. $$1$$\n D. $$2$$\n\n' # noqa: F401, E501
29 | }])
30 | print(response)
31 | response_list.append(response)
32 | assert len(response) > 10
33 | assert response_list[0] != response_list[1] and response_list[1] != response_list[2]
34 |
--------------------------------------------------------------------------------
/autotest/tools/quantization/test_quantization_w8a8.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import allure
4 | import pytest
5 | from utils.config_utils import get_cuda_prefix_by_workerid, get_quantization_model_list
6 | from utils.quantization_utils import quantization
7 |
8 |
9 | @pytest.mark.order(2)
10 | @pytest.mark.quantization_w8a8
11 | @pytest.mark.timeout(900)
12 | @pytest.mark.parametrize('model', get_quantization_model_list('w8a8'))
13 | def test_quantization_w8a8(config, model, worker_id):
14 | quantization_w8a8(config, model + '-inner-w8a8', model, get_cuda_prefix_by_workerid(worker_id))
15 |
16 |
17 | def quantization_w8a8(config, quantization_model_name, origin_model_name, cuda_prefix):
18 | quantization_type = 'w8a8'
19 | result, msg = quantization(config, quantization_model_name, origin_model_name, quantization_type, cuda_prefix)
20 | log_path = config.get('log_path')
21 | quantization_log = os.path.join(
22 | log_path, '_'.join(['quantization', quantization_type,
23 | quantization_model_name.split('/')[1]]) + '.log')
24 |
25 | allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT)
26 | assert result, msg
27 |
--------------------------------------------------------------------------------
/autotest/utils/mp_log_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import allure
4 | from pytest_assume.plugin import assume
5 |
6 |
7 | def write_log(config, result, msg, is_new: bool = True, case_path_tag: str = 'default'):
8 | try:
9 | log_path = os.path.join(config.get('log_path'), case_path_tag)
10 |
11 | if is_new:
12 | file = open(log_path, 'w')
13 | else:
14 | file = open(log_path, 'a')
15 |
16 | file.writelines('result:' + result + ', reason:' + msg + '\n')
17 | file.close()
18 | except Exception as e:
19 | return False, None, f'Unknown error: {e}'
20 |
21 |
22 | def assert_log(config, case_path_tag: str = 'default'):
23 | log_path = os.path.join(config.get('log_path'), case_path_tag)
24 |
25 | with open(log_path, 'r') as f:
26 | lines = f.readlines()
27 | 
28 |     # Initialize before the scan so an empty or unexpected log cannot raise UnboundLocalError.
29 |     result, msg = None, 'no result line found in ' + log_path
30 |     for line in lines:
31 |         if 'result:False, reason:' in line:
32 |             result = False
33 |             msg = line
34 |             break
35 |         if 'result:True, reason:' in line and result is None:
36 |             result = True
37 | 
38 |     allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT)
39 |     with assume:
40 |         assert result, msg
41 | 
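
Together, `write_log` and `assert_log` let multi-process cases record pass/fail lines in a per-case file that the main test later evaluates and attaches to the Allure report. A rough usage sketch follows; the case tag is made up, and the `config` fixture is assumed to provide an existing `log_path` directory.

```python
# Hypothetical test: sequential stages (or worker processes) record results,
# then the main test evaluates the accumulated log.
from utils.mp_log_utils import assert_log, write_log


def test_logged_results(config):
    tag = 'pipeline_case_demo'  # made-up case tag; it becomes the log file name
    write_log(config, 'True', 'first stage ok', is_new=True, case_path_tag=tag)
    write_log(config, 'True', 'second stage ok', is_new=False, case_path_tag=tag)
    # Fails (via pytest-assume) if any recorded line contains 'result:False, reason:'.
    assert_log(config, case_path_tag=tag)
```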
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 |
3 | We provide several profiling tools to benchmark our models.
4 |
5 | ## profile with dataset
6 |
7 | Download the dataset below or create your own dataset.
8 |
9 | ```bash
10 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
11 | ```
12 |
13 | Profile your model with `profile_throughput.py`:
14 |
15 | ```bash
16 | python profile_throughput.py \
17 | ShareGPT_V3_unfiltered_cleaned_split.json \
18 | /path/to/your/model \
19 | --concurrency 64
20 | ```
21 |
22 | ## profile without dataset
23 |
24 | `profile_generation.py` benchmarks generation with dummy data. Install `nvidia-ml-py` before running it:
25 |
26 | ```shell
27 | pip install nvidia-ml-py
28 | ```
29 |
30 | ```bash
31 | python profile_generation.py \
32 | /path/to/your/model \
33 | --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
34 | ```
35 |
36 | ## profile restful api
37 |
38 | `profile_restful_api.py` is used to benchmark the api server.
39 |
40 | ```bash
41 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
42 |
43 | python3 profile_restful_api.py --backend lmdeploy --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json
44 | ```
45 |
--------------------------------------------------------------------------------
/benchmark/lmdeploy.yml:
--------------------------------------------------------------------------------
1 | num_promts: &num_prompts 1
2 | dataset_path: &dataset_path "/nvme1/shared/ShareGPT_V3_unfiltered_cleaned_split.json"
3 | dataset_name: &dataset_name "sharegpt"
4 | server:
5 | - tp: 2
6 | "model_path": "Qwen/Qwen2.5-32B-Instruct"
7 | "max-batch-size": 1024
8 | "cache-max-entry-count": 0.8
9 | - tp: 4
10 | "model_path": "Qwen/Qwen2.5-32B-Instruct"
11 | "max-batch-size": 1024
12 | "cache-max-entry-count": 0.8
13 | data:
14 | - "dataset-name": "sharegpt"
15 | "dataset-path": *dataset_path
16 | "num-prompts": *num_prompts
17 | - "dataset-name": *dataset_name
18 | "dataset-path": *dataset_path
19 | "sharegpt-output-len": 2048
20 | "num-prompts": *num_prompts
21 | - "dataset-name": *dataset_name
22 | "dataset-path": *dataset_path
23 | "sharegpt-output-len": 4096
24 | "num-prompts": *num_prompts
25 | - "dataset-name": *dataset_name
26 | "dataset-path": *dataset_path
27 | "sharegpt-output-len": 8192
28 | "num-prompts": *num_prompts
29 | - "dataset-name": *dataset_name
30 | "dataset-path": *dataset_path
31 | "sharegpt-output-len": 16384
32 | "num-prompts": *num_prompts
33 | - "dataset-name": *dataset_name
34 | "dataset-path": *dataset_path
35 | "sharegpt-output-len": 32768
36 | "num-prompts": *num_prompts
37 |
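
The `&name`/`*name` pairs in this file are ordinary YAML anchors and aliases, so each entry under `data` reuses the shared dataset path and prompt count while only `sharegpt-output-len` varies. The sketch below shows how a loader resolves them, using a trimmed inline copy of the pattern rather than the real file.

```python
import yaml

# A trimmed, inline copy of the anchor/alias pattern used in benchmark/lmdeploy.yml.
snippet = """
num_promts: &num_prompts 1
dataset_path: &dataset_path "/nvme1/shared/ShareGPT_V3_unfiltered_cleaned_split.json"
data:
  - "dataset-name": "sharegpt"
    "dataset-path": *dataset_path
    "num-prompts": *num_prompts
  - "dataset-name": "sharegpt"
    "dataset-path": *dataset_path
    "sharegpt-output-len": 2048
    "num-prompts": *num_prompts
"""

cfg = yaml.safe_load(snippet)
# Aliases are expanded on load: both entries carry the shared path and prompt count.
assert cfg['data'][1]['dataset-path'] == cfg['dataset_path']
assert cfg['data'][1]['num-prompts'] == 1
```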
--------------------------------------------------------------------------------
/builder/manywheel/README.md:
--------------------------------------------------------------------------------
1 | # Build lmdeploy manylinux wheel
2 |
3 | ## Prepare docker image
4 |
5 | To build all docker images, you can use the convenience script:
6 |
7 | ```bash
8 | ./build_all_docker.sh
9 | # Build with pushing
10 | WITH_PUSH=true ./build_all_docker.sh
11 | ```
12 |
13 | To build a docker image with a specific CUDA version or manylinux version, you may use:
14 |
15 | ```bash
16 | MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION=11.8 ./build_docker.sh
17 | ```
18 |
19 | ## Build lmdeploy wheel
20 |
21 | ```bash
22 | ./build_all_wheel.sh
23 | ```
24 |
--------------------------------------------------------------------------------
/builder/manywheel/build_all_docker.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eou pipefail
4 |
5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
6 |
7 | for cuda_version in 11.8; do
8 | MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/manywheel/build_docker.sh"
9 | done
10 |
--------------------------------------------------------------------------------
/builder/manywheel/build_all_wheel.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eou pipefail
4 |
5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
6 |
7 | CUDA_VER=${CUDA_VER:-11.8}
8 |
9 | PLAT_NAME=manylinux2014_x86_64
10 | for cuver in ${CUDA_VER}; do
11 | DOCKER_TAG=cuda${cuver}
12 | OUTPUT_FOLDER=cuda${cuver}_dist
13 | for pyver in py38 py39 py310 py311 py312; do
14 | bash ${TOPDIR}/manywheel/build_wheel.sh ${pyver} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} \
15 | |& tee ${PLAT_NAME}.${pyver}.cuda${cuver}.log.txt
16 | done
17 | done
18 |
--------------------------------------------------------------------------------
/builder/manywheel/build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eou pipefail
4 |
5 | TOPDIR=$(git rev-parse --show-toplevel)/builder
6 | GPU_ARCH_VERSION=${GPU_ARCH_VERSION}
7 | WITH_PUSH=${WITH_PUSH:-}
8 |
9 | TARGET=cuda_final
10 | DOCKER_TAG=cuda${GPU_ARCH_VERSION}
11 | DOCKER_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
12 | DOCKER_TAG=cuda${GPU_ARCH_VERSION}
13 |
14 | DOCKER_IMAGE=openmmlab/lmdeploy-builder:${DOCKER_TAG}
15 | if [[ -n ${MANY_LINUX_VERSION} ]]; then
16 | DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
17 | else
18 | DOCKERFILE_SUFFIX=''
19 | fi
20 |
21 | (
22 | set -x
23 | DOCKER_BUILDKIT=1 docker build \
24 | -t "${DOCKER_IMAGE}" \
25 | ${DOCKER_BUILD_ARG} \
26 | --target "${TARGET}" \
27 | -f "${TOPDIR}/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
28 | "${TOPDIR}"
29 | )
30 |
31 | if [[ "${WITH_PUSH}" == true ]]; then
32 | (
33 | set -x
34 | docker push "${DOCKER_IMAGE}"
35 | )
36 | fi
37 |
--------------------------------------------------------------------------------
/builder/manywheel/build_wheel.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eux
3 |
4 | PYTHON_VERSION="$1"
5 | PLAT_NAME="$2"
6 | DOCKER_TAG="$3"
7 | OUTPUT_DIR="$4"
8 |
9 | DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}"
10 | export USERID=$(id -u)
11 | export GROUPID=$(id -g)
12 |
13 | cd "$(dirname "$0")" # move inside the script directory
14 | mkdir -p "${OUTPUT_DIR}"
15 | docker pull ${DOCKER_IMAGE}
16 | docker run --rm -it \
17 | --env PYTHON_VERSION="${PYTHON_VERSION}" \
18 | --env PLAT_NAME="${PLAT_NAME}" \
19 | --env USERID="${USERID}" \
20 | --env GROUPID="${GROUPID}" \
21 | --volume "$(pwd)/../../:/lmdeploy" \
22 | --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \
23 | --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \
24 | --entrypoint /entrypoint_build.sh \
25 | ${DOCKER_IMAGE}
26 |
--------------------------------------------------------------------------------
/builder/manywheel/entrypoint_build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eux
3 |
4 | export PYTHON_VERSION=$PYTHON_VERSION
5 | export PLAT_NAME=$PLAT_NAME
6 | export USERID=${USERID}
7 | export GROUPID=${GROUPID}
8 | export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
9 | export NCCL_INCLUDE_DIR=/usr/local/cuda/include
10 | export NCCL_LIB_DIR=/usr/local/cuda/lib64
11 |
12 | source /opt/conda/bin/activate
13 | conda activate $PYTHON_VERSION
14 |
15 | cd lmdeploy
16 | rm -rf lmdeploy/lib
17 | mkdir -p build && cd build && rm -rf *
18 | bash ../generate.sh make
19 | make -j$(nproc) && make install
20 | if [ $? != 0 ]; then
21 | echo "build failed"
22 | exit 1
23 | fi
24 | cd ..
25 | rm -rf build
26 | python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/
27 | chown ${USERID}:${GROUPID} /tmpbuild/*
28 | mv /tmpbuild/* /lmdeploy_build/
29 |
--------------------------------------------------------------------------------
/builder/manywheel/scripts/install_conda.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
6 | chmod +x Miniconda3-latest-Linux-x86_64.sh
7 | bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda
8 | rm Miniconda3-latest-Linux-x86_64.sh
9 |
--------------------------------------------------------------------------------
/builder/manywheel/scripts/install_openmpi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -ex
4 |
5 | wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz
6 | tar xf openmpi-4.1.5.tar.gz
7 | cd openmpi-4.1.5
8 | ./configure --prefix=/usr/local/mpi
9 | make -j$(nproc)
10 | make install
11 |
--------------------------------------------------------------------------------
/builder/windows/README.md:
--------------------------------------------------------------------------------
1 | # Build lmdeploy on windows
2 |
3 | ## Requirements
4 |
5 | - [CMake 3.17+](https://github.com/Kitware/CMake/releases)
6 | - [Visual Studio 2019+](https://visualstudio.microsoft.com/downloads/)
7 | - [CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive)
8 |
9 | ## Build lmdeploy wheel
10 |
11 | ```powershell
12 | mkdir build
13 | cd build
14 | ..\builder\windows\generate.ps1
15 | cmake --build . --config Release -- /m
16 | cmake --install . --config Release
17 | cd ..
18 | rm build -Force -Recurse
19 | python setup.py bdist_wheel -d build\wheel
20 | ```
21 |
--------------------------------------------------------------------------------
/builder/windows/generate.ps1:
--------------------------------------------------------------------------------
1 | cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" `
2 | -DCMAKE_BUILD_TYPE=Release `
3 | -DCMAKE_INSTALL_PREFIX=install `
4 | -DBUILD_PY_FFI=ON `
5 | -DBUILD_MULTI_GPU=OFF `
6 | -DUSE_NVTX=OFF `
7 | -DBUILD_TEST="$env:BUILD_TEST"
8 |
--------------------------------------------------------------------------------
/debug.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | builder="-G Ninja"
4 |
5 | if [ "$1" = "make" ]; then
6 | builder=""
7 | fi
8 |
9 | cmake ${builder} .. \
10 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \
11 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
12 | -DCMAKE_INSTALL_PREFIX=./install \
13 | -DBUILD_PY_FFI=ON \
14 | -DBUILD_MULTI_GPU=ON \
15 | -DCMAKE_CUDA_FLAGS="-lineinfo" \
16 | -DUSE_NVTX=ON \
17 | -DPYTHON_EXECUTABLE=$(which python3) \
18 | -DBUILD_TEST=ON
19 |
--------------------------------------------------------------------------------
/docker/InternVL_Dockerfile:
--------------------------------------------------------------------------------
1 | ARG CUDA_VERSION=cu12
2 |
3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12
4 | ENV CUDA_VERSION_SHORT=cu123
5 |
6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11
7 | ENV CUDA_VERSION_SHORT=cu118
8 |
9 | FROM ${CUDA_VERSION} AS final
10 |
11 | RUN python3 -m pip install timm
12 |
13 | RUN python3 -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+${CUDA_VERSION_SHORT}torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
14 |
--------------------------------------------------------------------------------
/docker/Qwen2VL_Dockerfile:
--------------------------------------------------------------------------------
1 | ARG CUDA_VERSION=cu12
2 |
3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12
4 | ENV CUDA_VERSION_SHORT=cu123
5 |
6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11
7 | ENV CUDA_VERSION_SHORT=cu118
8 |
9 | FROM ${CUDA_VERSION} AS final
10 |
11 | # we use transformers to load the vision part of qwen2_vl, which needs transformers > v4.44.2
12 | RUN python3 -m pip install git+https://github.com/huggingface/transformers.git
13 |
14 | RUN python3 -m pip install qwen_vl_utils
15 |
--------------------------------------------------------------------------------
/docs/en/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | formats: all
4 |
5 | build:
6 | os: "ubuntu-22.04"
7 | tools:
8 | python: "3.10"
9 |
10 |
11 | sphinx:
12 | configuration: docs/en/conf.py
13 |
14 |
15 | python:
16 | install:
17 | - requirements: requirements/docs.txt
18 | - requirements: requirements/readthedocs.txt
19 |
--------------------------------------------------------------------------------
/docs/en/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/en/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
1 | table.autosummary td {
2 | width: 50%
3 | }
4 |
5 | img.align-center {
6 | display: block;
7 | margin-left: auto;
8 | margin-right: auto;
9 | }
10 |
--------------------------------------------------------------------------------
/docs/en/api/pipeline.rst:
--------------------------------------------------------------------------------
1 | inference pipeline
2 | ==================
3 | .. currentmodule:: lmdeploy
4 |
5 | pipeline
6 | --------
7 | .. autofunction:: pipeline
8 |
9 | serving
10 | --------
11 | .. autofunction:: serve
12 | .. autofunction:: client
13 |
14 |
15 | PytorchEngineConfig
16 | -------------------
17 | .. autoclass:: PytorchEngineConfig
18 |
19 |
20 | TurbomindEngineConfig
21 | ---------------------
22 | .. autoclass:: TurbomindEngineConfig
23 |
24 |
25 | GenerationConfig
26 | ----------------
27 | .. autoclass:: GenerationConfig
28 |
29 |
30 | ChatTemplateConfig
31 | ------------------
32 | .. autoclass:: ChatTemplateConfig
33 |
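
The entries above are rendered from the docstrings of the listed objects. For orientation, here is a minimal, hedged example of the documented `pipeline` entry point; the model name and sampling values are placeholders, not recommendations.

```python
# A minimal sketch of the objects documented on this page; values are placeholders.
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat',          # any local or hub model path
                backend_config=TurbomindEngineConfig(tp=1))

responses = pipe(['Hi, please introduce yourself'],
                 gen_config=GenerationConfig(top_p=0.8, temperature=0.7))
print(responses[0].text)
```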
--------------------------------------------------------------------------------
/docs/en/get_started/index.rst:
--------------------------------------------------------------------------------
1 | On Other Platforms
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 1
6 | :caption: NPU(Huawei)
7 |
8 | ascend/get_started.md
9 |
--------------------------------------------------------------------------------
/docs/en/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/en/multi_modal/index.rst:
--------------------------------------------------------------------------------
1 | Vision-Language Models
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 | :caption: Examples
7 |
8 | deepseek_vl2.md
9 | llava.md
10 | internvl.md
11 | xcomposer2d5.md
12 | cogvlm.md
13 | minicpmv.md
14 | phi3.md
15 | mllama.md
16 | qwen2_vl.md
17 | qwen2_5_vl.md
18 | molmo.md
19 | gemma3.md
20 |
--------------------------------------------------------------------------------
/docs/zh_cn/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | formats: all
4 |
5 | build:
6 | os: "ubuntu-22.04"
7 | tools:
8 | python: "3.10"
9 |
10 |
11 | sphinx:
12 | configuration: docs/zh_cn/conf.py
13 |
14 |
15 | python:
16 | install:
17 | - requirements: requirements/docs.txt
18 | - requirements: requirements/readthedocs.txt
19 |
--------------------------------------------------------------------------------
/docs/zh_cn/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/zh_cn/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
1 | table.autosummary td {
2 | width: 50%
3 | }
4 |
5 | img.align-center {
6 | display: block;
7 | margin-left: auto;
8 | margin-right: auto;
9 | }
10 |
--------------------------------------------------------------------------------
/docs/zh_cn/api/pipeline.rst:
--------------------------------------------------------------------------------
1 | Inference Pipeline
2 | ==================
3 | .. currentmodule:: lmdeploy
4 |
5 | pipeline
6 | --------
7 | .. autofunction:: pipeline
8 |
9 | serving
10 | --------
11 | .. autofunction:: serve
12 | .. autofunction:: client
13 |
14 |
15 | PytorchEngineConfig
16 | -------------------
17 | .. autoclass:: PytorchEngineConfig
18 |
19 |
20 | TurbomindEngineConfig
21 | ---------------------
22 | .. autoclass:: TurbomindEngineConfig
23 |
24 |
25 | GenerationConfig
26 | ----------------
27 | .. autoclass:: GenerationConfig
28 |
29 |
30 | ChatTemplateConfig
31 | ------------------
32 | .. autoclass:: ChatTemplateConfig
33 |
--------------------------------------------------------------------------------
/docs/zh_cn/get_started/index.rst:
--------------------------------------------------------------------------------
1 | On Other Platforms
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 1
6 | :caption: NPU(Huawei)
7 |
8 | ascend/get_started.md
9 |
--------------------------------------------------------------------------------
/docs/zh_cn/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/zh_cn/multi_modal/cogvlm.md:
--------------------------------------------------------------------------------
1 | # CogVLM
2 |
3 | ## Introduction
4 |
5 | CogVLM is a powerful open-source visual language model (VLM). LMDeploy supports the CogVLM-17B model [THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf) and CogVLM2-19B models such as [THUDM/cogvlm2-llama3-chat-19B](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) with the PyTorch backend.
6 |
7 | ## Quick Start
8 |
9 | Please install LMDeploy by following the [installation guide](../get_started/installation.md).
10 |
11 | ### Preparation
12 |
13 | When deploying a **CogVLM** model with LMDeploy, the model needs to be downloaded to a local directory first. Because **CogVLM** uses an external tokenizer, the related tokenizer files must also be downloaded into the model directory. For **CogVLM2** models, this step can be skipped.
14 |
15 | Taking the **CogVLM** model `cogvlm-chat-hf` as an example, you can download it with the following script:
16 |
17 | ```shell
18 | huggingface-cli download THUDM/cogvlm-chat-hf --local-dir ./cogvlm-chat-hf --local-dir-use-symlinks False
19 | huggingface-cli download lmsys/vicuna-7b-v1.5 special_tokens_map.json tokenizer.model tokenizer_config.json --local-dir ./cogvlm-chat-hf --local-dir-use-symlinks False
20 | ```
21 |
22 | ### Offline Inference Pipeline
23 |
24 | The following is an example of offline inference with the pipeline. For more usage, refer to the [VLM Offline Inference Pipeline](./vl_pipeline.md).
25 |
26 | ```python
27 | from lmdeploy import pipeline
28 | from lmdeploy.vl import load_image
29 |
30 |
31 | if __name__ == "__main__":
32 | pipe = pipeline('cogvlm-chat-hf')
33 |
34 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
35 | response = pipe(('describe this image', image))
36 | print(response)
37 | ```
38 |
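39 | Since **CogVLM** is served through the PyTorch backend, a `PytorchEngineConfig` can also be passed explicitly. The snippet below is a minimal sketch; the `session_len` value is illustrative only:
40 |
41 | ```python
42 | from lmdeploy import PytorchEngineConfig, pipeline
43 | from lmdeploy.vl import load_image
44 |
45 |
46 | if __name__ == "__main__":
47 |     # explicitly select the PyTorch engine; session_len is an illustrative value
48 |     pipe = pipeline('cogvlm-chat-hf', backend_config=PytorchEngineConfig(session_len=4096))
49 |
50 |     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
51 |     response = pipe(('describe this image', image))
52 |     print(response)
53 | ```
54 |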
--------------------------------------------------------------------------------
/docs/zh_cn/multi_modal/gemma3.md:
--------------------------------------------------------------------------------
1 | # Gemma3
2 |
3 | ## Introduction
4 |
5 | Gemma is Google's family of lightweight, state-of-the-art open models, built from the same research and technology used to create the Gemini models. Gemma3 models are multimodal: they handle text and image input and generate text output, with open weights for both the pre-trained and instruction-tuned variants. Gemma3 offers a large 128K context window, supports over 140 languages, and comes in more sizes than previous versions. Gemma3 models are well suited to a variety of text generation and image understanding tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in resource-limited environments such as laptops, desktops, or your own cloud infrastructure, making state-of-the-art AI models accessible to everyone and helping foster innovation.
6 |
7 | ## Quick Start
8 |
9 | Please install LMDeploy by following the [installation guide](../get_started/installation.md).
10 |
11 | ### Preparation
12 |
13 | When deploying the **Gemma3** model with LMDeploy, please install the latest transformers.
14 |
15 | ### Offline Inference Pipeline
16 |
17 | The following is an example of offline inference with the pipeline. For more usage, refer to the [VLM Offline Inference Pipeline](./vl_pipeline.md).
18 |
19 | ```python
20 | from lmdeploy import pipeline
21 | from lmdeploy.vl import load_image
22 |
23 |
24 | if __name__ == "__main__":
25 | pipe = pipeline('google/gemma-3-12b-it')
26 |
27 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
28 | response = pipe(('describe this image', image))
29 | print(response)
30 | ```
31 |
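32 | To control decoding, a `GenerationConfig` can be passed along with the prompt. The snippet below is a minimal sketch; the sampling values are illustrative only:
33 |
34 | ```python
35 | from lmdeploy import GenerationConfig, pipeline
36 | from lmdeploy.vl import load_image
37 |
38 |
39 | if __name__ == "__main__":
40 |     pipe = pipeline('google/gemma-3-12b-it')
41 |
42 |     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
43 |     # limit response length and use nucleus sampling (illustrative values)
44 |     gen_config = GenerationConfig(max_new_tokens=512, top_p=0.9)
45 |     response = pipe(('describe this image', image), gen_config=gen_config)
46 |     print(response)
47 | ```
48 |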
--------------------------------------------------------------------------------
/docs/zh_cn/multi_modal/index.rst:
--------------------------------------------------------------------------------
1 | Vision-Language Models
2 | =================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |    :caption: Examples
7 |
8 | deepseek_vl2.md
9 | llava.md
10 | internvl.md
11 | xcomposer2d5.md
12 | cogvlm.md
13 | minicpmv.md
14 | phi3.md
15 | mllama.md
16 | qwen2_vl.md
17 | qwen2_5_vl.md
18 | molmo.md
19 | gemma3.md
20 |
--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | WORKSPACE_PATH=$(dirname "$(readlink -f "$0")")
3 |
4 | builder="-G Ninja"
5 |
6 | if [ "$1" == "make" ]; then
7 | builder=""
8 | fi
9 |
10 | cmake ${builder} .. \
11 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \
12 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
13 | -DCMAKE_INSTALL_PREFIX=${WORKSPACE_PATH}/install \
14 | -DBUILD_PY_FFI=ON \
15 | -DBUILD_MULTI_GPU=ON \
16 | -DCMAKE_CUDA_FLAGS="-lineinfo" \
17 | -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
18 | -DUSE_NVTX=ON
19 |
--------------------------------------------------------------------------------
/k8s/service.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Service
3 | metadata:
4 | labels:
5 | app: internlm2-chat-7b
6 | name: internlm2-chat-7b-svc
7 | spec:
8 | ports:
9 | - name: main
10 | port: 23333
11 | protocol: TCP
12 | targetPort: main
13 | selector:
14 | app: internlm2-chat-7b
15 | type: ClusterIP
16 |
--------------------------------------------------------------------------------
/lmdeploy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .api import client, pipeline, serve
4 | from .messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
5 | from .model import ChatTemplateConfig
6 | from .tokenizer import Tokenizer
7 | from .version import __version__, version_info
8 |
9 | __all__ = [
10 | 'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', '__version__', 'version_info', 'ChatTemplateConfig',
11 | 'PytorchEngineConfig', 'TurbomindEngineConfig', 'VisionConfig'
12 | ]
13 |
--------------------------------------------------------------------------------
/lmdeploy/__main__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .cli import run
3 |
4 | if __name__ == '__main__':
5 | run()
6 |
--------------------------------------------------------------------------------
/lmdeploy/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .entrypoint import run
3 |
4 | __all__ = ['run']
5 |
--------------------------------------------------------------------------------
/lmdeploy/lite/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .apis import * # noqa: F401,F403
3 | from .quantization import * # noqa: F401,F403
4 | from .utils import * # noqa: F401,F403
5 |
--------------------------------------------------------------------------------
/lmdeploy/lite/apis/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/lite/defaults.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch import nn
3 |
4 | OFFLOAD_MOD = (nn.Linear, )
5 | KV_CACHE_SIGNATURE = 'past_key_value'
6 |
--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/internlm2_gptq.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from auto_gptq.modeling import BaseGPTQForCausalLM
3 |
4 |
5 | class InternLM2GPTQForCausalLM(BaseGPTQForCausalLM):
6 | layer_type = 'InternLM2DecoderLayer'
7 | layers_block_name = 'model.layers'
8 | outside_layer_modules = ['model.tok_embeddings', 'model.norm']
9 | inside_layer_modules = [
10 | ['attention.wqkv'],
11 | ['attention.wo'],
12 | ['feed_forward.w3', 'feed_forward.w1'],
13 | ['feed_forward.w2'],
14 | ]
15 |
--------------------------------------------------------------------------------
/lmdeploy/lite/modeling/internlm3_gptq.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from auto_gptq.modeling import BaseGPTQForCausalLM
3 |
4 |
5 | class InternLM3GPTQForCausalLM(BaseGPTQForCausalLM):
6 | layer_type = 'InternLM3DecoderLayer'
7 | layers_block_name = 'model.layers'
8 | outside_layer_modules = ['model.embed_tokens', 'model.norm']
9 | inside_layer_modules = [
10 | ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'],
11 | ['self_attn.o_proj'],
12 | ['mlp.up_proj', 'mlp.gate_proj'],
13 | ['mlp.down_proj'],
14 | ]
15 |
--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .activation import ActivationObserver, KVCacheObserver
3 | from .calibration import CalibrationContext, CalibrationContextV2
4 | from .weight import WeightQuantizer
5 |
6 | __all__ = ['WeightQuantizer', 'ActivationObserver', 'KVCacheObserver', 'CalibrationContext', 'CalibrationContextV2']
7 |
--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/activation/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .observer import ActivationObserver, KVCacheObserver
3 |
4 | __all__ = ['ActivationObserver', 'KVCacheObserver']
5 |
--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .linear import WeightOnlyQLinear
3 |
4 | __all__ = ['WeightOnlyQLinear']
5 |
--------------------------------------------------------------------------------
/lmdeploy/lite/quantization/weight/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .quantizer import WeightQuantizer
3 |
4 | __all__ = ['WeightQuantizer']
5 |
--------------------------------------------------------------------------------
/lmdeploy/lite/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
4 | from .cal_qparams import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax,
5 | cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax,
6 | cal_qparams_per_tensor_minmax, precise_round)
7 | from .calib_dataloader import get_calib_loaders
8 | from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
9 | from .global_avail import GlobalAvailMixin
10 | from .load import load_hf_from_pretrained
11 |
12 | __all__ = [
13 | 'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax', 'cal_qparams_per_group_absmax',
14 | 'cal_qparams_per_group_minmax', 'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax', 'QParams',
15 | 'get_calib_loaders', 'collect_target_modules', 'precise_round', 'collect_target_weights', 'GlobalAvailMixin',
16 | 'split_decoder_layer_inputs', 'bimap_name_mod', 'concat_decoder_layer_outputs', 'load_hf_from_pretrained'
17 | ]
18 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/adapter/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import OpType # noqa: F401
3 | from .selector import get_backend # noqa: F401
4 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 |
4 |
5 | class SiluAndMulImpl(ABC):
6 | """Silu + multiple residual fused implementation."""
7 |
8 | @abstractmethod
9 | def forward(self, x):
10 | """forward."""
11 | raise NotImplementedError
12 |
13 |
14 | class SiluAndMulBuilder(ABC):
15 | """Silu and mul implementation builder."""
16 |
17 | @staticmethod
18 | @abstractmethod
19 | def build(inplace: bool = False):
20 | """build."""
21 | raise NotImplementedError
22 |
23 |
24 | class GeluAndMulImpl(ABC):
25 | """Gelu + multiple residual fused implementation."""
26 |
27 | @abstractmethod
28 | def forward(self, x):
29 | """forward."""
30 | raise NotImplementedError
31 |
32 |
33 | class GeluAndMulBuilder(ABC):
34 | """Gelu and mul implementation builder."""
35 |
36 | @staticmethod
37 | @abstractmethod
38 | def build(approximate: str = 'none'):
39 | """build."""
40 | raise NotImplementedError
41 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/apply_rotary_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 |
4 | from torch import Tensor
5 |
6 |
7 | class ApplyRotaryEmbImpl(ABC):
8 | """Apply rotary embedding implementation."""
9 |
10 | @abstractmethod
11 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
12 | """forward."""
13 | raise NotImplementedError
14 |
15 |
16 | class ApplyRotaryEmbBuilder(ABC):
17 | """Apply rotary embedding implementation builder."""
18 |
19 | @staticmethod
20 | @abstractmethod
21 | def build():
22 | """Build implementation."""
23 | raise NotImplementedError
24 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/awq_modules.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 | from typing import Optional
4 |
5 | import torch
6 |
7 |
8 | class LinearW4A16Impl(ABC):
9 | """W4a16 linear implementation."""
10 |
11 | def update_weights(self,
12 | qweight: torch.Tensor,
13 | scales: torch.Tensor,
14 | qzeros: torch.Tensor,
15 | bias: Optional[torch.Tensor] = None):
16 | """Update weights."""
17 | return qweight, scales, qzeros, bias
18 |
19 | @abstractmethod
20 | def forward(self, x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, all_reduce: bool = False):
21 | """forward."""
22 | raise NotImplementedError
23 |
24 |
25 | class LinearW4A16Builder(ABC):
26 | """W4a16 linear implementation builder."""
27 |
28 | @staticmethod
29 | @abstractmethod
30 | def build(in_features: int,
31 | out_features: int,
32 | w_bit: int,
33 | group_size: int,
34 | bias: bool = False,
35 | dtype: torch.dtype = None):
36 | """build."""
37 | raise NotImplementedError
38 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/blockedf8_modules.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 | from typing import List, Optional
4 |
5 | import torch
6 |
7 |
8 | class LinearBlockedF8Impl(ABC):
9 | """Linear BlockedF8 implementation api."""
10 |
11 | def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
12 | """Update weights."""
13 | return weight, scale, bias
14 |
15 | @abstractmethod
16 | def forward(self,
17 | x,
18 | weight: torch.Tensor,
19 | scale: torch.Tensor,
20 | bias: Optional[torch.Tensor] = None,
21 | all_reduce: bool = False,
22 | rank: int = 0,
23 | scatter_size: List[int] = None):
24 | """forward."""
25 | raise NotImplementedError
26 |
27 |
28 | class LinearBlockedF8Builder(ABC):
29 | """Linear BlockedF8 implementation builder."""
30 |
31 | @staticmethod
32 | @abstractmethod
33 | def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None):
34 | """build."""
35 | raise NotImplementedError
36 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import CudaOpsBackend # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul
3 |
4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl
5 |
6 |
7 | class TritonSiluAndMulImpl(SiluAndMulImpl):
8 | """Silu + multiple residual fused implementation."""
9 |
10 | def __init__(self, inplace: bool):
11 | self.inplace = inplace
12 |
13 | def forward(self, x):
14 | """forward."""
15 | out = None
16 | x_shape = None
17 | if x.dim() != 2:
18 | x_shape = x.shape
19 | x = x.flatten(0, -2)
20 | if self.inplace:
21 | out = x.chunk(2, -1)[0]
22 |
23 | out = silu_and_mul(x, out)
24 |
25 | if x_shape is not None:
26 | out = out.unflatten(0, x_shape[:-1])
27 | return out
28 |
29 |
30 | class TritonSiluAndMulBuilder(SiluAndMulBuilder):
31 | """Silu and mul implementation builder."""
32 |
33 | @staticmethod
34 | def build(inplace: bool = False):
35 | """build."""
36 | return TritonSiluAndMulImpl(inplace)
37 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/apply_rotary_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | from torch import Tensor
4 |
5 | from lmdeploy.pytorch.kernels.cuda import apply_rotary_pos_emb
6 |
7 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl
8 |
9 |
10 | class TritonApplyRotaryEmbImpl(ApplyRotaryEmbImpl):
11 | """Apply rotary embedding implementation."""
12 |
13 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
14 | """forward."""
15 | if inplace:
16 | q_embed = query
17 | k_embed = key
18 | else:
19 | q_embed = torch.empty_like(query)
20 | k_embed = torch.empty_like(key)
21 | return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed)
22 |
23 |
24 | class TritonApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder):
25 | """Apply rotary embedding implementation builder."""
26 |
27 | @staticmethod
28 | def build():
29 | """Build implementation."""
30 | return TritonApplyRotaryEmbImpl()
31 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | import torch
4 |
5 | from lmdeploy.pytorch.kernels.cuda import multinomial_sampling
6 |
7 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl
8 |
9 |
10 | class TritonMultinomialSamplingImpl(MultinomialSamplingImpl):
11 |
12 | def forward(self,
13 | scores: torch.Tensor,
14 | seeds: torch.LongTensor,
15 | offsets: torch.LongTensor,
16 | indices: torch.Tensor = None):
17 | """forward."""
18 | return multinomial_sampling(scores, seeds, offsets, indices)
19 |
20 |
21 | class TritonMultinomialSamplingBuilder(MultinomialSamplingBuilder):
22 | """Triton multinomial sampling builder."""
23 |
24 |     @staticmethod
25 |     def build():
26 |         """build."""
27 |         return TritonMultinomialSamplingImpl()
28 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/cuda/norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from lmdeploy.pytorch.kernels.cuda import rms_norm
5 |
6 | from ..norm import RMSNormBuilder, RMSNormImpl
7 |
8 |
9 | class TritonRMSNormImpl(RMSNormImpl):
10 | """Triton RMS norm implementation."""
11 |
12 | def __init__(self, hidden_size: int, eps: float = 1e-6):
13 | self.hidden_size = hidden_size
14 | self.eps = eps
15 |
16 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
17 | """forward."""
18 | if residual is None:
19 | x = rms_norm(x, weight, self.eps)
20 | return x
21 | else:
22 | x, residual = rms_norm(x, weight, self.eps, residual=residual)
23 | return x, residual
24 |
25 |
26 | class TritonRMSNormBuilder(RMSNormBuilder):
27 | """Triton RMS norm implementation builder."""
28 |
29 | @staticmethod
30 |     def build(hidden_size: int, eps: float = 1e-6):
31 |         """build."""
32 |         return TritonRMSNormImpl(hidden_size, eps)
33 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import DefaultOpsBackend # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/moe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from ..moe import SoftmaxTopKBuilder, SoftmaxTopKImpl
5 |
6 |
7 | class DefaultSoftmaxTopKImpl(SoftmaxTopKImpl):
8 |     """Softmax top-k implementation api."""
9 |
10 | def __init__(self, top_k: int, dim: int = -1):
11 | self.top_k = top_k
12 | self.dim = dim
13 |
14 | def forward(self, x: torch.Tensor):
15 | """forward."""
16 | routing_weights = torch.softmax(x, dim=self.dim, dtype=torch.float32)
17 | topk_weights, topk_ids = torch.topk(routing_weights, self.top_k, dim=self.dim)
18 | return topk_weights, topk_ids
19 |
20 |
21 | class DefaultSoftmaxTopKBuilder(SoftmaxTopKBuilder):
22 |     """Softmax top-k implementation builder."""
23 |
24 | @staticmethod
25 | def build(top_k: int, dim: int = -1):
26 | """build."""
27 | return DefaultSoftmaxTopKImpl(top_k, dim)
28 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/default/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | import torch
4 |
5 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl
6 |
7 |
8 | class DefaultMultinomialSamplingImpl(MultinomialSamplingImpl):
9 | """Multinomial sampling implementation api."""
10 |
11 | def forward(self,
12 | scores: torch.Tensor,
13 | seeds: torch.LongTensor,
14 | offsets: torch.LongTensor,
15 | indices: torch.Tensor = None):
16 | """forward."""
17 | sampled_index = torch.multinomial(scores, num_samples=1, replacement=True)
18 | outputs = torch.gather(indices, dim=1, index=sampled_index)
19 | return outputs.view(-1)
20 |
21 |
22 | class DefaultMultinomialSamplingBuilder(MultinomialSamplingBuilder):
23 | """Multinomial sampling implementation builder."""
24 |
25 |     @staticmethod
26 |     def build():
27 |         """build."""
28 |         return DefaultMultinomialSamplingImpl()
29 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .ascend import AscendOpsBackend # noqa: F401
3 | from .camb import CambOpsBackend # noqa: F401
4 | from .maca import MacaOpsBackend # noqa: F401
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from lmdeploy.pytorch.kernels.dlinfer.activation import silu_and_mul
3 |
4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl
5 |
6 |
7 | class DlinferSiluAndMulImpl(SiluAndMulImpl):
8 | """Silu + multiple fused implementation."""
9 |
10 | def forward(self, x):
11 | """forward."""
12 | return silu_and_mul(x)
13 |
14 |
15 | class DlinferSiluAndMulBuilder(SiluAndMulBuilder):
16 | """Silu and mul implementation builder."""
17 |
18 | @staticmethod
19 | def build(inplace: bool = False):
20 | """build."""
21 | return DlinferSiluAndMulImpl()
22 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch import Tensor
3 |
4 | from lmdeploy.pytorch.kernels.dlinfer import apply_rotary_pos_emb
5 |
6 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl
7 |
8 |
9 | class DlinferApplyRotaryEmbImpl(ApplyRotaryEmbImpl):
10 | """Apply rotary embedding implementation."""
11 |
12 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True):
13 | """forward."""
14 | if inplace:
15 | q_embed = None
16 | k_embed = None
17 | else:
18 | q_embed = query.new_empty(query.shape)
19 | k_embed = key.new_empty(key.shape)
20 | return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed)
21 |
22 |
23 | class DlinferApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder):
24 | """Apply rotary embedding implementation builder."""
25 |
26 | @staticmethod
27 | def build():
28 | """Build implementation."""
29 | return DlinferApplyRotaryEmbImpl()
30 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/ascend/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import AscendOpsBackend, SocVersion # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/camb/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import CambOpsBackend # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/maca/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .op_backend import MacaOpsBackend # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/dlinfer/norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from lmdeploy.pytorch.kernels.dlinfer import rms_norm
5 |
6 | from ..norm import RMSNormBuilder, RMSNormImpl
7 |
8 |
9 | class DlinferRMSNormImpl(RMSNormImpl):
10 | """Dlinfer RMS norm implementation."""
11 |
12 | def __init__(self, hidden_size: int, eps: float = 1e-6):
13 | self.hidden_size = hidden_size
14 | self.eps = eps
15 |
16 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
17 | """forward."""
18 | if residual is None:
19 | x = rms_norm(x, weight, self.eps)
20 | return x
21 | else:
22 | x, residual = rms_norm(x, weight, self.eps, residual=residual)
23 | return x, residual
24 |
25 |
26 | class DlinferRMSNormBuilder(RMSNormBuilder):
27 | """Dlinfer RMS norm implementation builder."""
28 |
29 | @staticmethod
30 |     def build(hidden_size: int, eps: float = 1e-6):
31 |         """build."""
32 |         return DlinferRMSNormImpl(hidden_size, eps)
33 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/flash_attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 |
4 | from torch import Tensor
5 |
6 |
7 | class FlashAttentionImpl(ABC):
8 | """FlashAttention implementation."""
9 |
10 | def forward(self,
11 | query: Tensor,
12 | key: Tensor,
13 | value: Tensor,
14 | q_start_loc: Tensor,
15 | q_seqlens: Tensor,
16 | kv_start_loc: Tensor,
17 | kv_seqlens: Tensor,
18 | max_q_seqlen: int = None):
19 | """forward."""
20 | raise NotImplementedError
21 |
22 |
23 | class FlashAttentionBuilder(ABC):
24 | """FlashAttention implementation builder."""
25 |
26 | @staticmethod
27 | @abstractmethod
28 | def build(
29 | num_heads: int,
30 | head_dim: int,
31 | scale: float = None,
32 | num_kv_heads: int = None,
33 | v_head_dim: int = None,
34 | causal: bool = True,
35 | sliding_window: int = None,
36 | logical_softcapping: float = None,
37 | **kwargs,
38 | ) -> FlashAttentionImpl:
39 | """build."""
40 | raise NotImplementedError
41 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/linear.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 | from typing import List, Optional
4 |
5 | import torch
6 |
7 |
8 | class LinearImpl(ABC):
9 | """Linear implementation api."""
10 |
11 | def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
12 | """Update weights."""
13 | return weight, bias
14 |
15 | @abstractmethod
16 | def forward(self,
17 | x,
18 | weight: torch.Tensor,
19 | bias: Optional[torch.Tensor] = None,
20 | all_reduce: bool = False,
21 | rank: int = 0,
22 | scatter_size: List[int] = None):
23 | """forward."""
24 | raise NotImplementedError
25 |
26 |
27 | class LinearBuilder(ABC):
28 | """Linear implementation builder."""
29 |
30 | @staticmethod
31 | @abstractmethod
32 | def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None):
33 | """build."""
34 | raise NotImplementedError
35 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 |
4 | import torch
5 |
6 |
7 | class MultinomialSamplingImpl(ABC):
8 | """Multinomial sampling implementation api."""
9 |
10 | @abstractmethod
11 |     def forward(self, scores: torch.Tensor, seeds: torch.LongTensor, offsets: torch.LongTensor, indices: torch.Tensor = None):
12 | """forward."""
13 | raise NotImplementedError
14 |
15 |
16 | class MultinomialSamplingBuilder(ABC):
17 | """Multinomial sampling implementation builder."""
18 |
19 | @staticmethod
20 | @abstractmethod
21 | def build():
22 | """build."""
23 | raise NotImplementedError
24 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/backends/norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABC, abstractmethod
3 |
4 | import torch
5 |
6 |
7 | class RMSNormImpl(ABC):
8 | """RMS norm implementation api."""
9 |
10 | @abstractmethod
11 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None):
12 | """forward."""
13 | raise NotImplementedError
14 |
15 |
16 | class RMSNormBuilder(ABC):
17 | """RMS norm implementation builder."""
18 |
19 | @staticmethod
20 | @abstractmethod
21 | def build(hidden_size: int, eps: float = 1e-6):
22 | """build."""
23 | raise NotImplementedError
24 |
25 |
26 | class LayerNormImpl(ABC):
27 | """Layer norm implementation api."""
28 |
29 | @abstractmethod
30 | def forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor = None, residual: torch.Tensor = None):
31 | """forward."""
32 | raise NotImplementedError
33 |
34 |
35 | class LayerNormBuilder(ABC):
36 | """Layer norm implementation builder."""
37 |
38 | @staticmethod
39 | @abstractmethod
40 | def build(normalized_shape: int, eps: float = 1e-6):
41 | """build."""
42 | raise NotImplementedError
43 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/adapter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import BaseChecker
3 |
4 |
5 | class AdapterChecker(BaseChecker):
6 | """Check adapter is available."""
7 |
8 | def __init__(self, adapter_path: str, logger=None):
9 | super().__init__(logger)
10 | self.adapter_path = adapter_path
11 |
12 | def check(self):
13 | """check."""
14 | path = self.adapter_path
15 |
16 | try:
17 | import peft # noqa: F401
18 | except Exception as e:
19 | self.log_and_exit(e, 'Adapter', message='Failed to import peft.')
20 |
21 | try:
22 | from peft import PeftConfig
23 | PeftConfig.from_pretrained(path)
24 | except Exception as e:
25 | message = ('Please make sure the adapter can be loaded with '
26 | '`peft.PeftConfig.from_pretrained`\n')
27 | err_msg = '' if len(e.args) == 0 else e.args[0]
28 | if 'got an unexpected keyword argument' in err_msg:
29 | message += ('Or try remove all unexpected keywords '
30 | 'in `adapter_config.json`.')
31 | self.log_and_exit(e, 'Adapter', message=message)
32 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/deeplink.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from lmdeploy.utils import try_import_deeplink
3 |
4 | from .base import BaseChecker
5 |
6 |
7 | class DeeplinkChecker(BaseChecker):
8 |     """Check deeplink is available."""
9 |
10 | def __init__(self, device_type: str, logger=None) -> None:
11 | super().__init__(logger=logger)
12 | self.device_type = device_type
13 |
14 | def check(self):
15 | """check."""
16 | try_import_deeplink(self.device_type)
17 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/torch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import BaseChecker
3 |
4 |
5 | class TorchChecker(BaseChecker):
6 | """Check pytorch is available."""
7 |
8 | def __init__(self, device: str = 'cuda', logger=None) -> None:
9 | super().__init__(logger=logger)
10 | self.device = device
11 |
12 | def check(self):
13 | """check."""
14 | try:
15 | import torch
16 | a = torch.tensor([1, 2], device=self.device)
17 | b = a.new_tensor([3, 4], device=self.device)
18 | c = a + b
19 | torch.testing.assert_close(c, a.new_tensor([4, 6]))
20 | except Exception as e:
21 | self.log_and_exit(e, 'PyTorch', 'PyTorch is not available.')
22 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/transformers.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from packaging import version
3 |
4 | from .base import BaseChecker
5 |
6 | MIN_TRANSFORMERS_VERSION = '4.33.0'
7 | MAX_TRANSFORMERS_VERSION = '4.49.0'
8 |
9 |
10 | class TransformersChecker(BaseChecker):
11 | """Check transformers is available."""
12 |
13 | def check(self):
14 | """check."""
15 | import transformers
16 | logger = self.get_logger()
17 | try:
18 | trans_version = version.parse(transformers.__version__)
19 | min_version = version.parse(MIN_TRANSFORMERS_VERSION)
20 | max_version = version.parse(MAX_TRANSFORMERS_VERSION)
21 | if trans_version < min_version or trans_version > max_version:
22 | logger.warning('LMDeploy requires transformers version: '
23 | f'[{MIN_TRANSFORMERS_VERSION} ~ '
24 | f'{MAX_TRANSFORMERS_VERSION}], '
25 | 'but found version: '
26 | f'{transformers.__version__}')
27 | except Exception as e:
28 | self.log_and_exit(e, 'transformers', 'transformers is not available.')
29 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/check_env/triton_custom_add.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import triton
4 | import triton.language as tl
5 |
6 |
7 | @triton.jit
8 | def _add_kernel(A, B, C, size, BLOCK: tl.constexpr):
9 | """Add kernel."""
10 | prog_id = tl.program_id(0)
11 | offs = prog_id * BLOCK + tl.arange(0, BLOCK)
12 | a = tl.load(A + offs, mask=offs < size)
13 | b = tl.load(B + offs, mask=offs < size)
14 | tl.store(C + offs, a + b, mask=offs < size)
15 |
16 |
17 | def custom_add(a, b):
18 | """Custom add one."""
19 | c = torch.empty_like(a)
20 | size = c.size(0)
21 | BLOCK = 16
22 |
23 | grid = (triton.cdiv(size, BLOCK), )
24 | _add_kernel[grid](a, b, c, size, BLOCK=BLOCK)
25 | return c
26 |
27 |
28 | if __name__ == '__main__':
29 | a = torch.tensor([1, 2], device='cuda')
30 | b = a.new_tensor([3, 4], device='cuda')
31 | c = custom_add(a, b)
32 | torch.testing.assert_close(c, a + b)
33 | print('Done.')
34 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import importlib
3 | import pkgutil
4 |
5 | from .builder import AutoModelConfigBuilder
6 |
7 | __all__ = []
8 |
9 | # load all submodule
10 | for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
11 | __all__.append(module_name)
12 | _module = importlib.import_module('{}.{}'.format(__name__, module_name))
13 | globals()[module_name] = _module
14 |
15 | __all__ += ['AutoModelConfigBuilder']
16 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/cogvlm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class CogVLMModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | model_arch = hf_config.architectures[0] if hf_config.architectures else None
12 | return model_arch == 'CogVLMForCausalLM'
13 |
14 | @classmethod
15 | def build(cls, hf_config, model_path: str = None, **kwargs):
16 | """build."""
17 | from lmdeploy.utils import is_bf16_supported
18 | if getattr(hf_config, 'num_multi_query_heads', None):
19 | hf_config.num_key_value_heads = hf_config.num_multi_query_heads
20 | else:
21 | hf_config.num_key_value_heads = hf_config.num_attention_heads
22 |
23 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
24 | cfg.cogvlm_style = True
25 | torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
26 | hf_config.torch_dtype = torch_dtype
27 | return cfg
28 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/deepseek_vl2.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class DeepseekVLV2ModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.model_type in ['deepseek_vl_v2']
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 | """Build deepseek-vl2."""
16 |
17 | if hf_config.language_config.use_mla:
18 | from .deepseek_v2 import DeepseekV2ModelConfigBuilder
19 | cfg = DeepseekV2ModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs)
20 | cfg.hf_config = hf_config
21 | else:
22 | # deepseek-vl2-tiny uses MHA, rather than MLA
23 | # in this case, we use DefaultModelConfigBuilder
24 | cfg = DefaultModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs)
25 | cfg.hf_config = hf_config
26 |
27 | return cfg
28 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/gemma.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class GemmaModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.model_type in ['gemma', 'gemma2', 'gemma3_text']
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 | """Build gemma."""
16 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
17 | cfg.head_dim = hf_config.head_dim
18 | return cfg
19 |
20 |
21 | class GemmaVLModelConfigBuilder(AutoModelConfigBuilder):
22 |
23 | @classmethod
24 | def condition(cls, hf_config):
25 | """config."""
26 | model_arch = hf_config.architectures[0] if hf_config.architectures else None
27 | return model_arch == 'Gemma3ForConditionalGeneration'
28 |
29 | @classmethod
30 | def build(cls, hf_config, model_path: str = None, **kwargs):
31 | """Build gemma."""
32 | hf_config.text_config.architectures = ['Gemma3ForCausalLM']
33 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
34 | cfg.hf_config = hf_config
35 | return cfg
36 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/internvl.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class InternVLModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.architectures[0] == 'InternVLChatModel'
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build internvl."""
16 | cfg = DefaultModelConfigBuilder.build(hf_config.llm_config, model_path, **kwargs)
17 | cfg.hf_config = hf_config
18 | return cfg
19 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/llama4.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class Llama4ModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.model_type in ['llama4']
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 | """Build llama4."""
16 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
17 | cfg.hf_config = hf_config
18 |
19 | return cfg
20 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/minicpm3.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .builder import AutoModelConfigBuilder
4 | from .default import DefaultModelConfigBuilder
5 |
6 |
7 | class MiniCPM3ModelConfigBuilder(AutoModelConfigBuilder):
8 |
9 | @classmethod
10 | def condition(cls, hf_config):
11 | """config."""
12 | return hf_config.architectures[0] in ['MiniCPM3ForCausalLM']
13 |
14 | @classmethod
15 | def build(cls, hf_config, model_path: str = None, **kwargs):
16 | """build."""
17 | head_dim = (hf_config.qk_nope_head_dim + hf_config.qk_rope_head_dim)
18 |
19 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
20 | cfg.head_dim = head_dim
21 | cfg.k_head_dim = head_dim
22 | cfg.v_head_dim = head_dim
23 |
24 | return cfg
25 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/mllama.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class MLlamaModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.architectures[0] == 'MllamaForConditionalGeneration'
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 |         """Build mllama."""
16 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)
17 | cfg.hf_config = hf_config
18 | return cfg
19 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/qwen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import AutoModelConfigBuilder
3 | from .default import DefaultModelConfigBuilder
4 |
5 |
6 | class QwenModelConfigBuilder(AutoModelConfigBuilder):
7 |
8 | @classmethod
9 | def condition(cls, hf_config):
10 | """config."""
11 | return hf_config.model_type == 'qwen'
12 |
13 | @classmethod
14 | def build(cls, hf_config, model_path: str = None, **kwargs):
15 | """build."""
16 | from lmdeploy.utils import is_bf16_supported
17 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
18 | if cfg.bos_token_id is None:
19 | cfg.bos_token_id = 151644
20 | if cfg.eos_token_id is None:
21 | cfg.eos_token_id = 151645
22 |
23 | torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16'
24 | if hf_config.bf16 and is_bf16_supported():
25 | torch_dtype = 'bfloat16'
26 | elif hf_config.fp16:
27 | torch_dtype = 'float16'
28 | hf_config.torch_dtype = torch_dtype
29 | return cfg
30 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/configurations/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from lmdeploy.utils import get_logger
5 |
6 | logger = get_logger('lmdeploy')
7 |
8 |
9 | def flash_mla_available():
10 | """Check if flash mla is available."""
11 | # use flash_mla by default if it is installed
12 | use_flash_mla = False
13 | try:
14 | # torch_npu device_properties doesn't have 'major' attribute
15 | device_properties = torch.cuda.get_device_properties(0)
16 | if hasattr(device_properties, 'major') and device_properties.major >= 9:
17 | import flash_mla_cuda # noqa
18 | use_flash_mla = True
19 | except ImportError:
20 | logger.warning('For higher performance, please install flash_mla https://github.com/deepseek-ai/FlashMLA')
21 | return use_flash_mla
22 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/devices/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .device_manager import DefaultContext, DeviceContext, get_device_manager
3 |
4 | __all__ = ['DeviceContext', 'DefaultContext', 'get_device_manager']
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/backend/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from lmdeploy.logger import get_logger
3 |
4 | logger = get_logger('lmdeploy')
5 |
6 | try:
7 | logger.debug('Registering DLSlime Backend')
8 | from .dlslime import DLSlimeBackend
9 | except ImportError:
10 | logger.warning('Disable DLSlime Backend')
11 |
12 | try:
13 | logger.debug('Registering Mooncake Backend')
14 | from .mooncake import MooncakeBackend
15 | except ImportError:
16 | logger.warning('Disable Mooncake Backend')
17 |
18 | try:
19 | logger.debug('Registering InfiniStoreBackend Backend')
20 | from .infinistore import InfiniStoreBackend
21 | except ImportError:
22 | logger.warning('Disable InfiniStoreBackend Backend')
23 |
24 | __all__ = ['DLSlimeBackend', 'MooncakeBackend', 'InfiniStoreBackend']
25 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/backend/backend.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from mmengine.registry import Registry
3 |
4 | MIGRATION_BACKENDS = Registry('migration_backend', locations=['lmdeploy.pytorch.disagg.backend.backend'])
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/disagg/request.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import List, Optional
3 |
4 | from pydantic import BaseModel
5 |
6 | from lmdeploy.pytorch.disagg.config import (DistServeEngineConfig, DistServeNVLinkConfig, DistServeRDMAConfig,
7 | DistServeTCPConfig, MigrationProtocol)
8 |
9 |
10 | class DistServeConnectionRequest(BaseModel):
11 | protocol: MigrationProtocol
12 | remote_engine_id: str
13 | remote_endpoint_info: str
14 |
15 |
16 | class DistServeInitRequest(BaseModel):
17 | local_engine_id: str
18 | local_engine_config: DistServeEngineConfig
19 |
20 | remote_engine_id: str
21 | remote_engine_config: DistServeEngineConfig
22 |
23 | protocol: MigrationProtocol
24 |
25 | rank: Optional[int] = None
26 |
27 | tcp_config: Optional[DistServeTCPConfig] = None
28 | rdma_config: Optional[DistServeRDMAConfig] = None
29 | nvlink_config: Optional[DistServeNVLinkConfig] = None
30 |
31 |
32 | class MigrationRequest(BaseModel):
33 | protocol: MigrationProtocol
34 |
35 | remote_engine_id: str
36 | remote_session_id: int
37 | remote_token_id: int
38 | remote_block_ids: List[int]
39 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .engine import Engine
3 | from .engine_instance import EngineInstance
4 |
5 | __all__ = ['Engine', 'EngineInstance']
6 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .alibi_pagedattention import alibi_paged_attention_fwd
4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
5 | from .fill_kv_cache import fill_kv_cache
6 | from .fused_moe import fused_moe
7 | from .fused_rotary_emb import fused_rotary_emb
8 | from .multinomial_sampling import multinomial_sampling
9 | from .pagedattention import paged_attention_fwd
10 | from .rms_norm import rms_norm
11 | from .w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8,
12 | rms_norm_dynamic_quant)
13 |
14 | __all__ = [
15 | 'apply_rotary_pos_emb',
16 | 'fused_moe',
17 | 'fused_rotary_emb',
18 | 'paged_attention_fwd',
19 | 'alibi_paged_attention_fwd',
20 | 'fill_kv_cache',
21 | 'multinomial_sampling',
22 | 'rms_norm',
23 | 'matmul_kernel_dynamic_quant',
24 | 'per_channel_quant',
25 | 'per_token_quant_int8',
26 | 'rms_norm_dynamic_quant',
27 | ]
28 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/alibi_pagedattention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | alibi_paged_attention_fwd = FunctionDispatcher('alibi_paged_attention_fwd').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/apply_rotary_pos_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | apply_rotary_pos_emb = FunctionDispatcher('apply_rotary_pos_emb').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/cuda/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ..default.w8a8_kernels import per_channel_quant
3 | from .alibi_pagedattention import alibi_paged_attention_fwd
4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
5 | from .fill_kv_cache import fill_kv_cache
6 | from .flash_mla import flash_mla_fwd
7 | from .flashattention import flash_attention_fwd
8 | from .flatten_kv_cache import flatten_kv_cache
9 | from .fused_moe import fused_moe
10 | from .fused_rotary_emb import fused_rotary_emb
11 | from .multinomial_sampling import multinomial_sampling
12 | from .pagedattention import paged_attention_fwd
13 | from .rms_norm import rms_norm
14 | from .w8a8_fused_moe import fused_moe_w8a8
15 | from .w8a8_triton_kernels import matmul_kernel_dynamic_quant, per_token_quant_int8, rms_norm_dynamic_quant
16 |
17 | __all__ = [
18 | 'apply_rotary_pos_emb',
19 | 'fused_moe',
20 | 'fused_rotary_emb',
21 | 'paged_attention_fwd',
22 | 'alibi_paged_attention_fwd',
23 | 'fill_kv_cache',
24 | 'multinomial_sampling',
25 | 'rms_norm',
26 | 'matmul_kernel_dynamic_quant',
27 | 'per_channel_quant',
28 | 'per_token_quant_int8',
29 | 'rms_norm_dynamic_quant',
30 | 'flash_attention_fwd',
31 | 'flatten_kv_cache',
32 | 'fused_moe_w8a8',
33 | 'flash_mla_fwd',
34 | ]
35 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/cuda/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import functools
3 |
4 | import torch
5 |
6 | WARPS_PER_SM = {
7 | (8, 0): 64,
8 | (8, 6): 48,
9 | (8, 7): 48,
10 | (8, 9): 48,
11 | (9, 0): 64,
12 | (10, 0): 64,
13 | (10, 1): 48,
14 | (12, 0): 48,
15 | }
16 |
17 |
18 | @functools.lru_cache
19 | def get_device_props(device=None):
20 | if device is None:
21 | device = torch.cuda.current_device()
22 |
23 | props = torch.cuda.get_device_properties(device)
24 |
25 | warps_per_sm = WARPS_PER_SM.get((props.major, props.minor), 32)
26 | out = dict(
27 | multi_processor_count=props.multi_processor_count,
28 | warps_per_sm=warps_per_sm,
29 | )
30 | return out
31 |
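Editor's note: `get_device_props` caches a small dict that launch-sizing code can consult. The helper below is hypothetical (not part of lmdeploy) and only illustrates one plausible use, capping a persistent grid at the number of warps the GPU can keep resident; it assumes a CUDA device is available.

from lmdeploy.pytorch.kernels.cuda.utils import get_device_props

def estimate_max_grid(num_warps_per_block: int, device=None) -> int:
    """Hypothetical: upper-bound the grid size by resident-warp capacity."""
    props = get_device_props(device)
    blocks_per_sm = max(1, props['warps_per_sm'] // num_warps_per_block)
    return props['multi_processor_count'] * blocks_per_sm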
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .multinomial_sampling import multinomial_sampling
3 | from .w8a8_kernels import per_channel_quant
4 |
5 | __all__ = [
6 | 'multinomial_sampling',
7 | 'per_channel_quant',
8 | ]
9 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | from torch import LongTensor, Tensor
4 |
5 |
6 | def multinomial_sampling(scores: Tensor, seeds: LongTensor, offsets: LongTensor, indices: Tensor = None):
7 | sampled_index = torch.multinomial(scores, num_samples=1, replacement=True)
8 | outputs = torch.gather(indices, dim=1, index=sampled_index)
9 | return outputs.view(-1)
10 |
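Editor's note: the default backend simply delegates to `torch.multinomial` and maps the sampled column back to token ids via `indices`; `seeds` and `offsets` appear to exist only for signature parity with the Triton kernel and are unused here. A small sketch:

import torch

from lmdeploy.pytorch.kernels.default.multinomial_sampling import multinomial_sampling

scores = torch.softmax(torch.randn(2, 8), dim=-1)   # [batch, top-k] probabilities
indices = torch.arange(8).repeat(2, 1)              # token ids matching each column
seeds = offsets = torch.zeros(2, dtype=torch.long)  # ignored by this backend
token_ids = multinomial_sampling(scores, seeds, offsets, indices)
assert token_ids.shape == (2,)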
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/default/w8a8_kernels.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 |
5 | def per_channel_quant(x: torch.Tensor, dtype: torch.dtype):
6 | """Quantize the input tensor 'x' channel-wise using the given number of
7 | bits.
8 |
9 | Args:
10 | x (torch.Tensor): The input tensor to be quantized. Must be a
11 | 2-dimensional tensor.
12 | dtype (torch.dtype): The data type to which the quantized tensor should
13 | be converted.
14 |
15 | Returns:
16 | tuple: A tuple containing two items -- the quantized tensor and
17 | the scale used for quantization.
18 | """
19 | assert x.ndim == 2
20 | x = x.to(torch.float32)
21 | x_absmax = x.view(x.shape[0], -1).abs().max(dim=1, keepdim=True)[0]
22 | qtype_info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype)
23 | q_max = qtype_info.max
24 | q_min = qtype_info.min
25 | scale = x_absmax / q_max
26 | x_q = x / scale
27 | if not dtype.is_floating_point:
28 | x_q = torch.round(x_q)
29 | x_q = x_q.clamp(q_min, q_max).to(dtype)
30 | return x_q, scale
31 |
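Editor's note, a quick round trip to show what the returned pair means: `x_q * scale` approximately recovers `x`, with one scale per row (channel). A minimal sketch assuming int8 quantization:

import torch

from lmdeploy.pytorch.kernels.default.w8a8_kernels import per_channel_quant

x = torch.randn(4, 16)
x_q, scale = per_channel_quant(x, torch.int8)
assert x_q.dtype == torch.int8 and scale.shape == (4, 1)
dequant = x_q.float() * scale
print((dequant - x).abs().max())  # small quantization error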
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ..default import multinomial_sampling, per_channel_quant
3 | from .apply_rotary_pos_emb import apply_rotary_pos_emb
4 | from .awq_kernels import awq_linear
5 | from .fill_kv_cache import fill_kv_cache
6 | from .flash_attention import flash_attention_fwd
7 | from .fused_moe import fused_moe
8 | from .linear import linear
9 | from .moe_gating_topk_softmax import moe_gating_topk_softmax
10 | from .pagedattention import paged_attention_fwd
11 | from .rms_norm import rms_norm
12 |
13 | __all__ = [
14 | 'rms_norm',
15 | 'apply_rotary_pos_emb',
16 | 'awq_linear',
17 | 'fill_kv_cache',
18 | 'fused_moe',
19 | 'paged_attention_fwd',
20 | 'flash_attention_fwd',
21 | 'linear',
22 | 'moe_gating_topk_softmax',
23 | 'multinomial_sampling',
24 | 'per_channel_quant',
25 | ]
26 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 |
5 |
 6 | def silu_and_mul(input_tensor: Tensor) -> Tensor:
7 | return ext_ops.silu_and_mul(input_tensor)
8 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional, Tuple
3 |
4 | import dlinfer.ops as ext_ops
5 | from torch import Tensor
6 |
7 |
8 | def apply_rotary_pos_emb(
9 | query_states: Tensor,
10 | key_states: Tensor,
11 | cos: Tensor,
12 | sin: Tensor,
13 | q_embed: Optional[Tensor],
14 | k_embed: Optional[Tensor],
15 | ) -> Tuple[Tensor, Tensor]:
16 | query_states_embed, key_states_embed = \
17 | ext_ops.apply_rotary_pos_emb(query_states,
18 | key_states,
19 | cos, sin)
20 | if q_embed is None:
21 | q_embed = query_states_embed.view(query_states.shape)
22 | elif q_embed is not query_states:
23 | q_embed.copy_(query_states_embed.view(query_states.shape))
24 |
25 | if k_embed is None:
26 | k_embed = key_states_embed.view(key_states.shape)
27 | elif k_embed is not key_states:
28 | k_embed.copy_(key_states_embed.view(key_states.shape))
29 |
30 | return q_embed, k_embed
31 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional
3 |
4 | import dlinfer.ops as ext_ops
5 | from torch import Tensor
6 |
7 |
8 | def awq_linear(x: Tensor,
9 | qweight: Tensor,
10 | scales: Tensor,
11 | qzeros: Tensor,
12 | bias: Optional[Tensor] = None,
13 | all_reduce: bool = False,
14 | group_size: int = 0):
15 | return ext_ops.weight_quant_matmul(x.squeeze(0),
16 | qweight,
17 | scales,
18 | offset=qzeros,
19 | bias=bias,
20 | all_reduce=all_reduce,
21 | group_size=group_size).unsqueeze(0)
22 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional, Sequence
3 |
4 | import dlinfer.ops as ext_ops
5 | from torch import Tensor
6 |
7 |
8 | def fill_kv_cache(
9 | key_states: Tensor,
10 | value_states: Tensor,
11 | key_caches: Tensor,
12 | value_caches: Tensor,
13 | kv_start_indices: Tensor,
14 | k_scales_zeros: Sequence[Optional[Tensor]],
15 | v_scales_zeros: Sequence[Optional[Tensor]],
16 | quant_bits: int = 0,
17 | ):
18 | """Fill key/value state to cache for paged attention."""
19 | return ext_ops.fill_kv_cache(key_states,
20 | value_states,
21 | key_caches,
22 | value_caches,
23 | kv_start_indices,
24 | k_scales_zeros=k_scales_zeros,
25 | v_scales_zeros=v_scales_zeros,
26 | quant_bits=quant_bits)
27 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/flash_attention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from dlinfer.utils.type_annotation import Tensor
4 |
5 |
6 | def flash_attention_fwd(
7 | query_states: Tensor,
8 | key_states: Tensor,
9 | value_states: Tensor,
10 | attn_output: Tensor,
11 | q_start_loc: Tensor,
12 | q_seqlens: Tensor,
13 | kv_start_loc: Tensor,
14 | kv_seqlens: Tensor,
15 | num_heads: int,
16 | num_kv_heads: int,
17 | max_q_seqlen: int = None,
18 | window_size: int = None,
19 | sm_scale: float = None,
20 | logit_softcapping: float = None,
21 | causal: bool = True,
22 | ):
23 | return ext_ops.prefill_attention(
24 | query_states,
25 | key_states,
26 | value_states,
27 | None,
28 | None,
29 | q_start_loc,
30 | q_seqlens,
31 | kv_seqlens,
32 | max_q_seqlen,
33 | num_heads,
34 | num_kv_heads,
35 | attn_mask=[],
36 | softmax_scale=sm_scale,
37 | attn_output=attn_output,
38 | )
39 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/fused_moe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 |
5 |
6 | def fused_moe(
7 | hidden_states: Tensor,
8 | gate_up_weights: Tensor,
9 | down_weights: Tensor,
10 | topk_weights: Tensor,
11 | topk_ids: Tensor,
12 | topk: int,
13 | renormalize: bool,
14 | ):
15 | """Dlinfer fused moe."""
16 | return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize)
17 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/linear.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Optional
3 |
4 | import dlinfer.ops as ext_ops
5 | from torch import Tensor
6 |
7 |
8 | def linear(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, all_reduce: bool = False, group: str = ''):
9 | return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce, group=group)
10 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 |
5 |
6 | def moe_gating_topk_softmax(router_logits: Tensor, topk: int):
7 | routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk)
8 | return routing_weights, selected_experts
9 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/dlinfer/rms_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import dlinfer.ops as ext_ops
3 | from torch import Tensor
4 |
5 |
6 | def rms_norm(hidden_states: Tensor, weight: Tensor, epsilon: float = 1e-6, residual: Tensor = None, out: Tensor = None):
7 | if residual is None:
8 | rms_norm_out = ext_ops.rms_norm(hidden_states, weight, epsilon)
9 | if out is None:
10 | out = rms_norm_out
11 | else:
12 | out.copy_(rms_norm_out)
13 | return out
14 | else:
15 | return ext_ops.add_rms_norm(hidden_states, residual, weight, epsilon)
16 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fill_kv_cache.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | fill_kv_cache = FunctionDispatcher('fill_kv_cache').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/flash_mla.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | flash_mla_fwd = FunctionDispatcher('flash_mla_fwd').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fused_moe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | fused_moe = FunctionDispatcher('fused_moe').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/fused_rotary_emb.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | fused_rotary_emb = FunctionDispatcher('fused_rotary_emb').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | moe_gating_topk_softmax = FunctionDispatcher('moe_gating_topk_softmax').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | multinomial_sampling = FunctionDispatcher('multinomial_sampling').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/pagedattention.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | paged_attention_fwd = FunctionDispatcher('paged_attention_fwd').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/rms_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | rms_norm = FunctionDispatcher('rms_norm').make_caller()
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/kernels/w8a8_triton_kernels.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .dispatcher import FunctionDispatcher
3 |
4 | per_channel_quant = FunctionDispatcher('per_channel_quant').make_caller()
5 |
6 | matmul_kernel_dynamic_quant = FunctionDispatcher('matmul_kernel_dynamic_quant').make_caller()
7 |
8 | per_token_quant_int8 = FunctionDispatcher('per_token_quant_int8').make_caller()
9 |
10 | rms_norm_dynamic_quant = FunctionDispatcher('rms_norm_dynamic_quant').make_caller()
11 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .q_modules import QLinear, QRMSNorm
3 |
4 | __all__ = ['QLinear', 'QRMSNorm']
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/models/utils/multimodal.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import List, Tuple
3 |
4 | from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs
5 |
6 | PreparedInputs = Tuple[List[int], MultiModalInputs]
7 |
8 |
9 | class MultiModalMixin:
10 |
11 | def prepare_multimodal_input(self, input_ids, input_multimodals, **kwargs) -> PreparedInputs:
12 | """Prepare multimodals inputs."""
13 | raise NotImplementedError('prepare input not implemented.')
14 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/multimodal/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .data_type import MultiModalData, MultiModalTensor
3 |
4 | __all__ = ['MultiModalData', 'MultiModalTensor']
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/multimodal/image_type.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from dataclasses import dataclass
3 | from typing import Any, ClassVar, Dict
4 |
5 | from PIL import Image
6 |
7 | from .data_type import MultiModalData
8 |
9 |
10 | @dataclass
11 | class ImageData(MultiModalData):
12 | data: Image
13 | loc: int
14 | meta: Dict[str, Any] = None
15 | type: ClassVar[str] = 'image'
16 |
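Editor's note, a hedged construction example (not part of the file): `loc` marks where the image is spliced into the token sequence and `meta` is free-form; `type` is a class-level tag, not an `__init__` field.

from PIL import Image

from lmdeploy.pytorch.multimodal.image_type import ImageData

item = ImageData(data=Image.new('RGB', (336, 336)), loc=5, meta={'source': 'example'})
assert item.type == 'image'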
--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # attention module is modified from:
3 | # https://github.com/vllm-project/vllm/blob/main/vllm/attention/
4 | from .activation import GeluAndMul, SiluAndMul # noqa: F401
5 | from .attention import Attention, FlashAttention # noqa: F401
6 | from .norm import LayerNorm, RMSNorm # noqa: F401
7 | from .rotary_embedding import ApplyRotaryEmb # noqa: F401
8 | from .rotary_embedding import RopeType # noqa: F401
9 | from .rotary_embedding import YarnParameters # noqa: F401
10 | from .rotary_embedding import build_rotary_embedding # noqa: F401
11 | from .rotary_embedding import build_rotary_params # noqa: F401
12 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/activation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch import Tensor, nn
3 |
4 | from ..backends import OpType, get_backend
5 |
6 |
7 | class SiluAndMul(nn.Module):
8 | """Silu and elementwise multiple."""
9 |
10 | def __init__(self, inplace: bool = True):
11 | super().__init__()
12 | backend = get_backend()
13 | builder = backend.get_layer_impl_builder(OpType.SiluAndMul)
14 | self.impl = builder.build(inplace)
15 |
16 | def forward(self, x: Tensor):
17 | """forward."""
18 | return self.impl.forward(x)
19 |
20 |
21 | class GeluAndMul(nn.Module):
22 | """Gelu and elementwise multiple."""
23 |
24 | def __init__(self, approximate: str = 'none'):
25 | super().__init__()
26 | backend = get_backend()
27 | builder = backend.get_layer_impl_builder(OpType.GeluAndMul)
28 | self.impl = builder.build(approximate)
29 |
30 | def forward(self, x: Tensor):
31 | """forward."""
32 | return self.impl.forward(x)
33 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/multinomial_sampling.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from ..backends import OpType, get_backend
5 |
6 |
7 | def multinomial_sampling(scores: torch.Tensor,
8 | seeds: torch.LongTensor,
9 | offsets: torch.LongTensor,
10 | indices: torch.Tensor = None):
11 | """Multinomial sampling op."""
12 | impl_builder = get_backend().get_layer_impl_builder(OpType.MultinomialSampling)
13 | return impl_builder.build().forward(scores, seeds, offsets, indices)
14 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/nn/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 |
5 | def div_up(a: int, b: int):
6 | """Div up."""
7 | return (a + b - 1) // b
8 |
9 |
10 | def get_distribute_size(feature_size: int, world_size: int, rank: int, align: int = 1):
11 | """Update feature size."""
12 | assert feature_size % align == 0
13 | aligned_size = feature_size // align
14 |     # try to give every rank the same number of features
15 |     updated_aligned_size = aligned_size // world_size
16 |     # if there is a remainder, give one extra aligned unit
17 |     # to the leading ranks
18 | if rank < aligned_size % world_size:
19 | updated_aligned_size += 1
20 | return updated_aligned_size * align
21 |
22 |
23 | def chunk_aligned(weight: torch.Tensor, chunks: int, dim: int, align: int):
24 | """Chunk aligned."""
25 | if align == 1:
26 | return weight.chunk(chunks, dim=dim)
27 | size = weight.size(dim)
28 | assert size % align == 0
29 | aligned_size = size // align
30 |
31 | # try best to evenly split chunks
32 | align_per_chunk = aligned_size // chunks
33 | remain = aligned_size % chunks
34 | sections = [align_per_chunk + int(c < remain) for c in range(chunks)]
35 | sections = [sec * align for sec in sections]
36 | return weight.split(sections, dim=dim)
37 |
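Editor's note: both helpers keep splits on `align` boundaries while spreading any remainder over the leading ranks. A small sketch with illustrative numbers:

import torch

from lmdeploy.pytorch.nn.utils import chunk_aligned, get_distribute_size

# 10 features, align=2 -> 5 aligned groups over 2 ranks: rank 0 gets 6, rank 1 gets 4.
assert [get_distribute_size(10, world_size=2, rank=r, align=2) for r in (0, 1)] == [6, 4]

weight = torch.arange(10.0).reshape(1, 10)
parts = chunk_aligned(weight, chunks=2, dim=1, align=2)
assert [p.size(1) for p in parts] == [6, 4]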
--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .scheduler import Scheduler
3 |
4 | __all__ = ['Scheduler']
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/block_manager/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ...config import CacheConfig
3 | from .base_block_manager import BaseBlockManager
4 | from .default_block_manager import DefaultBlockManager
5 | from .window_block_manager import WindowBlockManager
6 |
7 |
8 | def build_block_manager(cache_config: CacheConfig) -> BaseBlockManager:
9 | """Build block manager.
10 |
11 | Args:
12 |         cache_config (CacheConfig): the cache configuration.
13 | """
14 |
15 | num_cpu_blocks = cache_config.num_cpu_blocks
16 | num_gpu_blocks = cache_config.num_gpu_blocks
17 | window_size = cache_config.window_size
18 |
19 | if window_size < 0:
20 | return DefaultBlockManager(num_gpu_blocks, num_cpu_blocks)
21 | else:
22 | return WindowBlockManager(num_gpu_blocks, num_cpu_blocks, window_size=window_size)
23 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/eviction_helper/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .recompute_eviction_helper import RecomputeEvictionHelper
3 |
4 | __all__ = ['RecomputeEvictionHelper']
5 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import List
3 |
4 | from ...messages import SchedulerSequence
5 | from ..scheduler import Scheduler
6 |
7 | SeqList = List[SchedulerSequence]
8 |
9 |
10 | class BaseEvictionHelper:
11 | """Base eviction helper."""
12 |
13 | def __init__(self, scheduler: Scheduler):
14 | self.scheduler = scheduler
15 | self.block_manager = scheduler.block_manager
16 | self.block_trie = scheduler.block_trie
17 |
18 | def need_swap_in(self, seq: SchedulerSequence):
19 | """Sequence need swap in."""
20 | raise NotImplementedError('Not implemented.')
21 |
22 | def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int):
23 | """Evict seqs."""
24 | raise NotImplementedError('Not implemented.')
25 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .utils import Timer # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/pytorch/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # modify from: https://github.com/vllm-project/vllm
3 | import inspect
4 | from inspect import Parameter, Signature
 5 | from typing import Dict, Sequence, Tuple
6 |
7 | import psutil
8 |
9 |
10 | def get_gpu_memory(device_id: int = None) -> Tuple[int, int]:
11 | """Returns the free and total physical memory of the GPU in bytes."""
12 | import torch
13 | if device_id is None:
14 | device_id = torch.cuda.current_device()
15 | return torch.cuda.mem_get_info(device_id)
16 |
17 |
18 | def get_cpu_memory() -> int:
19 | """Returns the total CPU memory of the node in bytes."""
20 | return psutil.virtual_memory().total
21 |
22 |
23 | def bind_sigature(input_names: Sequence[str], args: Sequence, kwargs: Dict):
24 | """Bind args and kwargs to given input names."""
25 | kind = inspect._ParameterKind.POSITIONAL_OR_KEYWORD
26 |
27 | sig = Signature([Parameter(name, kind) for name in input_names])
28 | bind = sig.bind(*args, **kwargs)
29 | return bind.arguments
30 |
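Editor's note: `bind_sigature` (the upstream spelling) builds a throwaway `Signature` so positional and keyword call arguments land under their declared input names. A small sketch:

from lmdeploy.pytorch.utils import bind_sigature

bound = bind_sigature(['input_ids', 'attention_mask'],
                      args=([1, 2, 3],),
                      kwargs={'attention_mask': [1, 1, 1]})
assert bound == {'input_ids': [1, 2, 3], 'attention_mask': [1, 1, 1]}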
--------------------------------------------------------------------------------
/lmdeploy/pytorch/weight_loader/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/serve/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/serve/gradio/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/serve/gradio/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | import gradio as gr
4 |
5 | CSS = """
6 | #container {
7 | width: 95%;
8 | margin-left: auto;
9 | margin-right: auto;
10 | }
11 |
12 | #chatbot {
13 | height: 500px;
14 | overflow: auto;
15 | }
16 |
17 | .chat_wrap_space {
18 | margin-left: 0.5em
19 | }
20 | """
21 |
22 | THEME = gr.themes.Soft(primary_hue=gr.themes.colors.blue,
23 | secondary_hue=gr.themes.colors.sky,
24 | font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
25 |
26 | enable_btn = gr.update(interactive=True)
27 | disable_btn = gr.update(interactive=False)
28 |
--------------------------------------------------------------------------------
/lmdeploy/serve/openai/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/serve/openai/reasoning_parser/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
3 | from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser
4 | from .reasoning_parser import ReasoningParser, ReasoningParserManager
5 |
6 | __all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser']
7 |
--------------------------------------------------------------------------------
/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
3 | from .reasoning_parser import ReasoningParserManager
4 |
5 |
6 | @ReasoningParserManager.register_module(name='qwen-qwq')
7 | class QwenQwQReasoningParser(DeepSeekR1ReasoningParser):
8 | """Reasoning parser for Qwen QwQ model.
9 |
10 |     The Qwen QwQ model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
11 | content from the model output.
12 | """
13 |
--------------------------------------------------------------------------------
/lmdeploy/serve/openai/tool_parser/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .internlm2_parser import Internlm2ToolParser
3 | from .llama3_parser import Llama3JsonToolParser
4 | from .qwen2d5_parser import Qwen2d5ToolParser
5 | from .tool_parser import ToolParser, ToolParserManager
6 |
7 | __all__ = ['Internlm2ToolParser', 'Qwen2d5ToolParser', 'ToolParser', 'ToolParserManager', 'Llama3JsonToolParser']
8 |
--------------------------------------------------------------------------------
/lmdeploy/serve/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/serve/turbomind/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 |
4 | def bootstrap():
5 | import os
6 | import sys
7 |
8 | has_turbomind = False
9 | pwd = os.path.dirname(__file__)
10 | if os.path.exists(os.path.join(pwd, '..', 'lib')):
11 | has_turbomind = True
12 | if os.name == 'nt' and has_turbomind:
13 | if sys.version_info[:2] >= (3, 8):
14 | CUDA_PATH = os.getenv('CUDA_PATH')
15 | assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
16 | dll_path = os.path.join(CUDA_PATH, 'bin')
17 |             print(f'Adding DLL path {dll_path}. Note that the CUDA version '
18 |                   'should be >= 11.3 when compiled with CUDA 11.')
19 | os.add_dll_directory(dll_path)
20 |
21 |
22 | bootstrap()
23 |
24 | from .turbomind import TurboMind, update_parallel_config # noqa: E402
25 |
26 | __all__ = ['TurboMind', 'update_parallel_config']
27 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
3 | from .deepseek2 import DeepSeek2Model # noqa: F401
4 | from .deepseek_vl import DeepSeekVLModel # noqa: F401
5 | from .glm4 import Glm4Model # noqa: F401
6 | from .internlm2 import InternLM2Model # noqa: F401
7 | from .internvl import InternVLModel # noqa: F401
8 | from .llama import LlamaModel # noqa: F401
9 | from .llava import LlavaModel # noqa: F401
10 | from .minicpmv import MiniCPMVModel # noqa: F401
11 | from .mixtral import MixtralModel # noqa: F401
12 | from .molmo import MolmoModel # noqa: F401
13 | from .qwen import QwenModel # noqa: F401
14 | from .xcomposer2 import Xcomposer2Model # noqa: F401
15 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/minicpmv.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | import json
4 | import os.path as osp
5 |
6 | from .base import INPUT_MODELS
7 | from .llama import LlamaModel, LlamaReader
8 |
9 |
10 | class MiniCPMVReader(LlamaReader):
11 | """MiniCPMVReader for llama model."""
12 |
13 | attn_layer_prefix = 'llm.model.layers'
14 | attn_layer_patten = r'llm.model.layers.([0-9]+).'
15 | tok_embeddings_key = 'llm.model.embed_tokens.weight'
16 | norm_weight_key = 'llm.model.norm.weight'
17 | output_weight_key = 'llm.lm_head.weight'
18 |
19 |
20 | @INPUT_MODELS.register_module(name='minicpmv')
21 | class MiniCPMVModel(LlamaModel):
22 | """MiniCPMV model in hf format."""
23 | Reader = MiniCPMVReader
24 |
25 | def model_info(self):
26 | info = super().model_info()
27 | with open(osp.join(self.model_path, 'config.json')) as f:
28 | config = json.load(f)
29 | if str(config.get('version')) == '2.6':
30 | info['attn_bias'] = True
31 | return info
32 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/source_model/mixtral.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .base import INPUT_MODELS
4 | from .llama import LlamaModel, LlamaReader
5 |
6 |
7 | class MixtralReader(LlamaReader):
8 |
9 | def moe_ffn_expert(self, e=None, i=None, kind=None):
10 | if not kind:
11 | return self.filter(r'experts')
12 | result = []
13 | for x in ['w1', 'w2', 'w3']:
14 | name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}'
15 | tensor = self.params.get(name)
16 | tensor = self.transform(tensor, kind)
17 | result.append(tensor)
18 | return (*result, )
19 |
20 | def moe_ffn_gate(self, i):
21 | return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.weight')
22 |
23 |
24 | @INPUT_MODELS.register_module(name='mixtral')
25 | class MixtralModel(LlamaModel):
26 |
27 | Reader = MixtralReader
28 |
29 | def model_info(self):
30 | cfg = self.model_config
31 | info = super().model_info()
32 | info['expert_num'] = cfg['num_local_experts']
33 | info['expert_inter_size'] = cfg['intermediate_size']
34 | info['experts_per_token'] = cfg['num_experts_per_tok']
35 | info['norm_topk_prob'] = True
36 | info['inter_size'] = 0
37 | return info
38 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/target_model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .fp import TurbomindModel # noqa: F401
3 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/deploy/target_model/fp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | from .base import OUTPUT_MODELS, BaseOutputModel
4 |
5 |
6 | @OUTPUT_MODELS.register_module(name='tm')
7 | class TurbomindModel(BaseOutputModel):
8 | """Export to turbomind fp16 format."""
9 | pass
10 |
--------------------------------------------------------------------------------
/lmdeploy/turbomind/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import os
3 |
4 | from transformers.utils import ExplicitEnum
5 |
6 | from lmdeploy.utils import get_logger
7 |
8 | logger = get_logger('lmdeploy')
9 |
10 |
11 | class ModelSource(ExplicitEnum):
12 | """Turbomind model source."""
13 | WORKSPACE = 'workspace'
14 | HF_MODEL = 'hf_model'
15 |
16 |
17 | def get_model_source(pretrained_model_name_or_path: str, **kwargs) -> ModelSource:
18 | """Get model source."""
19 | triton_model_path = os.path.join(pretrained_model_name_or_path, 'triton_models')
20 | if os.path.exists(triton_model_path):
21 | return ModelSource.WORKSPACE
22 | return ModelSource.HF_MODEL
23 |
--------------------------------------------------------------------------------
/lmdeploy/version.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from typing import Tuple
3 |
4 | __version__ = '0.8.0'
5 | short_version = __version__
6 |
7 |
8 | def parse_version_info(version_str: str) -> Tuple:
9 | """Parse version from a string.
10 |
11 | Args:
12 |         version_str (str): A string that represents the version info.
13 | 
14 |     Returns:
15 |         tuple: A tuple of integers (and an 'rcN' string, if present) representing the version.
16 | """
17 | _version_info = []
18 | for x in version_str.split('.'):
19 | if x.isdigit():
20 | _version_info.append(int(x))
21 | elif x.find('rc') != -1:
22 | patch_version = x.split('rc')
23 | _version_info.append(int(patch_version[0]))
24 | _version_info.append(f'rc{patch_version[1]}')
25 | return tuple(_version_info)
26 |
27 |
28 | version_info = parse_version_info(__version__)
29 |
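Editor's note: the parser splits on dots and peels a trailing `rcN` tag into its own element, for example:

from lmdeploy.version import parse_version_info

assert parse_version_info('0.8.0') == (0, 8, 0)
assert parse_version_info('0.8.0rc1') == (0, 8, 0, 'rc1')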
--------------------------------------------------------------------------------
/lmdeploy/vl/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .utils import load_image
3 |
4 | __all__ = ['load_image']
5 |
--------------------------------------------------------------------------------
/lmdeploy/vl/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | IMAGE_DUMMY_TOKEN_INDEX = 0
 3 | IMAGE_TOKEN = '<IMAGE_TOKEN>'
4 |
--------------------------------------------------------------------------------
/lmdeploy/vl/model/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/lmdeploy/vl/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
--------------------------------------------------------------------------------
/requirements/build.txt:
--------------------------------------------------------------------------------
1 | pybind11<=2.13.1
2 | setuptools
3 |
--------------------------------------------------------------------------------
/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | markdown>=3.4.0
2 | myst-parser
3 | sphinx==8.0.2
4 | sphinx-book-theme
5 | sphinx-copybutton
6 | sphinx-tabs
7 | sphinxcontrib-mermaid
8 |
--------------------------------------------------------------------------------
/requirements/lite.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | datasets
3 | transformers_stream_generator
4 |
--------------------------------------------------------------------------------
/requirements/readthedocs.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | mmengine-lite
3 | pillow
4 | pydantic
5 | torch
6 | transformers
7 | urllib3<2.0.0
8 |
--------------------------------------------------------------------------------
/requirements/runtime_ascend.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.29.3
2 | dlinfer-ascend>=0.1.3
3 | einops
4 | fastapi
5 | fire
6 | mmengine-lite
7 | numpy<2.0.0
8 | openai
9 | outlines<0.1.0
10 | partial_json_parser
11 | peft<=0.11.1
12 | pillow
13 | protobuf
14 | pydantic>2.0.0
15 | safetensors
16 | sentencepiece
17 | shortuuid
18 | tiktoken
19 | torch<=2.4.0,>=2.3.1
20 | torch-npu==2.3.1
21 | torchvision<=0.19.0,>=0.18.1
22 | transformers
23 | uvicorn
24 |
--------------------------------------------------------------------------------
/requirements/runtime_camb.txt:
--------------------------------------------------------------------------------
1 | accelerate==1.2.0
2 | einops
3 | fastapi
4 | fire
5 | mmengine-lite
6 | numpy<2.0.0
7 | openai
8 | outlines<0.1.0
9 | partial_json_parser
10 | peft<=0.11.1
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | safetensors
15 | sentencepiece
16 | shortuuid
17 | tiktoken
18 | torch==2.4.0
19 | torchvision<=0.19.0,>=0.15.0
20 | transformers
21 | uvicorn
22 |
--------------------------------------------------------------------------------
/requirements/runtime_cuda.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.29.3
2 | einops
3 | fastapi
4 | fire
5 | mmengine-lite
6 | numpy<2.0.0
7 | openai
8 | outlines
9 | partial_json_parser
10 | peft<=0.14.0
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | pynvml
15 | ray
16 | safetensors
17 | sentencepiece
18 | shortuuid
19 | tiktoken
20 | torch<=2.6.0,>=2.0.0
21 | torchvision<=0.21.0,>=0.15.0
22 | transformers
23 | triton<=3.2.0,>=3.0.0; sys_platform == "linux"
24 | uvicorn
25 |
--------------------------------------------------------------------------------
/requirements/runtime_maca.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.32.1
2 | einops
3 | fastapi
4 | fire
5 | mmengine-lite
6 | numpy<2.0.0
7 | openai
8 | outlines<0.1.0
9 | partial_json_parser
10 | peft<=0.11.1
11 | pillow
12 | protobuf
13 | pydantic>2.0.0
14 | safetensors
15 | sentencepiece
16 | shortuuid
17 | tiktoken
18 | torch<=2.4.0,>=2.0.0
19 | torchvision<=0.19.0,>=0.15.0
20 | transformers
21 | triton>=2.1.0; sys_platform == "linux"
22 | uvicorn
23 |
--------------------------------------------------------------------------------
/requirements/serve.txt:
--------------------------------------------------------------------------------
1 | gradio
2 | protobuf
3 | tritonclient[grpc]
4 |
--------------------------------------------------------------------------------
/requirements/test.txt:
--------------------------------------------------------------------------------
1 | allure-pytest
2 | coverage
3 | nvidia-ml-py
4 | pytest
5 | pytest-assume
6 | pytest-cov
7 | pytest-order
8 | pytest-rerunfailures
9 | pytest-sugar
10 | pytest-xdist
11 | pyyaml
12 |
--------------------------------------------------------------------------------
/requirements_ascend.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_ascend.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 |
--------------------------------------------------------------------------------
/requirements_camb.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_camb.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 |
--------------------------------------------------------------------------------
/requirements_cuda.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_cuda.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 |
--------------------------------------------------------------------------------
/requirements_maca.txt:
--------------------------------------------------------------------------------
1 | -r requirements/build.txt
2 | -r requirements/runtime_maca.txt
3 | -r requirements/lite.txt
4 | -r requirements/serve.txt
5 |
--------------------------------------------------------------------------------
/resources/batch_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InternLM/lmdeploy/c63db2b8a0b57ef732fc5ed1e7c2e0eefdfb76de/resources/batch_memory.png
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | add_subdirectory(turbomind)
16 |
--------------------------------------------------------------------------------
/src/turbomind/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | add_subdirectory(utils)
16 | add_subdirectory(core)
17 | add_subdirectory(kernels)
18 | add_subdirectory(layers)
19 | add_subdirectory(comm)
20 | add_subdirectory(models)
21 | add_subdirectory(engine)
22 | if(BUILD_PYT)
23 | add_subdirectory(th_op)
24 | endif()
25 | if(BUILD_PY_FFI)
26 | add_subdirectory(python)
27 | endif()
28 | add_subdirectory(triton_backend)
29 |
--------------------------------------------------------------------------------
/src/turbomind/comm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | cmake_minimum_required(VERSION 3.8)
4 |
5 | add_library(host_comm STATIC host_comm.cc thread_comm.cc)
6 | target_link_libraries(host_comm PRIVATE core logger)
7 | set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
8 |
9 | add_library(device_comm STATIC device_comm.cc)
10 | target_link_libraries(device_comm PRIVATE core logger)
11 | set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
12 | set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
13 |
14 | if (BUILD_MULTI_GPU)
15 | add_subdirectory(cuda_ipc)
16 | target_link_libraries(device_comm INTERFACE cuda_ipc_comm)
17 |
18 | if (USE_NCCL)
19 | add_subdirectory(nccl)
20 | target_link_libraries(device_comm INTERFACE nccl_comm)
21 | endif ()
22 |
23 | if (BUILD_TEST)
24 | add_executable(test_comm test_comm.cu)
25 | target_link_libraries(test_comm PRIVATE device_comm host_comm core pthread nvtx_utils)
26 | target_compile_options(test_comm PRIVATE -O3 -march=native -mtune=native)
27 | endif ()
28 | endif ()
29 |
--------------------------------------------------------------------------------
/src/turbomind/comm/barrier.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
 5 | #include <condition_variable>
 6 | #include <cstdint>
 7 | #include <mutex>
8 |
9 | namespace turbomind::comm {
10 |
11 | class Barrier {
12 | public:
13 | explicit Barrier(int count): threshold_{count}, count_{count} {}
14 |
15 | void arrive_and_wait()
16 | {
17 | std::unique_lock lock{mutex_};
18 | auto phase = phase_;
19 | if (--count_ == 0) {
20 | ++phase_;
21 | count_ = threshold_;
22 | cv_.notify_all();
23 | }
24 | else {
25 | cv_.wait(lock, [this, phase] { return phase_ != phase; });
26 | }
27 | }
28 |
29 | private:
30 | std::mutex mutex_;
31 | std::condition_variable cv_;
32 |
33 | int threshold_;
34 | int count_;
35 |
36 | uint32_t phase_{};
37 | };
38 |
39 | } // namespace turbomind::comm
40 |
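Editor's note: for readers more at home in Python, here is a minimal sketch of the same phase/count scheme used by `Barrier` above (Python already ships `threading.Barrier`, so this is purely illustrative and not part of the repository):

import threading

class PhaseBarrier:
    """Reusable barrier mirroring the C++ phase/count logic (sketch only)."""

    def __init__(self, count: int):
        self._threshold = count
        self._count = count
        self._phase = 0
        self._cond = threading.Condition()

    def arrive_and_wait(self):
        with self._cond:
            phase = self._phase
            self._count -= 1
            if self._count == 0:
                self._phase += 1            # advance to the next phase
                self._count = self._threshold
                self._cond.notify_all()
            else:
                # sleep until some thread advances the phase
                self._cond.wait_for(lambda: self._phase != phase)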
--------------------------------------------------------------------------------
/src/turbomind/comm/cuda_ipc/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | cmake_minimum_required(VERSION 3.8)
4 |
5 | add_library(cuda_ipc_comm STATIC
6 | cuda_ipc_comm.cu
7 | allreduce.cu
8 | allgather.cu
9 | fused_allreduce.cu
10 | fused_allreduce_ex.cu)
11 |
12 | target_link_libraries(cuda_ipc_comm PRIVATE
13 | rms_norm
14 | host_comm
15 | core
16 | cuda_utils
17 | CUDA::cuda_driver
18 | logger)
19 |
20 | set_property(TARGET cuda_ipc_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
21 | set_property(TARGET cuda_ipc_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
22 |
--------------------------------------------------------------------------------
/src/turbomind/comm/cuda_ipc/group_sum.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include "src/turbomind/kernels/core/common.h"
6 |
7 | namespace turbomind::comm {
8 |
9 | namespace detail {
10 |
11 | template<class Syncgroup>
12 | __device__ float GroupSum(const float val, int warps, Syncgroup syncgroup)
13 | {
14 | const int warp_id = threadIdx.x / WARP_SIZE;
15 | const int lane_id = threadIdx.x % WARP_SIZE;
16 | float sum = val;
17 | PRAGMA_UNROLL
18 | for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
19 | sum += __shfl_xor_sync((uint32_t)-1, sum, mask);
20 | }
21 | __shared__ float smem[32];
22 | // syncgroup();
23 | if (lane_id == 0) {
24 | smem[warp_id] = sum;
25 | }
26 | syncgroup();
27 | for (int i = 1; i < warps; ++i) {
28 | sum += smem[warp_id / warps * warps + i];
29 | }
30 | // sum = {};
31 | // for (int i = 0; i < warps; ++i) {
32 | // sum += smem[warp_id / warps * warps + i];
33 | // }
34 | return sum;
35 | }
36 |
37 | } // namespace detail
38 |
39 | } // namespace turbomind::comm
40 |
--------------------------------------------------------------------------------
/src/turbomind/comm/device_comm.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "src/turbomind/comm/device_comm.h"
4 | #include "src/turbomind/utils/cuda_utils.h"
5 |
6 | namespace turbomind::comm {
7 |
8 | DeviceCommImpl::~DeviceCommImpl() = default;
9 |
10 | DeviceComm CreateNcclCommunicator(int n_ranks, int rank, HostComm h_comm);
11 |
12 | DeviceComm CreateCudaIpcCommunicator(int n_ranks, int rank, HostComm h_comm);
13 |
14 | DeviceComm CreateDeviceCommunicator(const std::string& backend, int n_ranks, int rank, HostComm h_comm)
15 | {
16 | #if BUILD_MULTI_GPU && USE_NCCL
17 | if (backend == "nccl") {
18 | return CreateNcclCommunicator(n_ranks, rank, h_comm);
19 | }
20 | #endif
21 |
22 | #if BUILD_MULTI_GPU
23 | if (backend == "native" || backend == "cuda-ipc") {
24 | return CreateCudaIpcCommunicator(n_ranks, rank, h_comm);
25 | }
26 | #endif
27 |
28 | TM_CHECK(0) << "Unknown communication backend: " << backend;
29 | return {};
30 | }
31 |
32 | } // namespace turbomind::comm
33 |
--------------------------------------------------------------------------------
/src/turbomind/comm/host_comm.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "src/turbomind/comm/host_comm.h"
4 |
5 | namespace turbomind::comm {
6 |
7 | HostCommImpl::~HostCommImpl() = default;
8 |
 9 | std::unique_ptr<HostGroupId> CreateThreadGroupId();
10 | 
11 | std::unique_ptr<HostGroupId> CreateHostGroupId(const std::string& backend)
12 | {
13 | return CreateThreadGroupId();
14 | }
15 |
16 | } // namespace turbomind::comm
17 |
--------------------------------------------------------------------------------
/src/turbomind/comm/nccl/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | cmake_minimum_required(VERSION 3.8)
4 |
5 | add_library(nccl_comm STATIC nccl.cu)
6 | target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger)
7 | target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS})
8 |
9 | set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
10 | set_property(TARGET nccl_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
11 |
--------------------------------------------------------------------------------
/src/turbomind/core/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | cmake_minimum_required(VERSION 3.8)
4 |
5 | add_library(core STATIC
6 | check.cc
7 | allocator.cc
8 | stream.cc
9 | context.cc
10 | buffer.cc
11 | layout.cc
12 | tensor.cc
13 | tensor.cu
14 | module.cc)
15 |
16 | target_link_libraries(core PUBLIC cuda_utils CUDA::cudart CUDA::cuda_driver)
17 |
18 | set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON)
19 | set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
20 |
21 | target_compile_options(core PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-v>)
22 |
23 | if (BUILD_TEST)
24 | add_executable(test_core test_core.cc)
25 | target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain)
26 | endif ()
27 |
--------------------------------------------------------------------------------
/src/turbomind/core/common.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
 4 | #include <cstddef>
 5 | #include <memory>
 6 | #include <vector>
7 |
8 | /// TODO: remove this dependency
9 | #include "src/turbomind/utils/cuda_utils.h"
10 |
11 | namespace turbomind::core {
12 |
13 | class Allocator;
14 | class Buffer;
15 | class Stream;
16 | class Event;
17 | class Context;
18 |
19 | using std::shared_ptr;
20 | using std::vector;
21 |
22 | using ssize_t = std::ptrdiff_t;
23 |
24 | } // namespace turbomind::core
25 |
--------------------------------------------------------------------------------
/src/turbomind/core/context.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "src/turbomind/core/allocator.h"
4 | #include "src/turbomind/core/common.h"
5 | #include "src/turbomind/core/stream.h"
6 |
7 | namespace turbomind::core {
8 |
9 | class Context {
10 | public:
11 | static Stream& stream();
12 | static Allocator& host_alloc();
13 | static Allocator& device_alloc();
14 | static Allocator& pinned_alloc();
15 | static Allocator& alloc(Device device);
16 |
17 | private:
18 | friend class ContextGuard;
19 | static void push(const Stream& stream);
20 | static void push(const Allocator& alloc);
21 | static void pop();
22 | };
23 |
24 | class ContextGuard {
25 | public:
26 |     template<class... Args>
27 | explicit ContextGuard(Args&&... args): n_{}
28 | {
29 | (Context::push((Args &&) args), ...);
30 | n_ = sizeof...(Args);
31 | }
32 | ~ContextGuard()
33 | {
34 | for (int i = 0; i < n_; ++i) {
35 | Context::pop();
36 | }
37 | }
38 |
39 | private:
40 | int n_;
41 | };
42 |
43 | } // namespace turbomind::core
44 |
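The guard above relies on a C++17 fold expression: every constructor argument is pushed onto the current context, and the destructor pops exactly that many entries. A minimal, self-contained sketch of the same pattern (DemoContext and DemoGuard are illustrative stand-ins, not part of TurboMind):

#include <cstdio>
#include <string>
#include <vector>

struct DemoContext {
    static std::vector<std::string>& stack()
    {
        thread_local std::vector<std::string> s;
        return s;
    }
    static void push(std::string v) { stack().push_back(std::move(v)); }
    static void pop() { stack().pop_back(); }
};

class DemoGuard {
public:
    template<class... Args>
    explicit DemoGuard(Args&&... args): n_{}
    {
        (DemoContext::push((Args&&)args), ...);  // push every argument, in order
        n_ = sizeof...(Args);
    }
    ~DemoGuard()
    {
        for (int i = 0; i < n_; ++i) {
            DemoContext::pop();  // restore the previous state
        }
    }

private:
    int n_;
};

int main()
{
    DemoContext::push("default-stream");
    {
        DemoGuard guard{std::string("copy-stream"), std::string("pinned-alloc")};
        std::printf("inside guard, depth = %zu\n", DemoContext::stack().size());  // 3
    }
    std::printf("after guard, depth = %zu\n", DemoContext::stack().size());  // 1
}
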
--------------------------------------------------------------------------------
/src/turbomind/core/core.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "src/turbomind/core/allocator.h"
4 | #include "src/turbomind/core/buffer.h"
5 | #include "src/turbomind/core/check.h"
6 | #include "src/turbomind/core/context.h"
7 | #include "src/turbomind/core/data_type.h"
8 | #include "src/turbomind/core/layout.h"
9 | #include "src/turbomind/core/stream.h"
10 | #include "src/turbomind/core/tensor.h"
11 |
12 | namespace turbomind {
13 |
14 | using core::ssize_t;
15 | using core::Buffer;
16 | using core::Buffer_;
17 | using core::Tensor;
18 | using core::Tensor_;
19 | using core::TensorMap;
20 | using core::Ref;
21 | using core::Layout;
22 | using core::Allocator;
23 | using core::Stream;
24 | using core::Event;
25 |
26 | } // namespace turbomind
27 |
--------------------------------------------------------------------------------
/src/turbomind/core/module.h:
--------------------------------------------------------------------------------
1 |
2 | #include "src/turbomind/core/tensor.h"
3 |
4 | namespace turbomind::core {
5 |
6 | class Module {
7 | public:
8 | virtual ~Module();
9 |
10 | Module();
11 |
12 | Module(const Module&) = delete;
13 | Module& operator=(const Module&) = delete;
14 |
15 | Module(Module&&) noexcept = delete;
16 | Module& operator=(Module&&) noexcept = delete;
17 |
18 | void register_module(std::string name, Module& module, std::optional<int> index = {});
19 | void register_parameter(std::string name, Tensor& param);
20 |
21 | void remove_module(Module& module);
22 | void remove_parameter(Tensor& param);
23 |
24 | TensorMap get_parameters() const;
25 |
26 | private:
27 | void get_parameters_impl(std::string prefix, TensorMap& m) const;
28 |
29 | protected:
30 | Module* parent_;
31 |
32 | std::vector<std::pair<std::string, Module*>> modules_;
33 | std::vector<std::pair<std::string, Tensor*>> params_;
34 | };
35 |
36 | } // namespace turbomind::core
37 |
--------------------------------------------------------------------------------
/src/turbomind/core/stream.cc:
--------------------------------------------------------------------------------
1 |
2 | #include "src/turbomind/core/stream.h"
3 | #include <memory>
4 |
5 | namespace turbomind::core {
6 |
7 | Stream Stream::create(int priority)
8 | {
9 | Stream stream;
10 | stream.impl_ = std::make_shared<StreamImpl>(priority);
11 | return stream;
12 | }
13 |
14 | void StreamImpl::Wait(const Event& event)
15 | {
16 | check_cuda_error(cudaStreamWaitEvent(stream_, event));
17 | }
18 |
19 | } // namespace turbomind::core
20 |
--------------------------------------------------------------------------------
/src/turbomind/engine/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | cmake_minimum_required(VERSION 3.8)
4 |
5 | add_library(engine STATIC gateway.cc request_queue.cc model_request.cc)
6 | target_link_libraries(engine PRIVATE core)
7 | set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON)
8 | set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
9 |
--------------------------------------------------------------------------------
/src/turbomind/engine/request_queue.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "src/turbomind/engine/request_queue.h"
4 | #include "src/turbomind/engine/gateway.h"
5 |
6 | #include "src/turbomind/engine/request.h"
7 |
8 | namespace turbomind {
9 |
10 | } // namespace turbomind
11 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/activation_kernels.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #pragma once
18 |
19 | #include <cuda_runtime.h>
20 |
21 | #include "src/turbomind/core/core.h"
22 |
23 | namespace turbomind {
24 |
25 | // clang-format off
26 | template<class T> struct GeluActivation;
27 | template<class T> struct ReluActivation;
28 | template<class T> struct SiluActivation;
29 | template<class T> struct IdentityActivation;
30 | // clang-format on
31 |
32 | template<template<class> class Activation>
33 | void invokeGenericActivation_v3(Ref<Tensor> inter_, const Tensor& gate, cudaStream_t stream);
34 |
35 | } // namespace turbomind
36 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/arch.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind::arch {
6 |
7 | // tags for dispatching & conditional codegen
8 |
9 | template<int Begin, int End = -1>
10 | struct Arch {
11 | static constexpr bool is_compatible(int arch)
12 | {
13 | return Begin <= arch && (End == -1 || arch < End);
14 | }
15 | };
16 |
17 | struct Sm70: Arch<700, 750> {
18 | };
19 |
20 | struct Sm75: Arch<750, 800> {
21 | };
22 |
23 | struct Sm80: Arch<800> {
24 | };
25 |
26 | } // namespace turbomind::arch
27 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/attention.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include "attention_params.h"
6 |
7 | namespace turbomind {
8 |
9 | constexpr int MAX_CTA_S = 64;
10 |
11 | template<class T>
12 | void dispatchAttention(const AttentionParams<T>& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm70_128_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm75_128_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | // ! register spill
14 | // template void invokeAttention::Kernel>(
15 | // const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | // ! register spill
14 | // template void invokeAttention::Kernel>(
15 | // const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_128_bf16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_128_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_192.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_config.h"
4 | #include "../attention_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template void invokeAttention::Kernel>(
11 | const AttentionParams& params);
12 |
13 | template void invokeAttention::Kernel>(
14 | const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | template bool invokeDecoding>(const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_params.h"
4 | #include "../decoding_config.h"
5 | #include "../decoding_template.h"
6 |
7 | namespace turbomind {
8 |
9 | using namespace attention;
10 |
11 | template bool invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool invokeDecoding>(const AttentionParams& params);
14 |
15 | template bool invokeDecoding>(const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_params.h"
4 | #include "../decoding_config.h"
5 | #include "../decoding_template.h"
6 |
7 | namespace turbomind {
8 |
9 | using namespace attention;
10 |
11 | template bool invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool invokeDecoding>(const AttentionParams& params);
14 |
15 | template bool invokeDecoding>(const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | template bool invokeDecoding>(const AttentionParams& params);
15 |
16 | } // namespace turbomind
17 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_params.h"
4 | #include "../decoding_config.h"
5 | #include "../decoding_template.h"
6 |
7 | namespace turbomind {
8 |
9 | using namespace attention;
10 |
11 | template bool invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool invokeDecoding>(const AttentionParams& params);
14 |
15 | template bool invokeDecoding>(const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../attention_params.h"
4 | #include "../decoding_config.h"
5 | #include "../decoding_template.h"
6 |
7 | namespace turbomind {
8 |
9 | using namespace attention;
10 |
11 | template bool invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool invokeDecoding>(const AttentionParams& params);
14 |
15 | template bool invokeDecoding>(const AttentionParams& params);
16 |
17 | } // namespace turbomind
18 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool
11 | invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool
14 | invokeDecoding>(const AttentionParams& params);
15 |
16 | template bool
17 | invokeDecoding>(const AttentionParams& params);
18 |
19 | template bool
20 | invokeDecoding>(const AttentionParams& params);
21 |
22 | } // namespace turbomind
23 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | template bool invokeDecoding>(const AttentionParams& params);
15 |
16 | template bool invokeDecoding>(const AttentionParams& params);
17 |
18 | } // namespace turbomind
19 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool
11 | invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool invokeDecoding>(const AttentionParams& params);
14 |
15 | template bool
16 | invokeDecoding>(const AttentionParams& params);
17 |
18 | template bool invokeDecoding>(const AttentionParams& params);
19 |
20 | } // namespace turbomind
21 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool
11 | invokeDecoding>(const AttentionParams& params);
12 |
13 | template bool
14 | invokeDecoding>(const AttentionParams& params);
15 |
16 | template bool
17 | invokeDecoding>(const AttentionParams& params);
18 |
19 | template bool
20 | invokeDecoding>(const AttentionParams& params);
21 |
22 | } // namespace turbomind
23 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams& params);
11 |
12 | template bool invokeDecoding>(const AttentionParams& params);
13 |
14 | template bool invokeDecoding>(const AttentionParams& params);
15 |
16 | template bool invokeDecoding>(const AttentionParams& params);
17 |
18 | } // namespace turbomind
19 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "../decoding_config.h"
4 | #include "../decoding_template.h"
5 |
6 | namespace turbomind {
7 |
8 | using namespace attention;
9 |
10 | template bool invokeDecoding>(const AttentionParams&);
11 |
12 | template bool invokeDecoding>(const AttentionParams&);
13 |
14 | } // namespace turbomind
15 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/decoding.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include "attention_params.h"
6 |
7 | namespace turbomind {
8 |
9 | template<class T>
10 | void dispatchDecoding(const AttentionParams<T>& params);
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/impl.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind {
6 |
7 | namespace attention {
8 |
9 | struct MMA_16816 {
10 | };
11 |
12 | struct MMA_81616 {
13 | }; // MMA_16816 transposed
14 |
15 | struct MMA_1688 {
16 | };
17 |
18 | struct MMA_884 {
19 | };
20 |
21 | struct MMA_SIMT {
22 | };
23 |
24 | template
35 | struct Impl {
36 | };
37 |
38 | } // namespace attention
39 |
40 | } // namespace turbomind
41 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/mainloop.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind::attention {
6 |
7 | template
8 | struct Mainloop {
9 | };
10 |
11 | } // namespace turbomind::attention
12 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/reduce.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include "cta_map.h"
6 | #include "src/turbomind/kernels/core/array_ops.h"
7 | #include "src/turbomind/kernels/core/thread_map.h"
8 | #include
9 | #include
10 | #include
11 |
12 | namespace turbomind::attention {
13 |
14 | template<class T>
15 | void invokeReduce(T* out,
16 | float* partial_M,
17 | float* partial_L,
18 | float* partial_O,
19 | const int* split_cnt,
20 | int partial_len,
21 | int max_split_cnt,
22 | int query_num,
23 | int head_num,
24 | float exp_scale,
25 | cudaStream_t stream);
26 |
27 | } // namespace turbomind::attention
28 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/attention/utils.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind {
6 |
7 | int GetSplitCount(int max_split_cnt,
8 | int grid_size,
9 | int max_active_ctas,
10 | int sm_count,
11 | int max_wave_cnt,
12 | float alpha = 1,
13 | float beta = 1e-3);
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/core/data_type.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include <cuda_fp16.h>
6 | #if ENABLE_BF16
7 | #include <cuda_bf16.h>
8 | #endif
9 |
10 | #include <cstdint>
11 |
12 | #include "src/turbomind/core/data_type.h"
13 |
14 | namespace turbomind {
15 |
16 | namespace detail {
17 |
18 | struct __uint4_t {
19 | uint32_t x;
20 | };
21 |
22 | } // namespace detail
23 |
24 | template<class T>
25 | struct get_pointer_type_t {
26 | using type = T*;
27 | };
28 |
29 | template<class T>
30 | using get_pointer_type = typename get_pointer_type_t<T>::type;
31 |
32 | } // namespace turbomind
33 |
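A tiny usage sketch for the trait above (restated locally so it compiles on its own): the primary template simply resolves get_pointer_type<T> to T*, presumably so that special element types such as detail::__uint4_t can be mapped to a different pointer representation via specialization.

#include <type_traits>

// Standalone re-statement of the trait, for illustration only.
template<class T>
struct get_pointer_type_t {
    using type = T*;
};

template<class T>
using get_pointer_type = typename get_pointer_type_t<T>::type;

// The primary template yields a plain raw pointer.
static_assert(std::is_same_v<get_pointer_type<float>, float*>);
static_assert(std::is_same_v<get_pointer_type<const int>, const int*>);

int main() {}
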
--------------------------------------------------------------------------------
/src/turbomind/kernels/core/meta.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind {
6 |
7 | template<class T>
8 | struct basic_type {
9 | using type = T;
10 | };
11 |
12 | template<class T>
13 | constexpr basic_type<T> type_c{};
14 |
15 | template<auto v>
16 | struct constant {
17 | using type = constant;
18 | using value_type = decltype(v);
19 |
20 | static constexpr value_type value = v;
21 |
22 | constexpr value_type operator()() const noexcept
23 | {
24 | return v;
25 | }
26 | constexpr operator value_type() const noexcept
27 | {
28 | return v;
29 | }
30 | };
31 |
32 | template<auto u, auto v>
33 | struct pair {
34 | };
35 |
36 | template<auto u, auto v>
37 | constexpr auto first(pair<u, v>)
38 | {
39 | return u;
40 | }
41 |
42 | template<auto u, auto v>
43 | constexpr auto second(pair<u, v>)
44 | {
45 | return v;
46 | }
47 |
48 | template
49 | struct triplet {
50 | };
51 |
52 | } // namespace turbomind
53 |
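A small, self-contained example of how tags like constant<v> are typically used (the call sites below are hypothetical, not taken from the library): the value is carried in the type, so it can propagate through templates and still behave like a plain value at the use site.

#include <cstdio>
#include <type_traits>

template<auto v>
struct constant {
    using value_type = decltype(v);
    static constexpr value_type value = v;
    constexpr operator value_type() const noexcept { return v; }
};

// Accepts either a runtime integer or a constant<v> tag.
template<class N>
constexpr int twice(N n)
{
    return 2 * n;  // constant<v> converts implicitly to its value
}

int main()
{
    static_assert(std::is_same_v<constant<21>::value_type, int>);
    static_assert(twice(constant<21>{}) == 42);  // resolved at compile time
    std::printf("%d\n", twice(21));              // same result at run time
}
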
--------------------------------------------------------------------------------
/src/turbomind/kernels/core/pipe_iter.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind {
6 |
7 | template<int Stages, int Step = 1>
8 | struct PipeIter {
9 | static constexpr int kMaxStep = Stages * Step;
10 |
11 | int r = 0;
12 | int w = kMaxStep - Step;
13 |
14 | __inline__ __device__ PipeIter& operator++()
15 | {
16 | w = r;
17 | r += Step;
18 | if (r == kMaxStep) {
19 | r -= kMaxStep;
20 | }
21 | return *this;
22 | }
23 | };
24 |
25 | } // namespace turbomind
26 |
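A host-side illustration of the slot rotation implemented by PipeIter (the device qualifiers are dropped and the struct is re-stated locally, so this is a sketch rather than the real type): with 3 stages, the read slot r cycles 0, 1, 2 while the write slot w trails it by one step around the ring.

#include <cstdio>

template<int Stages, int Step = 1>
struct PipeIterSketch {
    static constexpr int kMaxStep = Stages * Step;

    int r = 0;                // slot used for the next read/prefetch
    int w = kMaxStep - Step;  // slot written on the previous iteration

    PipeIterSketch& operator++()
    {
        w = r;
        r += Step;
        if (r == kMaxStep) {
            r -= kMaxStep;
        }
        return *this;
    }
};

int main()
{
    PipeIterSketch<3> it;  // 3-stage pipeline, step 1
    for (int i = 0; i < 6; ++i, ++it) {
        std::printf("iter %d: read slot %d, write slot %d\n", i, it.r, it.w);
    }
    // read slots cycle 0,1,2,0,1,2 and the write slot trails by one step
}
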
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8)
2 |
3 | add_subdirectory(fused_multi_head_attention)
4 |
5 | add_library(flash_attention STATIC flash_attention.cu)
6 | set_property(TARGET flash_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
7 | set_property(TARGET flash_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
8 | target_link_libraries(flash_attention PRIVATE llama_fmha)
9 |
10 | if (NOT MSVC)
11 | add_subdirectory(flash_attention2)
12 | target_link_libraries(flash_attention PRIVATE flash_attention2)
13 | endif()
14 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt:
--------------------------------------------------------------------------------
1 |
2 | cmake_minimum_required(VERSION 3.8)
3 | project(flash_attention2)
4 |
5 | add_library(${PROJECT_NAME} STATIC
6 | flash_api.cpp
7 | # flash_fwd_hdim32_fp16_sm80.cu
8 | # flash_fwd_hdim64_fp16_sm80.cu
9 | flash_fwd_hdim128_fp16_sm80.cu
10 | flash_fwd_hdim128_bf16_sm80.cu
11 | flash_fwd_hdim256_bf16_sm80.cu
12 | flash_fwd_hdim256_fp16_sm80.cu
13 | )
14 | target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
15 | target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
16 |
17 | set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON)
18 | set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
19 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/README.md:
--------------------------------------------------------------------------------
1 | # Flash Attention 2
2 |
3 | This is a Flash Attention 2 implementation adapted from https://github.com/Dao-AILab/flash-attention, with the following changes:
4 |
5 | - dropout removed
6 | - backward pass removed
7 | - based on cutlass 3.1.0
8 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | #ifdef ENABLE_BF16
8 | template<>
9 | void run_mha_fwd_<cutlass::bfloat16_t, 128>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 | run_mha_fwd_hdim128<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_fp16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | template<>
8 | void run_mha_fwd_<cutlass::half_t, 128>(Flash_fwd_params& params, cudaStream_t stream)
9 | {
10 | run_mha_fwd_hdim128<cutlass::half_t>(params, stream);
11 | }
12 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | #ifdef ENABLE_BF16
8 | template<>
9 | void run_mha_fwd_<cutlass::bfloat16_t, 256>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 | run_mha_fwd_hdim256<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_fp16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | template<>
8 | void run_mha_fwd_<cutlass::half_t, 256>(Flash_fwd_params& params, cudaStream_t stream)
9 | {
10 | run_mha_fwd_hdim256<cutlass::half_t>(params, stream);
11 | }
12 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | #ifdef ENABLE_BF16
8 | template<>
9 | void run_mha_fwd_<cutlass::bfloat16_t, 32>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 | run_mha_fwd_hdim32<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_fp16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | template<>
8 | void run_mha_fwd_<cutlass::half_t, 32>(Flash_fwd_params& params, cudaStream_t stream)
9 | {
10 | run_mha_fwd_hdim32<cutlass::half_t>(params, stream);
11 | }
12 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | #ifdef ENABLE_BF16
8 | template<>
9 | void run_mha_fwd_<cutlass::bfloat16_t, 64>(Flash_fwd_params& params, cudaStream_t stream)
10 | {
11 | run_mha_fwd_hdim64<cutlass::bfloat16_t>(params, stream);
12 | }
13 | #endif
14 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_fp16_sm80.cu:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023, Tri Dao.
2 |
3 | // Splitting the different head dimensions to different files to speed up compilation.
4 |
5 | #include "flash_fwd_launch_template.h"
6 |
7 | template<>
8 | void run_mha_fwd_<cutlass::half_t, 64>(Flash_fwd_params& params, cudaStream_t stream)
9 | {
10 | run_mha_fwd_hdim64<cutlass::half_t>(params, stream);
11 | }
12 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/flash_attention/fused_multi_head_attention/CMakeLists.txt:
--------------------------------------------------------------------------------
1 |
2 | cmake_minimum_required(VERSION 3.8)
3 |
4 | add_library(llama_fmha STATIC llama_flash_attention_kernel.cu)
5 | target_include_directories(llama_fmha PRIVATE ${CUTLASS_DIR}/examples)
6 | target_link_libraries(llama_fmha PRIVATE nvidia::cutlass::cutlass)
7 | set_property(TARGET llama_fmha PROPERTY POSITION_INDEPENDENT_CODE ON)
8 | set_property(TARGET llama_fmha PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
9 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/arch.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind::gemm {
6 |
7 | // tags for dispatching & conditional codegen
8 |
9 | template<int Begin, int End = -1>
10 | struct Arch {
11 | static constexpr bool is_compatible(int arch)
12 | {
13 | return Begin <= arch && (End == -1 || arch < End);
14 | }
15 | };
16 |
17 | struct Sm70: Arch<700, 750> {
18 | static constexpr int value = 700;
19 | };
20 |
21 | struct Sm75: Arch<750, 800> {
22 | static constexpr int value = 750;
23 | };
24 |
25 | struct Sm80: Arch<800, 900> {
26 | static constexpr int value = 800;
27 | };
28 |
29 | struct Sm90: Arch<900> {
30 | static constexpr int value = 900;
31 | };
32 |
33 | inline bool is_arch_compatible(int karch, int darch)
34 | {
35 | switch (karch) {
36 | case 700:
37 | return Sm70::is_compatible(darch);
38 | case 750:
39 | return Sm75::is_compatible(darch);
40 | case 800:
41 | return Sm80::is_compatible(darch);
42 | case 900:
43 | return Sm90::is_compatible(darch);
44 | default:
45 | return false;
46 | }
47 | }
48 |
49 | } // namespace turbomind::gemm
50 |
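A self-contained sketch of how these tags answer compatibility questions (values chosen for illustration): a kernel built for the Sm80 tag runs on any device with compute capability in [800, 900), so sm_86 qualifies but sm_90 does not.

#include <cstdio>

template<int Begin, int End = -1>
struct Arch {
    static constexpr bool is_compatible(int arch)
    {
        return Begin <= arch && (End == -1 || arch < End);
    }
};

struct Sm80: Arch<800, 900> {
    static constexpr int value = 800;
};

struct Sm90: Arch<900> {
    static constexpr int value = 900;
};

int main()
{
    std::printf("Sm80 kernel on sm_86: %d\n", Sm80::is_compatible(860));  // 1
    std::printf("Sm80 kernel on sm_90: %d\n", Sm80::is_compatible(900));  // 0
    std::printf("Sm90 kernel on sm_90: %d\n", Sm90::is_compatible(900));  // 1
}
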
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/dispatch_cache.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "src/turbomind/kernels/gemm/desc.h"
4 |
5 | #include
6 | #include
7 | #include
8 |
9 | namespace turbomind::gemm {
10 |
11 | class DispatchCache {
12 | public:
13 | DispatchCache(std::vector<Kernel*> kernels);
14 |
15 | ~DispatchCache();
16 |
17 | std::optional<LaunchSpec> LowerBound(const GemmDesc& desc) const;
18 |
19 | std::optional<LaunchSpec> Find(const GemmDesc& desc) const;
20 |
21 | bool Insert(const GemmDesc& desc, const LaunchSpec& spec);
22 |
23 | int Export(std::ostream& os) const;
24 |
25 | int Import(std::istream& is);
26 |
27 | private:
28 | struct Impl;
29 | std::unique_ptr<Impl> impl_;
30 | };
31 |
32 | } // namespace turbomind::gemm
33 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/gpu_metric.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include "src/turbomind/kernels/gemm/types.h"
6 |
7 | namespace turbomind::gemm {
8 |
9 | // bytes / second
10 | float MeasureL2CacheThroughput();
11 |
12 | // fused multiply-add / second
13 | float MeasureMmaThroughput(int problem_size = 16384);
14 |
15 | } // namespace turbomind::gemm
16 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/predicate.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | #include
6 | #include
7 |
8 | namespace turbomind::gemm {
9 |
10 | template
11 | struct Predicate {
12 |
13 | static constexpr int kSizeC = AlignedC ? 1 : C;
14 |
15 | static_assert(S * kSizeC <= 32);
16 |
17 | static constexpr bool is_active = true;
18 |
19 | uint32_t pred_{};
20 |
21 | __device__ int operator()(int s, int c) const
22 | {
23 | return (pred_ & (1 << (s * kSizeC + c))) != 0;
24 | }
25 |
26 | __device__ void set(int s, int c)
27 | {
28 | pred_ |= (1 << (s * kSizeC + c));
29 | }
30 |
31 | __device__ void clear()
32 | {
33 | pred_ = 0;
34 | }
35 | };
36 |
37 | template
38 | struct Predicate {
39 |
40 | static constexpr bool is_active = false;
41 |
42 | __device__ constexpr std::integral_constant operator()(int, int) const
43 | {
44 | return {};
45 | }
46 |
47 | __device__ void set(int, int) {}
48 |
49 | __device__ void clear()
50 | {
51 | // pred_ = 0;
52 | }
53 | };
54 |
55 | } // namespace turbomind::gemm
56 |
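The predicate packs one guard bit per (s, c) coordinate into a single 32-bit word at bit index s * kSizeC + c, which is why S * kSizeC must stay within 32. A host-side analogue of that bit layout (simplified; the original template parameter list is not reproduced here):

#include <cstdint>
#include <cstdio>

template<int S, int C>
struct PredicateSketch {
    static_assert(S * C <= 32, "all guard bits must fit in one 32-bit word");

    uint32_t pred_{};

    int operator()(int s, int c) const
    {
        return (pred_ & (1u << (s * C + c))) != 0;
    }
    void set(int s, int c)
    {
        pred_ |= (1u << (s * C + c));
    }
    void clear()
    {
        pred_ = 0;
    }
};

int main()
{
    PredicateSketch<4, 8> p;  // 4 x 8 guard bits
    p.set(1, 3);              // mark coordinate (s=1, c=3) as in-bounds
    std::printf("(1,3) -> %d, (2,0) -> %d\n", p(1, 3), p(2, 0));  // 1, 0
}
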
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/simt.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #pragma once
4 |
5 | namespace turbomind::gemm::simt {
6 |
7 | // constexpr int OP_M = 2;
8 | // constexpr int OP_N = 16;
9 | // constexpr int OP_K = 4;
10 |
11 | // constexpr int OP_M = 4;
12 | // constexpr int OP_N = 8;
13 | // constexpr int OP_K = 8;
14 |
15 | constexpr int OP_M = 1;
16 | constexpr int OP_N = 32;
17 | constexpr int OP_K = 8;
18 |
19 | } // namespace turbomind::gemm::simt
20 |
--------------------------------------------------------------------------------
/src/turbomind/kernels/gemm/test/quantization.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) OpenMMLab. All rights reserved.
2 |
3 | #include "src/turbomind/kernels/gemm/types.h"
4 | #include
5 | #include
6 |
7 | #pragma once
8 |
9 | namespace turbomind::gemm {
10 |
11 | template<class T>
12 | void Quantize(const thrust::universal_vector<T>& x,
13 | int m,
14 | int k,
15 | Order order,
16 | int group_size,
17 | thrust::universal_vector