├── .clang-format ├── .github ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ └── 3-documentation.yml ├── md-link-config.json ├── pull_request_template.md ├── release.yml ├── scripts │ ├── action_tools.py │ ├── check_lmdeploy.py │ ├── doc_link_checker.py │ ├── eval_base_config.py │ ├── eval_chat_config.py │ ├── eval_regression_base_models.py │ ├── eval_regression_chat_models.py │ ├── eval_stable_object_config.py │ └── eval_stable_subject_config.py └── workflows │ ├── benchmark.yml │ ├── cuda11.8-whl-release.yml │ ├── daily_ete_test.yml │ ├── daily_ete_test_3090.yml │ ├── docker.yml │ ├── evaluate.yml │ ├── evaluate_remote.yml │ ├── lint.yml │ ├── linux-x64-gpu.yml │ ├── pr_ete_test.yml │ ├── pr_full_test.yml │ ├── pypi.yml │ ├── stable.yml │ ├── stale.yml │ ├── unit-test.yml │ └── windows-x64-gpu.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_ja.md ├── README_zh-CN.md ├── autotest ├── benchmark │ ├── test_apiserver_performance.py │ ├── test_generation_performance.py │ └── test_throughput_performance.py ├── chat_prompt_case.yaml ├── config-3090.yaml ├── config-pr.yaml ├── config.yaml ├── conftest.py ├── interface │ ├── pipeline │ │ ├── test_pipeline_func.py │ │ └── test_pipeline_longtext_func.py │ └── restful │ │ ├── test_restful_chat_func.py │ │ └── test_restful_completions_v1.py ├── prompt_case.yaml ├── pytest.ini ├── template.json ├── toolchain │ └── test_lagent.py ├── tools │ ├── chat │ │ ├── test_command_chat_hf_pytorch.py │ │ └── test_command_chat_hf_turbomind.py │ ├── pipeline │ │ ├── llm_case.py │ │ ├── mllm_case.py │ │ ├── test_pipeline_chat_pytorch_llm.py │ │ ├── test_pipeline_chat_pytorch_mllm.py │ │ ├── test_pipeline_chat_turbomind_llm.py │ │ └── test_pipeline_chat_turbomind_mllm.py │ ├── quantization │ │ ├── test_quantization_awq.py │ │ └── test_quantization_w8a8.py │ └── restful │ │ ├── test_restful_chat_hf_pytorch_llm.py │ │ ├── test_restful_chat_hf_pytorch_mllm.py │ │ ├── test_restful_chat_hf_turbomind_llm.py │ │ └── test_restful_chat_hf_turbomind_mllm.py └── utils │ ├── benchmark_utils.py │ ├── config_utils.py │ ├── get_run_config.py │ ├── mp_log_utils.py │ ├── pipeline_chat.py │ ├── quantization_utils.py │ ├── restful_return_check.py │ ├── rule_condition_assert.py │ ├── run_client_chat.py │ └── run_restful_chat.py ├── benchmark ├── README.md ├── benchmark_decode.py ├── benchmark_serving.py ├── lmdeploy.yml ├── profile_generation.py ├── profile_pipeline_api.py ├── profile_restful_api.py └── profile_throughput.py ├── builder ├── manywheel │ ├── Dockerfile_2014 │ ├── README.md │ ├── build_all_docker.sh │ ├── build_all_wheel.sh │ ├── build_docker.sh │ ├── build_wheel.sh │ ├── entrypoint_build.sh │ └── scripts │ │ ├── install_conda.sh │ │ ├── install_cuda.sh │ │ └── install_openmpi.sh └── windows │ ├── README.md │ ├── generate.ps1 │ └── setup_cuda.ps1 ├── cmake ├── Modules │ ├── FindCUDNN.cmake │ └── FindNCCL.cmake ├── TritonTurboMindBackendConfig.cmake.in └── TurboMindConfig.cmake.in ├── debug.sh ├── docker ├── Dockerfile ├── Dockerfile_Hopper ├── Dockerfile_aarch64_ascend ├── InternVL_Dockerfile └── Qwen2VL_Dockerfile ├── docs ├── en │ ├── .readthedocs.yaml │ ├── Makefile │ ├── _static │ │ ├── css │ │ │ └── readthedocs.css │ │ └── image │ │ │ └── lmdeploy-logo.svg │ ├── advance │ │ ├── chat_template.md │ │ ├── debug_turbomind.md │ │ ├── long_context.md │ │ ├── pytorch_multinodes.md │ │ ├── pytorch_multithread.md │ │ ├── pytorch_new_model.md │ │ ├── 
pytorch_profiling.md │ │ └── structed_output.md │ ├── api │ │ └── pipeline.rst │ ├── benchmark │ │ ├── a100_fp16.md │ │ ├── benchmark.md │ │ └── evaluate_with_opencompass.md │ ├── conf.py │ ├── faq.md │ ├── get_started │ │ ├── ascend │ │ │ └── get_started.md │ │ ├── get_started.md │ │ ├── index.rst │ │ └── installation.md │ ├── index.rst │ ├── inference │ │ ├── load_hf.md │ │ ├── pytorch.md │ │ ├── turbomind.md │ │ └── turbomind_config.md │ ├── llm │ │ ├── api_server.md │ │ ├── api_server_lora.md │ │ ├── api_server_reasoning.md │ │ ├── api_server_tools.md │ │ ├── codellama.md │ │ ├── gradio.md │ │ ├── pipeline.md │ │ └── proxy_server.md │ ├── make.bat │ ├── multi_modal │ │ ├── api_server_vl.md │ │ ├── cogvlm.md │ │ ├── deepseek_vl2.md │ │ ├── gemma3.md │ │ ├── index.rst │ │ ├── internvl.md │ │ ├── llava.md │ │ ├── minicpmv.md │ │ ├── mllama.md │ │ ├── molmo.md │ │ ├── phi3.md │ │ ├── qwen2_5_vl.md │ │ ├── qwen2_vl.md │ │ ├── vl_pipeline.md │ │ └── xcomposer2d5.md │ ├── quantization │ │ ├── kv_quant.md │ │ ├── w4a16.md │ │ └── w8a8.md │ └── supported_models │ │ └── supported_models.md └── zh_cn │ ├── .readthedocs.yaml │ ├── Makefile │ ├── _static │ ├── css │ │ └── readthedocs.css │ └── image │ │ └── lmdeploy-logo.svg │ ├── advance │ ├── chat_template.md │ ├── debug_turbomind.md │ ├── long_context.md │ ├── pytorch_multinodes.md │ ├── pytorch_multithread.md │ ├── pytorch_new_model.md │ ├── pytorch_profiling.md │ └── structed_output.md │ ├── api │ └── pipeline.rst │ ├── benchmark │ ├── benchmark.md │ └── evaluate_with_opencompass.md │ ├── conf.py │ ├── faq.md │ ├── get_started │ ├── ascend │ │ └── get_started.md │ ├── get_started.md │ ├── index.rst │ └── installation.md │ ├── index.rst │ ├── inference │ ├── load_hf.md │ ├── pytorch.md │ ├── turbomind.md │ └── turbomind_config.md │ ├── llm │ ├── api_server.md │ ├── api_server_lora.md │ ├── api_server_reasoning.md │ ├── api_server_tools.md │ ├── codellama.md │ ├── gradio.md │ ├── pipeline.md │ └── proxy_server.md │ ├── make.bat │ ├── multi_modal │ ├── api_server_vl.md │ ├── cogvlm.md │ ├── deepseek_vl2.md │ ├── gemma3.md │ ├── index.rst │ ├── internvl.md │ ├── llava.md │ ├── minicpmv.md │ ├── mllama.md │ ├── molmo.md │ ├── phi3.md │ ├── qwen2_5_vl.md │ ├── qwen2_vl.md │ ├── vl_pipeline.md │ └── xcomposer2d5.md │ ├── quantization │ ├── kv_quant.md │ ├── w4a16.md │ └── w8a8.md │ └── supported_models │ └── supported_models.md ├── generate.sh ├── k8s ├── deployment.yaml └── service.yaml ├── lmdeploy ├── __init__.py ├── __main__.py ├── api.py ├── archs.py ├── cli │ ├── __init__.py │ ├── cli.py │ ├── entrypoint.py │ ├── lite.py │ ├── serve.py │ └── utils.py ├── lite │ ├── __init__.py │ ├── apis │ │ ├── __init__.py │ │ ├── auto_awq.py │ │ ├── calibrate.py │ │ ├── get_small_sharded_hf.py │ │ ├── gptq.py │ │ ├── kv_qparams.py │ │ └── smooth_quant.py │ ├── defaults.py │ ├── modeling │ │ ├── __init__.py │ │ ├── internlm2_gptq.py │ │ └── internlm3_gptq.py │ ├── quantization │ │ ├── __init__.py │ │ ├── activation │ │ │ ├── __init__.py │ │ │ └── observer.py │ │ ├── awq.py │ │ ├── calibration.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ └── linear.py │ │ └── weight │ │ │ ├── __init__.py │ │ │ └── quantizer.py │ └── utils │ │ ├── __init__.py │ │ ├── batch_split.py │ │ ├── cal_qparams.py │ │ ├── calib_dataloader.py │ │ ├── collect.py │ │ ├── global_avail.py │ │ ├── load.py │ │ └── memory_efficient.py ├── logger.py ├── messages.py ├── model.py ├── profiler.py ├── pytorch │ ├── __init__.py │ ├── accel.py │ ├── adapter │ │ ├── __init__.py │ │ └── adapter.py │ ├── 
backends │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── apply_rotary_emb.py │ │ ├── attention.py │ │ ├── awq_modules.py │ │ ├── base.py │ │ ├── blockedf8_modules.py │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── apply_rotary_emb.py │ │ │ ├── attention.py │ │ │ ├── awq_modules.py │ │ │ ├── blockedf8_modules.py │ │ │ ├── flash_attention.py │ │ │ ├── graph_runner.py │ │ │ ├── lora.py │ │ │ ├── moe.py │ │ │ ├── multinomial_sampling.py │ │ │ ├── norm.py │ │ │ ├── op_backend.py │ │ │ ├── qmodules.py │ │ │ ├── token_dispatcher.py │ │ │ └── warmup_manager.py │ │ ├── default │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── apply_rotary_emb.py │ │ │ ├── awq_modules.py │ │ │ ├── linear.py │ │ │ ├── moe.py │ │ │ ├── multinomial_sampling.py │ │ │ ├── norm.py │ │ │ ├── op_backend.py │ │ │ ├── rotary_embedding.py │ │ │ └── token_dispatcher.py │ │ ├── dlinfer │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── apply_rotary_emb.py │ │ │ ├── ascend │ │ │ │ ├── __init__.py │ │ │ │ ├── graph_runner.py │ │ │ │ └── op_backend.py │ │ │ ├── attention.py │ │ │ ├── awq_modules.py │ │ │ ├── camb │ │ │ │ ├── __init__.py │ │ │ │ └── op_backend.py │ │ │ ├── flash_attention.py │ │ │ ├── linear.py │ │ │ ├── maca │ │ │ │ ├── __init__.py │ │ │ │ └── op_backend.py │ │ │ ├── moe.py │ │ │ ├── norm.py │ │ │ ├── op_backend.py │ │ │ ├── qmodules.py │ │ │ └── rotary_embedding.py │ │ ├── flash_attention.py │ │ ├── graph_runner.py │ │ ├── linear.py │ │ ├── lora.py │ │ ├── moe.py │ │ ├── multinomial_sampling.py │ │ ├── norm.py │ │ ├── qmodules.py │ │ ├── rotary_embedding.py │ │ ├── selector.py │ │ └── token_dispatcher.py │ ├── block.py │ ├── chat.py │ ├── check_env │ │ ├── __init__.py │ │ ├── adapter.py │ │ ├── base.py │ │ ├── deeplink.py │ │ ├── dist.py │ │ ├── model.py │ │ ├── torch.py │ │ ├── transformers.py │ │ ├── triton.py │ │ └── triton_custom_add.py │ ├── config.py │ ├── configurations │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── chatglm.py │ │ ├── cogvlm.py │ │ ├── deepseek_v2.py │ │ ├── deepseek_vl2.py │ │ ├── default.py │ │ ├── gemma.py │ │ ├── internvl.py │ │ ├── llama4.py │ │ ├── llava_hf.py │ │ ├── minicpm3.py │ │ ├── mllama.py │ │ ├── qwen.py │ │ └── utils.py │ ├── devices │ │ ├── __init__.py │ │ └── device_manager.py │ ├── disagg │ │ ├── README.md │ │ ├── __init__.py │ │ ├── backend │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ ├── base.py │ │ │ ├── dlslime.py │ │ │ ├── infinistore.py │ │ │ └── mooncake.py │ │ ├── config.py │ │ ├── conn.py │ │ ├── messages.py │ │ └── request.py │ ├── distributed.py │ ├── engine │ │ ├── __init__.py │ │ ├── cache_engine.py │ │ ├── engine.py │ │ ├── engine_checker.py │ │ ├── engine_instance.py │ │ ├── executor │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── base_worker.py │ │ │ ├── dist_utils.py │ │ │ ├── mp_executor.py │ │ │ ├── ray_executor.py │ │ │ └── uni_executor.py │ │ ├── guided_process.py │ │ ├── input_process.py │ │ ├── logits_process.py │ │ ├── model_agent.py │ │ └── request.py │ ├── envs.py │ ├── kernels │ │ ├── __init__.py │ │ ├── alibi_pagedattention.py │ │ ├── apply_rotary_pos_emb.py │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── alibi_pagedattention.py │ │ │ ├── apply_rotary_pos_emb.py │ │ │ ├── awq_kernels.py │ │ │ ├── blocked_fp8_fused_moe.py │ │ │ ├── blocked_gemm_fp8.py │ │ │ ├── ep_moe.py │ │ │ ├── fill_kv_cache.py │ │ │ ├── flash_mla.py │ │ │ ├── flashattention.py │ │ │ ├── flatten_kv_cache.py │ │ │ ├── fused_lora.py │ │ │ ├── fused_moe.py │ │ │ ├── fused_rotary_emb.py │ │ │ ├── multinomial_sampling.py 
│ │ │ ├── pagedattention.py │ │ │ ├── rms_norm.py │ │ │ ├── triton_utils.py │ │ │ ├── utils.py │ │ │ ├── w8a8_fused_moe.py │ │ │ └── w8a8_triton_kernels.py │ │ ├── default │ │ │ ├── __init__.py │ │ │ ├── multinomial_sampling.py │ │ │ └── w8a8_kernels.py │ │ ├── dispatcher.py │ │ ├── dlinfer │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── apply_rotary_pos_emb.py │ │ │ ├── awq_kernels.py │ │ │ ├── fill_kv_cache.py │ │ │ ├── flash_attention.py │ │ │ ├── fused_moe.py │ │ │ ├── fused_rotary_emb.py │ │ │ ├── linear.py │ │ │ ├── moe_gating_topk_softmax.py │ │ │ ├── pagedattention.py │ │ │ ├── rms_norm.py │ │ │ └── w8a8_kernels.py │ │ ├── fill_kv_cache.py │ │ ├── flash_mla.py │ │ ├── fused_moe.py │ │ ├── fused_rotary_emb.py │ │ ├── moe_gating_topk_softmax.py │ │ ├── multinomial_sampling.py │ │ ├── pagedattention.py │ │ ├── rms_norm.py │ │ └── w8a8_triton_kernels.py │ ├── messages.py │ ├── model_inputs.py │ ├── models │ │ ├── __init__.py │ │ ├── baichuan.py │ │ ├── chatglm2.py │ │ ├── cogvlm.py │ │ ├── deepseek.py │ │ ├── deepseek_v2.py │ │ ├── deepseek_vl2.py │ │ ├── gemma.py │ │ ├── gemma3_vl.py │ │ ├── internlm.py │ │ ├── internlm2.py │ │ ├── internlm2_reward.py │ │ ├── internlm2_ve.py │ │ ├── internlm3.py │ │ ├── internvl.py │ │ ├── internvl_patch.py │ │ ├── llama.py │ │ ├── llama4.py │ │ ├── llava.py │ │ ├── minicpm3.py │ │ ├── minicpmv26.py │ │ ├── mistral.py │ │ ├── mixtral.py │ │ ├── mllama.py │ │ ├── module_map.py │ │ ├── patch.py │ │ ├── phi3.py │ │ ├── phi3_moe.py │ │ ├── phi3_v.py │ │ ├── q_modules.py │ │ ├── qwen.py │ │ ├── qwen2.py │ │ ├── qwen2_5_vl.py │ │ ├── qwen2_moe.py │ │ ├── qwen2_reward.py │ │ ├── qwen2_vl.py │ │ ├── qwen3.py │ │ ├── qwen3_moe.py │ │ ├── siglip.py │ │ ├── starcoder2.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── cudagraph.py │ │ │ ├── micro_batch.py │ │ │ ├── model.py │ │ │ └── multimodal.py │ ├── multimodal │ │ ├── __init__.py │ │ ├── data_type.py │ │ └── image_type.py │ ├── nn │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── attention.py │ │ ├── linear.py │ │ ├── moe.py │ │ ├── multinomial_sampling.py │ │ ├── norm.py │ │ ├── rotary_embedding.py │ │ └── utils.py │ ├── paging │ │ ├── __init__.py │ │ ├── block_manager │ │ │ ├── __init__.py │ │ │ ├── base_block_manager.py │ │ │ ├── default_block_manager.py │ │ │ └── window_block_manager.py │ │ ├── block_trie.py │ │ ├── eviction_helper │ │ │ ├── __init__.py │ │ │ ├── base_eviction_helper.py │ │ │ └── recompute_eviction_helper.py │ │ └── scheduler.py │ ├── supported_models.py │ ├── tools │ │ ├── __init__.py │ │ ├── layout_convert.py │ │ ├── make_inputs.py │ │ └── utils.py │ ├── utils.py │ └── weight_loader │ │ ├── __init__.py │ │ └── model_weight_loader.py ├── serve │ ├── __init__.py │ ├── async_engine.py │ ├── gradio │ │ ├── __init__.py │ │ ├── api_server_backend.py │ │ ├── app.py │ │ ├── constants.py │ │ ├── turbomind_coupled.py │ │ └── vl.py │ ├── openai │ │ ├── __init__.py │ │ ├── api_client.py │ │ ├── api_server.py │ │ ├── launch_server.py │ │ ├── protocol.py │ │ ├── reasoning_parser │ │ │ ├── __init__.py │ │ │ ├── deepseek_r1_reasoning_parser.py │ │ │ ├── qwen_qwq_reasoning_parser.py │ │ │ └── reasoning_parser.py │ │ └── tool_parser │ │ │ ├── __init__.py │ │ │ ├── internlm2_parser.py │ │ │ ├── llama3_parser.py │ │ │ ├── qwen2d5_parser.py │ │ │ ├── tool_parser.py │ │ │ └── utils.py │ ├── proxy │ │ ├── __init__.py │ │ ├── constants.py │ │ └── proxy.py │ ├── turbomind │ │ ├── __init__.py │ │ └── triton_python_backend │ │ │ ├── README.md │ │ │ ├── client.py │ │ │ ├── config.pbtxt │ │ │ └── model.py │ 
├── utils.py │ └── vl_async_engine.py ├── tokenizer.py ├── turbomind │ ├── __init__.py │ ├── chat.py │ ├── deploy │ │ ├── __init__.py │ │ ├── config.py │ │ ├── converter.py │ │ ├── loader.py │ │ ├── module.py │ │ ├── parameter.py │ │ ├── policy.py │ │ ├── source_model │ │ │ ├── __init__.py │ │ │ ├── baichuan.py │ │ │ ├── base.py │ │ │ ├── deepseek2.py │ │ │ ├── deepseek_vl.py │ │ │ ├── glm4.py │ │ │ ├── internlm2.py │ │ │ ├── internvl.py │ │ │ ├── llama.py │ │ │ ├── llava.py │ │ │ ├── minicpmv.py │ │ │ ├── mixtral.py │ │ │ ├── molmo.py │ │ │ ├── qwen.py │ │ │ └── xcomposer2.py │ │ └── target_model │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── fp.py │ ├── generate_gemm_config.py │ ├── supported_models.py │ ├── turbomind.py │ └── utils.py ├── utils.py ├── version.py └── vl │ ├── __init__.py │ ├── constants.py │ ├── engine.py │ ├── model │ ├── __init__.py │ ├── base.py │ ├── builder.py │ ├── cogvlm.py │ ├── deepseek.py │ ├── deepseek_vl2.py │ ├── gemma3_vl.py │ ├── glm_4v.py │ ├── internvl.py │ ├── internvl_llava.py │ ├── llama4.py │ ├── llava.py │ ├── llava_hf.py │ ├── llava_next.py │ ├── minicpmv.py │ ├── mllama.py │ ├── molmo.py │ ├── phi3_vision.py │ ├── qwen.py │ ├── qwen2.py │ ├── utils.py │ ├── xcomposer2.py │ └── yi.py │ ├── tools │ ├── __init__.py │ └── merge_xcomposer2d5_task.py │ └── utils.py ├── requirements ├── build.txt ├── docs.txt ├── lite.txt ├── readthedocs.txt ├── runtime_ascend.txt ├── runtime_camb.txt ├── runtime_cuda.txt ├── runtime_maca.txt ├── serve.txt └── test.txt ├── requirements_ascend.txt ├── requirements_camb.txt ├── requirements_cuda.txt ├── requirements_maca.txt ├── resources └── batch_memory.png ├── setup.py ├── src ├── CMakeLists.txt └── turbomind │ ├── CMakeLists.txt │ ├── comm │ ├── CMakeLists.txt │ ├── barrier.h │ ├── cuda_ipc │ │ ├── CMakeLists.txt │ │ ├── allgather.cu │ │ ├── allreduce.cu │ │ ├── bootstrap.h │ │ ├── cuda_ipc_comm.cu │ │ ├── cuda_ipc_comm.h │ │ ├── device_semaphore.h │ │ ├── fused_allreduce.cu │ │ ├── fused_allreduce_ex.cu │ │ ├── group_sum.h │ │ └── mscclpp.h │ ├── device_comm.cc │ ├── device_comm.h │ ├── host_comm.cc │ ├── host_comm.h │ ├── nccl │ │ ├── CMakeLists.txt │ │ └── nccl.cu │ ├── test_comm.cu │ └── thread_comm.cc │ ├── core │ ├── CMakeLists.txt │ ├── allocator.cc │ ├── allocator.h │ ├── buffer.cc │ ├── buffer.h │ ├── check.cc │ ├── check.h │ ├── common.h │ ├── context.cc │ ├── context.h │ ├── core.h │ ├── cuda_data_type.h │ ├── data_type.h │ ├── layout.cc │ ├── layout.h │ ├── module.cc │ ├── module.h │ ├── stream.cc │ ├── stream.h │ ├── tensor.cc │ ├── tensor.cu │ ├── tensor.h │ └── test_core.cc │ ├── engine │ ├── CMakeLists.txt │ ├── gateway.cc │ ├── gateway.h │ ├── model_request.cc │ ├── model_request.h │ ├── request.h │ ├── request_queue.cc │ ├── request_queue.h │ └── signal_buffer.h │ ├── kernels │ ├── CMakeLists.txt │ ├── activation_kernels.cu │ ├── activation_kernels.h │ ├── attention │ │ ├── CMakeLists.txt │ │ ├── arch.h │ │ ├── attention.cu │ │ ├── attention.h │ │ ├── attention_config.h │ │ ├── attention_params.h │ │ ├── attention_template.h │ │ ├── attention_universal.h │ │ ├── block.h │ │ ├── block_iterator.h │ │ ├── codegen │ │ │ ├── attention_sm70_128_f16.cu │ │ │ ├── attention_sm70_64_f16.cu │ │ │ ├── attention_sm75_128_f16.cu │ │ │ ├── attention_sm75_64_f16.cu │ │ │ ├── attention_sm80_128_bf16.cu │ │ │ ├── attention_sm80_128_f16.cu │ │ │ ├── attention_sm80_192.cu │ │ │ ├── attention_sm80_64_bf16.cu │ │ │ ├── attention_sm80_64_f16.cu │ │ │ ├── decoding_sm70_128_f16_f16.cu │ │ │ ├── decoding_sm70_128_f16_u4.cu 
│ │ │ ├── decoding_sm70_128_f16_u8.cu │ │ │ ├── decoding_sm70_64_f16_f16.cu │ │ │ ├── decoding_sm70_64_f16_u4.cu │ │ │ ├── decoding_sm70_64_f16_u8.cu │ │ │ ├── decoding_sm75_128_f16_f16.cu │ │ │ ├── decoding_sm75_128_f16_u4.cu │ │ │ ├── decoding_sm75_128_f16_u8.cu │ │ │ ├── decoding_sm75_64_f16_f16.cu │ │ │ ├── decoding_sm75_64_f16_u4.cu │ │ │ ├── decoding_sm75_64_f16_u8.cu │ │ │ ├── decoding_sm80_128_bf16_bf16.cu │ │ │ ├── decoding_sm80_128_bf16_u4.cu │ │ │ ├── decoding_sm80_128_bf16_u8.cu │ │ │ ├── decoding_sm80_128_f16_f16.cu │ │ │ ├── decoding_sm80_128_f16_u4.cu │ │ │ ├── decoding_sm80_128_f16_u8.cu │ │ │ ├── decoding_sm80_192.cu │ │ │ ├── decoding_sm80_64_bf16_bf16.cu │ │ │ ├── decoding_sm80_64_bf16_u4.cu │ │ │ ├── decoding_sm80_64_bf16_u8.cu │ │ │ ├── decoding_sm80_64_f16_f16.cu │ │ │ ├── decoding_sm80_64_f16_u4.cu │ │ │ └── decoding_sm80_64_f16_u8.cu │ │ ├── cta_map.h │ │ ├── decoding.cu │ │ ├── decoding.h │ │ ├── decoding_config.h │ │ ├── decoding_template.h │ │ ├── impl.h │ │ ├── impl_16816.h │ │ ├── impl_1688.h │ │ ├── impl_81616.h │ │ ├── impl_884.h │ │ ├── impl_m16n8.h │ │ ├── impl_simt.h │ │ ├── iterator.h │ │ ├── iterator_sm70.h │ │ ├── iterator_sm80.h │ │ ├── kv_cache_utils_v2.cu │ │ ├── kv_cache_utils_v2.h │ │ ├── linear_iterator.h │ │ ├── mainloop.h │ │ ├── mainloop_sm70.h │ │ ├── mainloop_sm80.h │ │ ├── quantization.h │ │ ├── reduce.cu │ │ ├── reduce.h │ │ ├── reduce_kernel.h │ │ ├── reference.cu │ │ ├── reference.h │ │ ├── rotary_embedding.h │ │ ├── test_attention.cu │ │ ├── test_quant.cu │ │ ├── test_utils.cu │ │ ├── test_utils.h │ │ ├── utils.cc │ │ └── utils.h │ ├── ban_bad_words.cu │ ├── ban_bad_words.h │ ├── core │ │ ├── array.h │ │ ├── array_ops.h │ │ ├── common.h │ │ ├── data_type.h │ │ ├── layout.h │ │ ├── math.h │ │ ├── meta.h │ │ ├── mma.h │ │ ├── pipe_iter.h │ │ ├── smem.h │ │ ├── sub_byte_ptr.h │ │ ├── sync.h │ │ └── thread_map.h │ ├── decoding_kernels.cu │ ├── decoding_kernels.h │ ├── flash_attention │ │ ├── CMakeLists.txt │ │ ├── flash_attention.cu │ │ ├── flash_attention.h │ │ ├── flash_attention2 │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── block_info.h │ │ │ ├── flash.h │ │ │ ├── flash_api.cpp │ │ │ ├── flash_fwd_hdim128_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim128_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim256_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim256_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim32_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim32_fp16_sm80.cu │ │ │ ├── flash_fwd_hdim64_bf16_sm80.cu │ │ │ ├── flash_fwd_hdim64_fp16_sm80.cu │ │ │ ├── flash_fwd_kernel.h │ │ │ ├── flash_fwd_launch_template.h │ │ │ ├── kernel_traits.h │ │ │ ├── softmax.h │ │ │ ├── static_switch.h │ │ │ └── utils.h │ │ └── fused_multi_head_attention │ │ │ ├── CMakeLists.txt │ │ │ ├── llama_flash_attention_kernel.cu │ │ │ ├── mma_accum_lambda_iterator.h │ │ │ └── tile_smem_loader.h │ ├── gemm │ │ ├── CMakeLists.txt │ │ ├── arch.h │ │ ├── arch │ │ │ ├── config_simt.h │ │ │ ├── config_sm70_s884.h │ │ │ ├── config_sm75_s16816.h │ │ │ ├── config_sm80_s16816.h │ │ │ ├── mma_simt.h │ │ │ ├── mma_sm70.h │ │ │ ├── mma_sm80.h │ │ │ ├── operand_simt.h │ │ │ ├── operand_sm70_s884.h │ │ │ ├── operand_sm80_s16816.h │ │ │ ├── smem_copy_simt.h │ │ │ ├── smem_copy_sm70.h │ │ │ └── smem_copy_sm80.h │ │ ├── cast.cu │ │ ├── cast.h │ │ ├── context.cu │ │ ├── context.h │ │ ├── convert_v2.cu │ │ ├── convert_v2.h │ │ ├── cp_async.h │ │ ├── cta_map.h │ │ ├── desc.h │ │ ├── dispatch_cache.cu │ │ ├── dispatch_cache.h │ │ ├── epilogue.h │ │ ├── format.h │ │ ├── gemm.cu │ │ ├── gemm.h │ │ ├── gemm_universal.h │ │ ├── 
gpu_metric.cu │ │ ├── gpu_metric.h │ │ ├── iterator.h │ │ ├── iterator_sm70.h │ │ ├── iterator_sm80.h │ │ ├── kernel.cu │ │ ├── kernel.h │ │ ├── kernel │ │ │ ├── f16_u4g128_f16_tnt_sm70_s884.cu │ │ │ ├── f16_u4g128_f16_tnt_sm75_s16816.cu │ │ │ ├── f16_u4g128_f16_tnt_sm75_simt.cu │ │ │ ├── f16_u4g128_f16_tnt_sm80_s16816.cu │ │ │ ├── f16_u4g128_f16_tnt_sm90_s16816.cu │ │ │ ├── sm70_s884_dynamic.cu │ │ │ ├── sm75_s16816_dynamic.cu │ │ │ ├── sm80_s16816_dynamic.cu │ │ │ ├── sm90_s16816_dynamic.cu │ │ │ └── u4g128_f16_f16_nnn_sm80_s16816.cu │ │ ├── kernel_impl.h │ │ ├── mainloop_sm70.h │ │ ├── mainloop_sm80_v2.h │ │ ├── matrix_ptr.h │ │ ├── moe_utils_v2.cu │ │ ├── moe_utils_v2.h │ │ ├── operand.h │ │ ├── predicate.h │ │ ├── registry.cu │ │ ├── registry.h │ │ ├── simt.h │ │ ├── smem_copy.h │ │ ├── test │ │ │ ├── gemm_bench.cu │ │ │ ├── gemm_test.cu │ │ │ ├── models.h │ │ │ ├── quantization.cu │ │ │ ├── quantization.h │ │ │ ├── quantization_impl.h │ │ │ ├── reference.cu │ │ │ ├── reference.h │ │ │ ├── test_moe_utils.cu │ │ │ ├── test_utils.cu │ │ │ ├── test_utils.h │ │ │ └── testbed.h │ │ ├── thread_group_map.h │ │ ├── thread_map.h │ │ ├── tiled_mma.h │ │ ├── transform.h │ │ ├── tuner │ │ │ ├── cache_utils.cu │ │ │ ├── cache_utils.h │ │ │ ├── measurer.cu │ │ │ ├── measurer.h │ │ │ ├── params.cc │ │ │ ├── params.h │ │ │ ├── sampler.cu │ │ │ ├── sampler.h │ │ │ ├── stats.h │ │ │ ├── stopping_criterion.cc │ │ │ └── stopping_criterion.h │ │ ├── types.h │ │ ├── unpack.cu │ │ └── utils.h │ ├── gpt_kernels.cu │ ├── gpt_kernels.h │ ├── logprob_kernels.cu │ ├── logprob_kernels.h │ ├── norm │ │ ├── CMakeLists.txt │ │ ├── rms_norm.cu │ │ └── rms_norm.h │ ├── penalty_types.h │ ├── reduce_kernel_utils.cuh │ ├── sampling_kernels.cu │ ├── sampling_kernels.h │ ├── sampling_penalty_kernels.cu │ ├── sampling_penalty_kernels.h │ ├── sampling_topk_kernels.cu │ ├── sampling_topk_kernels.h │ ├── sampling_topp_kernels.cu │ ├── sampling_topp_kernels.h │ ├── stop_criteria_kernels.cu │ ├── stop_criteria_kernels.h │ ├── unfused_attention_kernels.cu │ └── unfused_attention_kernels.h │ ├── layers │ ├── BaseDynamicDecodeLayer.h │ ├── CMakeLists.txt │ ├── DynamicDecodeLayer.cc │ ├── DynamicDecodeLayer.h │ └── sampling_layers │ │ ├── CMakeLists.txt │ │ ├── LogitsProcessorLayer.cc │ │ ├── LogitsProcessorLayer.h │ │ ├── SamplingLayer.cc │ │ ├── SamplingLayer.h │ │ ├── StopCriteriaLayer.cc │ │ ├── StopCriteriaLayer.h │ │ └── utils.h │ ├── macro.h │ ├── models │ ├── CMakeLists.txt │ └── llama │ │ ├── Barrier.h │ │ ├── BlockManager.cc │ │ ├── BlockManager.h │ │ ├── BlockTrie.cc │ │ ├── BlockTrie.h │ │ ├── CMakeLists.txt │ │ ├── LlamaBatch.cc │ │ ├── LlamaBatch.h │ │ ├── LlamaDecoderLayerWeight.cc │ │ ├── LlamaDecoderLayerWeight.h │ │ ├── LlamaDenseWeight.cc │ │ ├── LlamaDenseWeight.h │ │ ├── LlamaFfnLayer.cc │ │ ├── LlamaFfnLayer.h │ │ ├── LlamaLinear.cu │ │ ├── LlamaLinear.h │ │ ├── LlamaV2.cc │ │ ├── LlamaV2.h │ │ ├── LlamaWeight.cc │ │ ├── LlamaWeight.h │ │ ├── SequenceManager.cc │ │ ├── SequenceManager.h │ │ ├── context.h │ │ ├── copy.h │ │ ├── llama_kernels.cu │ │ ├── llama_kernels.h │ │ ├── llama_params.h │ │ ├── llama_rope.h │ │ ├── llama_utils.cu │ │ ├── llama_utils.h │ │ ├── mla_utils.cu │ │ ├── mla_utils.h │ │ ├── moe_ffn_layer.cc │ │ ├── moe_ffn_layer.h │ │ ├── test_cache_manager.cc │ │ ├── unified_attention_layer.cc │ │ ├── unified_attention_layer.h │ │ ├── unified_decoder.cc │ │ └── unified_decoder.h │ ├── python │ ├── CMakeLists.txt │ ├── bind.cpp │ └── dlpack.h │ ├── triton_backend │ ├── CMakeLists.txt │ └── llama │ │ 
├── CMakeLists.txt │ │ ├── LlamaTritonModel.cc │ │ └── LlamaTritonModel.h │ └── utils │ ├── CMakeLists.txt │ ├── anomaly_handler.cu │ ├── anomaly_handler.h │ ├── constant.h │ ├── cuda_bf16_fallbacks.cuh │ ├── cuda_bf16_wrapper.h │ ├── cuda_type_utils.cuh │ ├── cuda_utils.cc │ ├── cuda_utils.h │ ├── debug_utils.h │ ├── dispatch.h │ ├── logger.cc │ ├── logger.h │ ├── memory_utils.cu │ ├── memory_utils.h │ ├── monotonic.h │ ├── nvtx_utils.cc │ ├── nvtx_utils.h │ ├── parser.cc │ ├── parser.h │ ├── string_utils.h │ └── test_utils.h └── tests ├── csrc ├── CMakeLists.txt └── unittests │ ├── CMakeLists.txt │ ├── gtest_utils.h │ ├── test_logprob_kernels.cu │ ├── test_penalty_kernels.cu │ ├── test_sampling_kernels.cu │ ├── test_sampling_layer.cu │ └── unittest_utils.h ├── pytorch ├── engine │ ├── test_logits_process.py │ └── test_request.py ├── kernel │ ├── test_activation.py │ ├── test_apply_rotary.py │ ├── test_fill_kv_cache.py │ ├── test_flash_attention.py │ ├── test_flatten_kv_cache.py │ ├── test_fuse_moe_blocked_fp8.py │ ├── test_fused_lora.py │ ├── test_fused_moe.py │ ├── test_fused_rotary_emb.py │ ├── test_gemm_fp8.py │ ├── test_multinomial_sampling.py │ ├── test_paged_attention.py │ └── test_rms_norm.py ├── paging │ ├── test_block_manager.py │ ├── test_block_trie.py │ └── test_scheduler.py └── tools │ ├── test_layout_convert.py │ └── test_make_inputs.py └── test_lmdeploy ├── test_async_engine.py ├── test_auto_backend.py ├── test_lite └── test_quantization │ └── test_utils │ └── test_cal_qparams.py ├── test_messages.py ├── test_model.py ├── test_tokenizer.py ├── test_turbomind └── test_converter.py ├── test_utils.py └── test_vl └── test_vl_encode.py /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Suggest an idea for this project 3 | title: "[Feature] " 4 | 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | We strongly appreciate you creating a PR to implement this feature [here](https://github.com/InternLM/lmdeploy/pulls)! 10 | If you need our help, please fill in as much of the following form as you're able to. 11 | 12 | **The less clear the description, the longer it will take to solve it.** 13 | - type: textarea 14 | attributes: 15 | label: Motivation 16 | description: | 17 | A clear and concise description of the motivation of the feature. 18 | Ex1. It is inconvenient when \[....\]. 19 | validations: 20 | required: true 21 | - type: textarea 22 | attributes: 23 | label: Related resources 24 | description: | 25 | If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful. 26 | - type: textarea 27 | attributes: 28 | label: Additional context 29 | description: | 30 | Add any other context or screenshots about the feature request here. 31 | If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to the documentation. 3 | labels: "kind/doc,status/unconfirmed" 4 | title: "[Docs] " 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: 📚 The doc issue 10 | description: > 11 | A clear and concise description of the issue.
12 | validations: 13 | required: true 14 | 15 | - type: textarea 16 | attributes: 17 | label: Suggest a potential alternative/fix 18 | description: > 19 | Tell us how we could improve the documentation in this regard. 20 | - type: markdown 21 | attributes: 22 | value: > 23 | Thanks for contributing 🎉! 24 | -------------------------------------------------------------------------------- /.github/md-link-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignorePatterns": [ 3 | { 4 | "pattern": "^https://www.reddit.com/" 5 | }, 6 | { 7 | "pattern": "^https://developer.nvidia.com/" 8 | }, 9 | { 10 | "pattern": "^https://docs.openvino.ai/" 11 | }, 12 | { 13 | "pattern": "^https://developer.android.com/" 14 | }, 15 | { 16 | "pattern": "^https://developer.qualcomm.com/" 17 | }, 18 | { 19 | "pattern": "^http://localhost" 20 | }, 21 | { 22 | "pattern": "^https://twitter.com" 23 | }, 24 | { 25 | "pattern": "^https://platform.openai.com" 26 | }, 27 | { 28 | "pattern": "^http://0.0.0.0" 29 | } 30 | ], 31 | "httpHeaders": [ 32 | { 33 | "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"], 34 | "headers": { 35 | "Accept-Encoding": "zstd, br, gzip, deflate" 36 | } 37 | } 38 | ], 39 | "timeout": "20s", 40 | "retryOn429": true, 41 | "retryCount": 5, 42 | "fallbackRetryDelay": "30s", 43 | "aliveStatusCodes": [200, 206, 429] 44 | } 45 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | categories: 3 | - title: 🚀 Features 4 | labels: 5 | - feature 6 | - enhancement 7 | - title: 💥 Improvements 8 | labels: 9 | - improvement 10 | - title: 🐞 Bug fixes 11 | labels: 12 | - bug 13 | - Bug:P0 14 | - Bug:P1 15 | - Bug:P2 16 | - Bug:P3 17 | - title: 📚 Documentations 18 | labels: 19 | - documentation 20 | - title: 🌐 Other 21 | labels: 22 | - '*' 23 | exclude: 24 | labels: 25 | - feature 26 | - enhancement 27 | - improvement 28 | - bug 29 | - documentation 30 | - Bug:P0 31 | - Bug:P1 32 | - Bug:P2 33 | - Bug:P3 34 | -------------------------------------------------------------------------------- /.github/scripts/check_lmdeploy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) MegFlow. All rights reserved. 
2 | import glob 3 | import os 4 | 5 | import fire 6 | 7 | 8 | def check_module_init(root: str): 9 | """Check if a module has __init__.py file.""" 10 | all_files = glob.glob(os.path.join(root, '**/*'), recursive=True) 11 | not_exist = [] 12 | for d in all_files: 13 | if not os.path.isdir(d): 14 | continue 15 | if '__pycache__' in d: 16 | continue 17 | elif d.startswith('lmdeploy/bin'): 18 | continue 19 | elif d.startswith('lmdeploy/lib'): 20 | continue 21 | elif d.startswith('lmdeploy/serve/turbomind/triton_models'): 22 | continue 23 | elif d.startswith('lmdeploy/serve/turbomind/triton_python_backend'): 24 | continue 25 | init_file = os.path.join(d, '__init__.py') 26 | if not os.path.exists(init_file): 27 | not_exist.append(init_file) 28 | 29 | assert len(not_exist) == 0, f'Missing files: {not_exist}' 30 | 31 | 32 | if __name__ == '__main__': 33 | fire.Fire() 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .vscode/ 6 | .idea/ 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | triton-rerope/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | *build*/ 46 | !builder/ 47 | lmdeploy/lib/ 48 | lmdeploy/bin/ 49 | dist/ 50 | examples/cpp/llama/*.csv 51 | *.npy 52 | *.weight 53 | install/ 54 | 55 | # LMDeploy 56 | workspace/ 57 | work_dir*/ 58 | 59 | # Huggingface 60 | *.bin 61 | *config.json 62 | *generate_config.json 63 | !lmdeploy/turbomind/hf_repo/config.json 64 | 65 | # Pytorch 66 | *.pt 67 | *.pth 68 | *.py~ 69 | *.sh~ 70 | *.pyc 71 | **/src/pytorch-sphinx-theme/ 72 | 73 | # Outputs and logs 74 | *.txt 75 | *.log 76 | *.out 77 | *.csv 78 | !start_ids.csv 79 | *.pkl 80 | 81 | !CMakeLists.txt 82 | proxy_config.yml 83 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include lmdeploy/lib/*.so 3 | include lmdeploy/lib/*.so* 4 | include lmdeploy/lib/*.dll 5 | include lmdeploy/lib/*.pyd 6 | include lmdeploy/bin/* 7 | -------------------------------------------------------------------------------- /autotest/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import yaml 5 | 6 | cli_prompt_case_file = 'autotest/chat_prompt_case.yaml' 7 | common_prompt_case_file = 'autotest/prompt_case.yaml' 8 | config_file = 'autotest/config.yaml' 9 | 10 | 11 | @pytest.fixture(scope='session') 12 | def config(): 13 | config_path = os.path.join(config_file) 14 | with open(config_path) as f: 15 | env_config = yaml.load(f.read(), Loader=yaml.SafeLoader) 16 | return env_config 17 | 18 | 19 | @pytest.fixture(scope='session') 20 | def cli_case_config(): 21 | case_path = os.path.join(cli_prompt_case_file) 22 | with 
open(case_path) as f: 23 | case_config = yaml.load(f.read(), Loader=yaml.SafeLoader) 24 | return case_config 25 | 26 | 27 | @pytest.fixture(scope='class', autouse=True) 28 | def common_case_config(): 29 | case_path = os.path.join(common_prompt_case_file) 30 | with open(case_path) as f: 31 | case_config = yaml.load(f.read(), Loader=yaml.SafeLoader) 32 | return case_config 33 | 34 | 35 | def pytest_addoption(parser): 36 | parser.addoption('--run_id', action='store', default='', help='github run_id') 37 | 38 | 39 | @pytest.fixture(scope='session') 40 | def run_id(request): 41 | return request.config.getoption('--run_id') 42 | -------------------------------------------------------------------------------- /autotest/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test*_*.py # test file 3 | python_classes = Test* # test class 4 | python_functions = test_* # test function 5 | pytest_runtest_call.tryfirst = True 6 | filterwarnings = ignore::UserWarning 7 | reruns = 2 8 | reruns_delay = 1 9 | -------------------------------------------------------------------------------- /autotest/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "base", 3 | "capability": "completion" 4 | } 5 | -------------------------------------------------------------------------------- /autotest/toolchain/test_lagent.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.mark.order(10) 5 | @pytest.mark.lagent 6 | @pytest.mark.flaky(reruns=2) 7 | @pytest.mark.parametrize('model', ['internlm/internlm2_5-7b-chat']) 8 | def test_repeat(config, model): 9 | from lagent.llms import INTERNLM2_META, LMDeployPipeline 10 | 11 | model = LMDeployPipeline( 12 | path='/'.join([config.get('model_path'), model]), 13 | meta_template=INTERNLM2_META, 14 | tp=1, 15 | top_k=40, 16 | top_p=0.8, 17 | temperature=1.2, 18 | stop_words=['<|im_end|>'], 19 | max_new_tokens=4096, 20 | ) 21 | response_list = [] 22 | for i in range(3): 23 | print(f'run_{i}:') 24 | response = model.chat([{ 25 | 'role': 26 | 'user', 27 | 'content': 28 | '已知$$z_{1}=1$$,$$z_{2}=\\text{i}$$,$$z_{3}=-1$$,$$z_{4}=-\\text{i}$$,顺次连结它们所表示的点,则所得图形围成的面积为( )\nA. $$\\dfrac{1}{4}$$\n B. $$\\dfrac{1}{2}$$\n C. $$1$$\n D. 
$$2$$\n\n' # noqa: F401, E501 29 | }]) 30 | print(response) 31 | response_list.append(response) 32 | assert len(response) > 10 33 | assert response_list[0] != response_list[1] and response_list[1] != response_list[2] 34 | -------------------------------------------------------------------------------- /autotest/tools/quantization/test_quantization_w8a8.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import allure 4 | import pytest 5 | from utils.config_utils import get_cuda_prefix_by_workerid, get_quantization_model_list 6 | from utils.quantization_utils import quantization 7 | 8 | 9 | @pytest.mark.order(2) 10 | @pytest.mark.quantization_w8a8 11 | @pytest.mark.timeout(900) 12 | @pytest.mark.parametrize('model', get_quantization_model_list('w8a8')) 13 | def test_quantization_w8a8(config, model, worker_id): 14 | quantization_w8a8(config, model + '-inner-w8a8', model, get_cuda_prefix_by_workerid(worker_id)) 15 | 16 | 17 | def quantization_w8a8(config, quantization_model_name, origin_model_name, cuda_prefix): 18 | quantization_type = 'w8a8' 19 | result, msg = quantization(config, quantization_model_name, origin_model_name, quantization_type, cuda_prefix) 20 | log_path = config.get('log_path') 21 | quantization_log = os.path.join( 22 | log_path, '_'.join(['quantization', quantization_type, 23 | quantization_model_name.split('/')[1]]) + '.log') 24 | 25 | allure.attach.file(quantization_log, attachment_type=allure.attachment_type.TEXT) 26 | assert result, msg 27 | -------------------------------------------------------------------------------- /autotest/utils/mp_log_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import allure 4 | from pytest_assume.plugin import assume 5 | 6 | 7 | def write_log(config, result, msg, is_new: bool = True, case_path_tag: str = 'default'): 8 | try: 9 | log_path = os.path.join(config.get('log_path'), case_path_tag) 10 | 11 | if is_new: 12 | file = open(log_path, 'w') 13 | else: 14 | file = open(log_path, 'a') 15 | 16 | file.writelines('result:' + str(result) + ', reason:' + str(msg) + '\n')  # cast to str so boolean results serialize consistently 17 | file.close() 18 | except Exception as e: 19 | return False, None, f'Unknown error: {e}' 20 | 21 | 22 | def assert_log(config, case_path_tag: str = 'default'): 23 | log_path = os.path.join(config.get('log_path'), case_path_tag) 24 | 25 | with open(log_path, 'r') as f: 26 | lines = f.readlines() 27 | result, msg = False, 'no result line found in ' + log_path  # fail by default so an empty or all-passing log cannot hit an unbound variable 28 | for line in lines: 29 | if 'result:False, reason:' in line: 30 | result = False 31 | msg = line 32 | break 33 | if 'result:True, reason:' in line and not result: 34 | result = True 35 | 36 | allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT) 37 | with assume: 38 | assert result, msg 39 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | We provide several profiling tools to benchmark our models. 4 | 5 | ## profile with dataset 6 | 7 | Download the dataset below or create your own dataset.
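If you create your own dataset instead of downloading ShareGPT, the sketch below writes a tiny file in the same ShareGPT-style layout (the `id`/`conversations`/`from`/`value` field names are assumed from the public ShareGPT dump rather than defined in this README, so verify them against the profiling script you run). Pass the resulting `my_dataset.json` to `profile_throughput.py` in place of the downloaded file.

```python
# Hypothetical helper: write a minimal ShareGPT-style dataset for profiling.
# The field names ("id", "conversations", "from", "value") are assumptions
# taken from the public ShareGPT dump, not guarantees from this repository.
import json

records = [
    {
        "id": f"dummy-{i}",
        "conversations": [
            {"from": "human", "value": "Summarize the benefits of a paged KV cache."},
            {"from": "gpt", "value": "It reduces memory fragmentation and lets the engine batch more sequences."},
        ],
    }
    for i in range(8)
]

# Dump the records to a JSON file that the profiling scripts can consume.
with open("my_dataset.json", "w") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)
```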
8 | 9 | ```bash 10 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 11 | ``` 12 | 13 | Profile your model with `profile_throughput.py`: 14 | 15 | ```bash 16 | python profile_throughput.py \ 17 | ShareGPT_V3_unfiltered_cleaned_split.json \ 18 | /path/to/your/model \ 19 | --concurrency 64 20 | ``` 21 | 22 | ## profile without dataset 23 | 24 | `profile_generation.py` performs a benchmark with dummy data. 25 | 26 | ```shell 27 | pip install nvidia-ml-py 28 | ``` 29 | 30 | ```bash 31 | python profile_generation.py \ 32 | /path/to/your/model \ 33 | --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512 34 | ``` 35 | 36 | ## profile restful api 37 | 38 | `profile_restful_api.py` is used to benchmark the api server. 39 | 40 | ```bash 41 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 42 | 43 | python3 profile_restful_api.py --backend lmdeploy --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json 44 | ``` 45 | -------------------------------------------------------------------------------- /benchmark/lmdeploy.yml: -------------------------------------------------------------------------------- 1 | num_promts: &num_prompts 1 2 | dataset_path: &dataset_path "/nvme1/shared/ShareGPT_V3_unfiltered_cleaned_split.json" 3 | dataset_name: &dataset_name "sharegpt" 4 | server: 5 | - tp: 2 6 | "model_path": "Qwen/Qwen2.5-32B-Instruct" 7 | "max-batch-size": 1024 8 | "cache-max-entry-count": 0.8 9 | - tp: 4 10 | "model_path": "Qwen/Qwen2.5-32B-Instruct" 11 | "max-batch-size": 1024 12 | "cache-max-entry-count": 0.8 13 | data: 14 | - "dataset-name": "sharegpt" 15 | "dataset-path": *dataset_path 16 | "num-prompts": *num_prompts 17 | - "dataset-name": *dataset_name 18 | "dataset-path": *dataset_path 19 | "sharegpt-output-len": 2048 20 | "num-prompts": *num_prompts 21 | - "dataset-name": *dataset_name 22 | "dataset-path": *dataset_path 23 | "sharegpt-output-len": 4096 24 | "num-prompts": *num_prompts 25 | - "dataset-name": *dataset_name 26 | "dataset-path": *dataset_path 27 | "sharegpt-output-len": 8192 28 | "num-prompts": *num_prompts 29 | - "dataset-name": *dataset_name 30 | "dataset-path": *dataset_path 31 | "sharegpt-output-len": 16384 32 | "num-prompts": *num_prompts 33 | - "dataset-name": *dataset_name 34 | "dataset-path": *dataset_path 35 | "sharegpt-output-len": 32768 36 | "num-prompts": *num_prompts 37 | -------------------------------------------------------------------------------- /builder/manywheel/README.md: -------------------------------------------------------------------------------- 1 | # Build lmdeploy manylinux wheel 2 | 3 | ## Prepare docker image 4 | 5 | To build all docker images, you can use the convenient script: 6 | 7 | ```bash 8 | ./build_all_docker.sh 9 | # Build with pushing 10 | WITH_PUSH=true ./build_all_docker.sh 11 | ``` 12 | 13 | To build a docker image with a specific CUDA version or manylinux-docker version, you may use: 14 | 15 | ```bash 16 | MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION=11.8 ./build_docker.sh 17 | ``` 18 | 19 | ## Build lmdeploy wheel 20 | 21 | ```bash 22 | ./build_all_wheel.sh 23 | ``` 24 | -------------------------------------------------------------------------------- /builder/manywheel/build_all_docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eou pipefail 4 | 5 | TOPDIR=$(git rev-parse 
--show-toplevel)/builder 6 | 7 | for cuda_version in 11.8; do 8 | MANY_LINUX_VERSION=2014 GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/manywheel/build_docker.sh" 9 | done 10 | -------------------------------------------------------------------------------- /builder/manywheel/build_all_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eou pipefail 4 | 5 | TOPDIR=$(git rev-parse --show-toplevel)/builder 6 | 7 | CUDA_VER=${CUDA_VER:-11.8} 8 | 9 | PLAT_NAME=manylinux2014_x86_64 10 | for cuver in ${CUDA_VER}; do 11 | DOCKER_TAG=cuda${cuver} 12 | OUTPUT_FOLDER=cuda${cuver}_dist 13 | for pyver in py38 py39 py310 py311 py312; do 14 | bash ${TOPDIR}/manywheel/build_wheel.sh ${pyver} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} \ 15 | |& tee ${PLAT_NAME}.${pyver}.cuda${cuver}.log.txt 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /builder/manywheel/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eou pipefail 4 | 5 | TOPDIR=$(git rev-parse --show-toplevel)/builder 6 | GPU_ARCH_VERSION=${GPU_ARCH_VERSION} 7 | WITH_PUSH=${WITH_PUSH:-} 8 | 9 | TARGET=cuda_final 10 | DOCKER_TAG=cuda${GPU_ARCH_VERSION} 11 | DOCKER_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9" 12 | DOCKER_TAG=cuda${GPU_ARCH_VERSION} 13 | 14 | DOCKER_IMAGE=openmmlab/lmdeploy-builder:${DOCKER_TAG} 15 | if [[ -n ${MANY_LINUX_VERSION} ]]; then 16 | DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} 17 | else 18 | DOCKERFILE_SUFFIX='' 19 | fi 20 | 21 | ( 22 | set -x 23 | DOCKER_BUILDKIT=1 docker build \ 24 | -t "${DOCKER_IMAGE}" \ 25 | ${DOCKER_BUILD_ARG} \ 26 | --target "${TARGET}" \ 27 | -f "${TOPDIR}/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ 28 | "${TOPDIR}" 29 | ) 30 | 31 | if [[ "${WITH_PUSH}" == true ]]; then 32 | ( 33 | set -x 34 | docker push "${DOCKER_IMAGE}" 35 | ) 36 | fi 37 | -------------------------------------------------------------------------------- /builder/manywheel/build_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eux 3 | 4 | PYTHON_VERSION="$1" 5 | PLAT_NAME="$2" 6 | DOCKER_TAG="$3" 7 | OUTPUT_DIR="$4" 8 | 9 | DOCKER_IMAGE="openmmlab/lmdeploy-builder:${DOCKER_TAG}" 10 | export USERID=$(id -u) 11 | export GROUPID=$(id -g) 12 | 13 | cd "$(dirname "$0")" # move inside the script directory 14 | mkdir -p "${OUTPUT_DIR}" 15 | docker pull ${DOCKER_IMAGE} 16 | docker run --rm -it \ 17 | --env PYTHON_VERSION="${PYTHON_VERSION}" \ 18 | --env PLAT_NAME="${PLAT_NAME}" \ 19 | --env USERID="${USERID}" \ 20 | --env GROUPID="${GROUPID}" \ 21 | --volume "$(pwd)/../../:/lmdeploy" \ 22 | --volume "$(pwd)/${OUTPUT_DIR}:/lmdeploy_build" \ 23 | --volume "$(pwd)/entrypoint_build.sh:/entrypoint_build.sh" \ 24 | --entrypoint /entrypoint_build.sh \ 25 | ${DOCKER_IMAGE} 26 | -------------------------------------------------------------------------------- /builder/manywheel/entrypoint_build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eux 3 | 4 | export PYTHON_VERSION=$PYTHON_VERSION 5 | export PLAT_NAME=$PLAT_NAME 6 | export USERID=${USERID} 7 | export GROUPID=${GROUPID} 8 | export CUDAVER=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p') 9 | export NCCL_INCLUDE_DIR=/usr/local/cuda/include 10 | export 
NCCL_LIB_DIR=/usr/local/cuda/lib64 11 | 12 | source /opt/conda/bin/activate 13 | conda activate $PYTHON_VERSION 14 | 15 | cd lmdeploy 16 | rm -rf lmdeploy/lib 17 | mkdir -p build && cd build && rm -rf * 18 | bash ../generate.sh make 19 | make -j$(nproc) && make install 20 | if [ $? != 0 ]; then 21 | echo "build failed" 22 | exit 1 23 | fi 24 | cd .. 25 | rm -rf build 26 | python setup.py bdist_wheel --cuda=${CUDAVER} --plat-name $PLAT_NAME -d /tmpbuild/ 27 | chown ${USERID}:${GROUPID} /tmpbuild/* 28 | mv /tmpbuild/* /lmdeploy_build/ 29 | -------------------------------------------------------------------------------- /builder/manywheel/scripts/install_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 6 | chmod +x Miniconda3-latest-Linux-x86_64.sh 7 | bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda 8 | rm Miniconda3-latest-Linux-x86_64.sh 9 | -------------------------------------------------------------------------------- /builder/manywheel/scripts/install_openmpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz 6 | tar xf openmpi-4.1.5.tar.gz 7 | cd openmpi-4.1.5 8 | ./configure --prefix=/usr/local/mpi 9 | make -j$(nproc) 10 | make install 11 | -------------------------------------------------------------------------------- /builder/windows/README.md: -------------------------------------------------------------------------------- 1 | # Build lmdeploy on windows 2 | 3 | ## Requirements 4 | 5 | - [CMake 3.17+](https://github.com/Kitware/CMake/releases) 6 | - [Visual Studio 2019+](https://visualstudio.microsoft.com/downloads/) 7 | - [CUDA Toolkit 11.8+](https://developer.nvidia.com/cuda-toolkit-archive) 8 | 9 | ## Build lmdeploy wheel 10 | 11 | ```powershell 12 | mkdir build 13 | cd build 14 | ..\builder\windows\generate.ps1 15 | cmake --build . --config Release -- /m 16 | cmake --install . --config Release 17 | cd .. 18 | rm build -Force -Recurse 19 | python setup.py bdist_wheel -d build\wheel 20 | ``` 21 | -------------------------------------------------------------------------------- /builder/windows/generate.ps1: -------------------------------------------------------------------------------- 1 | cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" ` 2 | -DCMAKE_BUILD_TYPE=Release ` 3 | -DCMAKE_INSTALL_PREFIX=install ` 4 | -DBUILD_PY_FFI=ON ` 5 | -DBUILD_MULTI_GPU=OFF ` 6 | -DUSE_NVTX=OFF ` 7 | -DBUILD_TEST="$env:BUILD_TEST" 8 | -------------------------------------------------------------------------------- /debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | builder="-G Ninja" 4 | 5 | if [ "$1" == "make" ]; then 6 | builder="" 7 | fi 8 | 9 | cmake ${builder} .. 
\ 10 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 11 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ 12 | -DCMAKE_INSTALL_PREFIX=./install \ 13 | -DBUILD_PY_FFI=ON \ 14 | -DBUILD_MULTI_GPU=ON \ 15 | -DCMAKE_CUDA_FLAGS="-lineinfo" \ 16 | -DUSE_NVTX=ON \ 17 | -DPYTHON_EXECUTABLE=$(which python3) \ 18 | -DBUILD_TEST=ON 19 | -------------------------------------------------------------------------------- /docker/InternVL_Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=cu12 2 | 3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12 4 | ENV CUDA_VERSION_SHORT=cu123 5 | 6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11 7 | ENV CUDA_VERSION_SHORT=cu118 8 | 9 | FROM ${CUDA_VERSION} AS final 10 | 11 | RUN python3 -m pip install timm 12 | 13 | RUN python3 -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+${CUDA_VERSION_SHORT}torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl 14 | -------------------------------------------------------------------------------- /docker/Qwen2VL_Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=cu12 2 | 3 | FROM openmmlab/lmdeploy:latest-cu12 AS cu12 4 | ENV CUDA_VERSION_SHORT=cu123 5 | 6 | FROM openmmlab/lmdeploy:latest-cu11 AS cu11 7 | ENV CUDA_VERSION_SHORT=cu118 8 | 9 | FROM ${CUDA_VERSION} AS final 10 | 11 | # we use transformers to load vision part of qwen2_vl and it needs transformers > v4.44.2 12 | RUN python3 -m pip install git+https://github.com/huggingface/transformers.git 13 | 14 | RUN python3 -m pip install qwen_vl_utils 15 | -------------------------------------------------------------------------------- /docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | formats: all 4 | 5 | build: 6 | os: "ubuntu-22.04" 7 | tools: 8 | python: "3.10" 9 | 10 | 11 | sphinx: 12 | configuration: docs/en/conf.py 13 | 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | - requirements: requirements/readthedocs.txt 19 | -------------------------------------------------------------------------------- /docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/en/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | table.autosummary td { 2 | width: 50% 3 | } 4 | 5 | img.align-center { 6 | display: block; 7 | margin-left: auto; 8 | margin-right: auto; 9 | } 10 | -------------------------------------------------------------------------------- /docs/en/api/pipeline.rst: -------------------------------------------------------------------------------- 1 | inference pipeline 2 | ================== 3 | .. 
currentmodule:: lmdeploy 4 | 5 | pipeline 6 | -------- 7 | .. autofunction:: pipeline 8 | 9 | serving 10 | -------- 11 | .. autofunction:: serve 12 | .. autofunction:: client 13 | 14 | 15 | PytorchEngineConfig 16 | ------------------- 17 | .. autoclass:: PytorchEngineConfig 18 | 19 | 20 | TurbomindEngineConfig 21 | --------------------- 22 | .. autoclass:: TurbomindEngineConfig 23 | 24 | 25 | GenerationConfig 26 | ---------------- 27 | .. autoclass:: GenerationConfig 28 | 29 | 30 | ChatTemplateConfig 31 | ------------------ 32 | .. autoclass:: ChatTemplateConfig 33 | -------------------------------------------------------------------------------- /docs/en/get_started/index.rst: -------------------------------------------------------------------------------- 1 | On Other Platforms 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: NPU(Huawei) 7 | 8 | ascend/get_started.md 9 | -------------------------------------------------------------------------------- /docs/en/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/en/multi_modal/index.rst: -------------------------------------------------------------------------------- 1 | Vision-Language Models 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Examples 7 | 8 | deepseek_vl2.md 9 | llava.md 10 | internvl.md 11 | xcomposer2d5.md 12 | cogvlm.md 13 | minicpmv.md 14 | phi3.md 15 | mllama.md 16 | qwen2_vl.md 17 | qwen2_5_vl.md 18 | molmo.md 19 | gemma3.md 20 | -------------------------------------------------------------------------------- /docs/zh_cn/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | formats: all 4 | 5 | build: 6 | os: "ubuntu-22.04" 7 | tools: 8 | python: "3.10" 9 | 10 | 11 | sphinx: 12 | configuration: docs/zh_cn/conf.py 13 | 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | - requirements: requirements/readthedocs.txt 19 | -------------------------------------------------------------------------------- /docs/zh_cn/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/zh_cn/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | table.autosummary td { 2 | width: 50% 3 | } 4 | 5 | img.align-center { 6 | display: block; 7 | margin-left: auto; 8 | margin-right: auto; 9 | } 10 | -------------------------------------------------------------------------------- /docs/zh_cn/api/pipeline.rst: -------------------------------------------------------------------------------- 1 | 推理 pipeline 2 | ================== 3 | .. currentmodule:: lmdeploy 4 | 5 | pipeline 6 | -------- 7 | .. autofunction:: pipeline 8 | 9 | serving 10 | -------- 11 | .. autofunction:: serve 12 | .. autofunction:: client 13 | 14 | 15 | PytorchEngineConfig 16 | ------------------- 17 | .. autoclass:: PytorchEngineConfig 18 | 19 | 20 | TurbomindEngineConfig 21 | --------------------- 22 | .. autoclass:: TurbomindEngineConfig 23 | 24 | 25 | GenerationConfig 26 | ---------------- 27 | .. autoclass:: GenerationConfig 28 | 29 | 30 | ChatTemplateConfig 31 | ------------------ 32 | .. autoclass:: ChatTemplateConfig 33 | -------------------------------------------------------------------------------- /docs/zh_cn/get_started/index.rst: -------------------------------------------------------------------------------- 1 | 其他软硬件平台 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: NPU(Huawei) 7 | 8 | ascend/get_started.md 9 | -------------------------------------------------------------------------------- /docs/zh_cn/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/zh_cn/multi_modal/cogvlm.md: -------------------------------------------------------------------------------- 1 | # cogvlm 2 | 3 | ## 简介 4 | 5 | CogVLM 是一个强大的开源视觉语言模型(VLM). 
LMDeploy 已在PyTorch后端支持 CogVLM-17B 模型 [THUDM/cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf) 和 CogVLM2-19B 模型如[THUDM/cogvlm2-llama3-chat-19B](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) 6 | 7 | ## 快速开始 8 | 9 | 请参考[安装文档](../get_started/installation.md)安装 LMDeploy 10 | 11 | ### 准备 12 | 13 | 当使用LMDeploy部署 **CogVLM** 模型时,需要下载模型至本地目录。由于 **CogVLM** 模型使用外部Tokenizer,因而需要将相关文件下载至模型目录。然而对于**CogVLM2**模型,则可跳过此步骤。 14 | 15 | 以 **CogVLM** 模型 `cogvlm-chat-hf` 为例,可执行如下脚本下载模型: 16 | 17 | ```shell 18 | huggingface-cli download THUDM/cogvlm-chat-hf --local-dir ./cogvlm-chat-hf --local-dir-use-symlinks False 19 | huggingface-cli download lmsys/vicuna-7b-v1.5 special_tokens_map.json tokenizer.model tokenizer_config.json --local-dir ./cogvlm-chat-hf --local-dir-use-symlinks False 20 | ``` 21 | 22 | ### 离线推理 pipeline 23 | 24 | 以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md) 25 | 26 | ```python 27 | from lmdeploy import pipeline 28 | from lmdeploy.vl import load_image 29 | 30 | 31 | if __name__ == "__main__": 32 | pipe = pipeline('cogvlm-chat-hf') 33 | 34 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') 35 | response = pipe(('describe this image', image)) 36 | print(response) 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/zh_cn/multi_modal/gemma3.md: -------------------------------------------------------------------------------- 1 | # Gemma3 2 | 3 | ## 简介 4 | 5 | Gemma 是 Google 推出的轻量级、最先进的开放模型系列,采用与创建 Gemini 模型相同的研究和技术构建而成。Gemma3 模型是多模态模型,可处理文本和图像输入并生成文本输出,对预训练和指令微调均具有开源的权重。Gemma3 具有 128K 的大型上下文窗口,支持 140 多种语言,并且比以前的版本提供更多尺寸。Gemma3 模型非常适合各种文本生成和图像理解任务,包括问答、总结和推理。它们的尺寸相对较小,因此可以将其部署在资源有限的环境中,例如笔记本电脑、台式机或您自己的云基础设施,从而让每个人都能轻松访问最先进的 AI 模型,并帮助促进创新。 6 | 7 | ## 快速开始 8 | 9 | 请参考[安装文档](../get_started/installation.md)安装 LMDeploy。 10 | 11 | ### 准备 12 | 13 | 在使用 LMDeploy 部署 **Gemma3** 模型时,请安装最新的 transformers。 14 | 15 | ### 离线推理 pipeline 16 | 17 | 以下是使用pipeline进行离线推理的示例,更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md)。 18 | 19 | ```python 20 | from lmdeploy import pipeline 21 | from lmdeploy.vl import load_image 22 | 23 | 24 | if __name__ == "__main__": 25 | pipe = pipeline('google/gemma-3-12b-it') 26 | 27 | image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg') 28 | response = pipe(('describe this image', image)) 29 | print(response) 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/zh_cn/multi_modal/index.rst: -------------------------------------------------------------------------------- 1 | 视觉语言模型 2 | ================================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: 示例 7 | 8 | deepseek_vl2.md 9 | llava.md 10 | internvl.md 11 | xcomposer2d5.md 12 | cogvlm.md 13 | minicpmv.md 14 | phi3.md 15 | mllama.md 16 | qwen2_vl.md 17 | qwen2_5_vl.md 18 | molmo.md 19 | gemma3.md 20 | -------------------------------------------------------------------------------- /generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | WORKSPACE_PATH=$(dirname "$(readlink -f "$0")") 3 | 4 | builder="-G Ninja" 5 | 6 | if [ "$1" == "make" ]; then 7 | builder="" 8 | fi 9 | 10 | cmake ${builder} .. 
\ 11 | -DCMAKE_BUILD_TYPE=RelWithDebInfo \ 12 | -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ 13 | -DCMAKE_INSTALL_PREFIX=${WORKSPACE_PATH}/install \ 14 | -DBUILD_PY_FFI=ON \ 15 | -DBUILD_MULTI_GPU=ON \ 16 | -DCMAKE_CUDA_FLAGS="-lineinfo" \ 17 | -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \ 18 | -DUSE_NVTX=ON 19 | -------------------------------------------------------------------------------- /k8s/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app: internlm2-chat-7b 6 | name: internlm2-chat-7b-svc 7 | spec: 8 | ports: 9 | - name: main 10 | port: 23333 11 | protocol: TCP 12 | targetPort: main 13 | selector: 14 | app: internlm2-chat-7b 15 | type: ClusterIP 16 | -------------------------------------------------------------------------------- /lmdeploy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | from .api import client, pipeline, serve 4 | from .messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, VisionConfig 5 | from .model import ChatTemplateConfig 6 | from .tokenizer import Tokenizer 7 | from .version import __version__, version_info 8 | 9 | __all__ = [ 10 | 'pipeline', 'serve', 'client', 'Tokenizer', 'GenerationConfig', '__version__', 'version_info', 'ChatTemplateConfig', 11 | 'PytorchEngineConfig', 'TurbomindEngineConfig', 'VisionConfig' 12 | ] 13 | -------------------------------------------------------------------------------- /lmdeploy/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .cli import run 3 | 4 | if __name__ == '__main__': 5 | run() 6 | -------------------------------------------------------------------------------- /lmdeploy/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .entrypoint import run 3 | 4 | __all__ = ['run'] 5 | -------------------------------------------------------------------------------- /lmdeploy/lite/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .apis import * # noqa: F401,F403 3 | from .quantization import * # noqa: F401,F403 4 | from .utils import * # noqa: F401,F403 5 | -------------------------------------------------------------------------------- /lmdeploy/lite/apis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/lite/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from torch import nn 3 | 4 | OFFLOAD_MOD = (nn.Linear, ) 5 | KV_CACHE_SIGNATURE = 'past_key_value' 6 | -------------------------------------------------------------------------------- /lmdeploy/lite/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
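# NOTE: illustrative usage sketch only -- an assumption for documentation, not
# upstream lmdeploy code. It shows how the symbols re-exported by
# `lmdeploy/__init__.py` above are typically combined. The model path is a
# placeholder and must point to a real HuggingFace repo id or local directory.
def _pipeline_usage_demo(model_path='/path/to/a-chat-model'):
    from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

    pipe = pipeline(model_path, backend_config=TurbomindEngineConfig(tp=1))
    gen_config = GenerationConfig(max_new_tokens=128, top_p=0.8, temperature=0.7)
    return pipe(['Hello, please introduce yourself.'], gen_config=gen_config)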
2 | -------------------------------------------------------------------------------- /lmdeploy/lite/modeling/internlm2_gptq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from auto_gptq.modeling import BaseGPTQForCausalLM 3 | 4 | 5 | class InternLM2GPTQForCausalLM(BaseGPTQForCausalLM): 6 | layer_type = 'InternLM2DecoderLayer' 7 | layers_block_name = 'model.layers' 8 | outside_layer_modules = ['model.tok_embeddings', 'model.norm'] 9 | inside_layer_modules = [ 10 | ['attention.wqkv'], 11 | ['attention.wo'], 12 | ['feed_forward.w3', 'feed_forward.w1'], 13 | ['feed_forward.w2'], 14 | ] 15 | -------------------------------------------------------------------------------- /lmdeploy/lite/modeling/internlm3_gptq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from auto_gptq.modeling import BaseGPTQForCausalLM 3 | 4 | 5 | class InternLM3GPTQForCausalLM(BaseGPTQForCausalLM): 6 | layer_type = 'InternLM3DecoderLayer' 7 | layers_block_name = 'model.layers' 8 | outside_layer_modules = ['model.embed_tokens', 'model.norm'] 9 | inside_layer_modules = [ 10 | ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], 11 | ['self_attn.o_proj'], 12 | ['mlp.up_proj', 'mlp.gate_proj'], 13 | ['mlp.down_proj'], 14 | ] 15 | -------------------------------------------------------------------------------- /lmdeploy/lite/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .activation import ActivationObserver, KVCacheObserver 3 | from .calibration import CalibrationContext, CalibrationContextV2 4 | from .weight import WeightQuantizer 5 | 6 | __all__ = ['WeightQuantizer', 'ActivationObserver', 'KVCacheObserver', 'CalibrationContext', 'CalibrationContextV2'] 7 | -------------------------------------------------------------------------------- /lmdeploy/lite/quantization/activation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .observer import ActivationObserver, KVCacheObserver 3 | 4 | __all__ = ['ActivationObserver', 'KVCacheObserver'] 5 | -------------------------------------------------------------------------------- /lmdeploy/lite/quantization/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .linear import WeightOnlyQLinear 3 | 4 | __all__ = ['WeightOnlyQLinear'] 5 | -------------------------------------------------------------------------------- /lmdeploy/lite/quantization/weight/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .quantizer import WeightQuantizer 3 | 4 | __all__ = ['WeightQuantizer'] 5 | -------------------------------------------------------------------------------- /lmdeploy/lite/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
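# NOTE: illustrative sketch only -- an assumption for documentation, not
# upstream lmdeploy code. The `cal_qparams_per_group_*` helpers imported below
# compute quantization parameters; this plain-PyTorch snippet shows the
# per-group absmax idea they are named after (int4 range [-8, 7]).
def _per_group_absmax_int4_demo():
    import torch
    w = torch.randn(4, 64)                               # [out_features, in_features]
    group_size = 16
    wg = w.view(w.shape[0], -1, group_size)              # [out, n_groups, group_size]
    scales = wg.abs().amax(dim=-1, keepdim=True) / 7.0   # one scale per group
    w_q = torch.clamp(torch.round(wg / scales), -8, 7)
    w_dq = (w_q * scales).view_as(w)                     # dequantized approximation
    return (w - w_dq).abs().max()                        # worst-case rounding error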
2 | 3 | from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs 4 | from .cal_qparams import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax, 5 | cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax, 6 | cal_qparams_per_tensor_minmax, precise_round) 7 | from .calib_dataloader import get_calib_loaders 8 | from .collect import bimap_name_mod, collect_target_modules, collect_target_weights 9 | from .global_avail import GlobalAvailMixin 10 | from .load import load_hf_from_pretrained 11 | 12 | __all__ = [ 13 | 'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax', 'cal_qparams_per_group_absmax', 14 | 'cal_qparams_per_group_minmax', 'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax', 'QParams', 15 | 'get_calib_loaders', 'collect_target_modules', 'precise_round', 'collect_target_weights', 'GlobalAvailMixin', 16 | 'split_decoder_layer_inputs', 'bimap_name_mod', 'concat_decoder_layer_outputs', 'load_hf_from_pretrained' 17 | ] 18 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/adapter/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import OpType # noqa: F401 3 | from .selector import get_backend # noqa: F401 4 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class SiluAndMulImpl(ABC): 6 | """Silu + multiple residual fused implementation.""" 7 | 8 | @abstractmethod 9 | def forward(self, x): 10 | """forward.""" 11 | raise NotImplementedError 12 | 13 | 14 | class SiluAndMulBuilder(ABC): 15 | """Silu and mul implementation builder.""" 16 | 17 | @staticmethod 18 | @abstractmethod 19 | def build(inplace: bool = False): 20 | """build.""" 21 | raise NotImplementedError 22 | 23 | 24 | class GeluAndMulImpl(ABC): 25 | """Gelu + multiple residual fused implementation.""" 26 | 27 | @abstractmethod 28 | def forward(self, x): 29 | """forward.""" 30 | raise NotImplementedError 31 | 32 | 33 | class GeluAndMulBuilder(ABC): 34 | """Gelu and mul implementation builder.""" 35 | 36 | @staticmethod 37 | @abstractmethod 38 | def build(approximate: str = 'none'): 39 | """build.""" 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/apply_rotary_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
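# NOTE: illustrative sketch only -- an assumption for documentation, not
# upstream lmdeploy code. The abstract interface below only fixes the call
# signature; the math a concrete backend implements is the usual rotary
# position embedding, written here in plain PyTorch for reference.
def _naive_rotary_embedding_demo(seq_len=4, num_heads=8, head_dim=64, base=10000.0):
    import torch

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    query = torch.randn(seq_len, num_heads, head_dim)
    key = torch.randn(seq_len, num_heads, head_dim)
    inv_freq = 1.0 / (base**(torch.arange(0, head_dim, 2).float() / head_dim))
    freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
    cos = torch.cat((freqs, freqs), dim=-1).cos()[:, None, :]   # [seq, 1, head_dim]
    sin = torch.cat((freqs, freqs), dim=-1).sin()[:, None, :]
    q_embed = query * cos + rotate_half(query) * sin
    k_embed = key * cos + rotate_half(key) * sin
    return q_embed, k_embed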
2 | from abc import ABC, abstractmethod 3 | 4 | from torch import Tensor 5 | 6 | 7 | class ApplyRotaryEmbImpl(ABC): 8 | """Apply rotary embedding implementation.""" 9 | 10 | @abstractmethod 11 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True): 12 | """forward.""" 13 | raise NotImplementedError 14 | 15 | 16 | class ApplyRotaryEmbBuilder(ABC): 17 | """Apply rotary embedding implementation builder.""" 18 | 19 | @staticmethod 20 | @abstractmethod 21 | def build(): 22 | """Build implementation.""" 23 | raise NotImplementedError 24 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/awq_modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | from typing import Optional 4 | 5 | import torch 6 | 7 | 8 | class LinearW4A16Impl(ABC): 9 | """W4a16 linear implementation.""" 10 | 11 | def update_weights(self, 12 | qweight: torch.Tensor, 13 | scales: torch.Tensor, 14 | qzeros: torch.Tensor, 15 | bias: Optional[torch.Tensor] = None): 16 | """Update weights.""" 17 | return qweight, scales, qzeros, bias 18 | 19 | @abstractmethod 20 | def forward(self, x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, all_reduce: bool = False): 21 | """forward.""" 22 | raise NotImplementedError 23 | 24 | 25 | class LinearW4A16Builder(ABC): 26 | """W4a16 linear implementation builder.""" 27 | 28 | @staticmethod 29 | @abstractmethod 30 | def build(in_features: int, 31 | out_features: int, 32 | w_bit: int, 33 | group_size: int, 34 | bias: bool = False, 35 | dtype: torch.dtype = None): 36 | """build.""" 37 | raise NotImplementedError 38 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/blockedf8_modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | from typing import List, Optional 4 | 5 | import torch 6 | 7 | 8 | class LinearBlockedF8Impl(ABC): 9 | """Linear BlockedF8 implementation api.""" 10 | 11 | def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None): 12 | """Update weights.""" 13 | return weight, scale, bias 14 | 15 | @abstractmethod 16 | def forward(self, 17 | x, 18 | weight: torch.Tensor, 19 | scale: torch.Tensor, 20 | bias: Optional[torch.Tensor] = None, 21 | all_reduce: bool = False, 22 | rank: int = 0, 23 | scatter_size: List[int] = None): 24 | """forward.""" 25 | raise NotImplementedError 26 | 27 | 28 | class LinearBlockedF8Builder(ABC): 29 | """Linear BlockedF8 implementation builder.""" 30 | 31 | @staticmethod 32 | @abstractmethod 33 | def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None): 34 | """build.""" 35 | raise NotImplementedError 36 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .op_backend import CudaOpsBackend # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/cuda/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. 
All rights reserved. 2 | from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul 3 | 4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl 5 | 6 | 7 | class TritonSiluAndMulImpl(SiluAndMulImpl): 8 | """Silu + multiple residual fused implementation.""" 9 | 10 | def __init__(self, inplace: bool): 11 | self.inplace = inplace 12 | 13 | def forward(self, x): 14 | """forward.""" 15 | out = None 16 | x_shape = None 17 | if x.dim() != 2: 18 | x_shape = x.shape 19 | x = x.flatten(0, -2) 20 | if self.inplace: 21 | out = x.chunk(2, -1)[0] 22 | 23 | out = silu_and_mul(x, out) 24 | 25 | if x_shape is not None: 26 | out = out.unflatten(0, x_shape[:-1]) 27 | return out 28 | 29 | 30 | class TritonSiluAndMulBuilder(SiluAndMulBuilder): 31 | """Silu and mul implementation builder.""" 32 | 33 | @staticmethod 34 | def build(inplace: bool = False): 35 | """build.""" 36 | return TritonSiluAndMulImpl(inplace) 37 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/cuda/apply_rotary_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | from torch import Tensor 4 | 5 | from lmdeploy.pytorch.kernels.cuda import apply_rotary_pos_emb 6 | 7 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl 8 | 9 | 10 | class TritonApplyRotaryEmbImpl(ApplyRotaryEmbImpl): 11 | """Apply rotary embedding implementation.""" 12 | 13 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True): 14 | """forward.""" 15 | if inplace: 16 | q_embed = query 17 | k_embed = key 18 | else: 19 | q_embed = torch.empty_like(query) 20 | k_embed = torch.empty_like(key) 21 | return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed) 22 | 23 | 24 | class TritonApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder): 25 | """Apply rotary embedding implementation builder.""" 26 | 27 | @staticmethod 28 | def build(): 29 | """Build implementation.""" 30 | return TritonApplyRotaryEmbImpl() 31 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/cuda/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | import torch 4 | 5 | from lmdeploy.pytorch.kernels.cuda import multinomial_sampling 6 | 7 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl 8 | 9 | 10 | class TritonMultinomialSamplingImpl(MultinomialSamplingImpl): 11 | 12 | def forward(self, 13 | scores: torch.Tensor, 14 | seeds: torch.LongTensor, 15 | offsets: torch.LongTensor, 16 | indices: torch.Tensor = None): 17 | """forward.""" 18 | return multinomial_sampling(scores, seeds, offsets, indices) 19 | 20 | 21 | class TritonMultinomialSamplingBuilder(MultinomialSamplingBuilder): 22 | """Triton multinomial sampling builder.""" 23 | 24 | def build(): 25 | """build.""" 26 | return TritonMultinomialSamplingImpl() 27 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/cuda/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
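# NOTE: illustrative sketch only -- an assumption for documentation, not
# upstream lmdeploy code. It spells out the semantics assumed for the
# Triton-backed `rms_norm` wrapped below: optionally add the residual first,
# normalize by the root mean square over the hidden dimension, then rescale
# with the learned weight.
def _rms_norm_reference(x, weight, eps=1e-6, residual=None):
    import torch
    if residual is not None:
        x = x + residual
        residual = x                      # fused kernels return the updated residual
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    out = (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
    return out if residual is None else (out, residual)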
2 | import torch 3 | 4 | from lmdeploy.pytorch.kernels.cuda import rms_norm 5 | 6 | from ..norm import RMSNormBuilder, RMSNormImpl 7 | 8 | 9 | class TritonRMSNormImpl(RMSNormImpl): 10 | """Triton RMS norm implementation.""" 11 | 12 | def __init__(self, hidden_size: int, eps: float = 1e-6): 13 | self.hidden_size = hidden_size 14 | self.eps = eps 15 | 16 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None): 17 | """forward.""" 18 | if residual is None: 19 | x = rms_norm(x, weight, self.eps) 20 | return x 21 | else: 22 | x, residual = rms_norm(x, weight, self.eps, residual=residual) 23 | return x, residual 24 | 25 | 26 | class TritonRMSNormBuilder(RMSNormBuilder): 27 | """Triton RMS norm implementation builder.""" 28 | 29 | @staticmethod 30 | def build(weight: torch.Tensor, eps: float = 1e-6): 31 | """build.""" 32 | return TritonRMSNormImpl(weight, eps) 33 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/default/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .op_backend import DefaultOpsBackend # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/default/moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from ..moe import SoftmaxTopKBuilder, SoftmaxTopKImpl 5 | 6 | 7 | class DefaultSoftmaxTopKImpl(SoftmaxTopKImpl): 8 | """RMS norm implementation api.""" 9 | 10 | def __init__(self, top_k: int, dim: int = -1): 11 | self.top_k = top_k 12 | self.dim = dim 13 | 14 | def forward(self, x: torch.Tensor): 15 | """forward.""" 16 | routing_weights = torch.softmax(x, dim=self.dim, dtype=torch.float32) 17 | topk_weights, topk_ids = torch.topk(routing_weights, self.top_k, dim=self.dim) 18 | return topk_weights, topk_ids 19 | 20 | 21 | class DefaultSoftmaxTopKBuilder(SoftmaxTopKBuilder): 22 | """RMS norm implementation builder.""" 23 | 24 | @staticmethod 25 | def build(top_k: int, dim: int = -1): 26 | """build.""" 27 | return DefaultSoftmaxTopKImpl(top_k, dim) 28 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/default/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
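# NOTE: illustrative usage sketch only -- an assumption for documentation, not
# upstream lmdeploy code. It exercises the softmax-top-k router defined in
# `backends/default/moe.py` above: gating logits go in, per-token expert
# weights and expert ids come out.
def _softmax_topk_demo():
    import torch
    from lmdeploy.pytorch.backends.default.moe import DefaultSoftmaxTopKBuilder

    logits = torch.randn(4, 8)                       # [num_tokens, num_experts]
    router = DefaultSoftmaxTopKBuilder.build(top_k=2)
    topk_weights, topk_ids = router.forward(logits)
    return topk_weights.shape, topk_ids.shape        # both torch.Size([4, 2])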
2 | 3 | import torch 4 | 5 | from ..multinomial_sampling import MultinomialSamplingBuilder, MultinomialSamplingImpl 6 | 7 | 8 | class DefaultMultinomialSamplingImpl(MultinomialSamplingImpl): 9 | """Multinomial sampling implementation api.""" 10 | 11 | def forward(self, 12 | scores: torch.Tensor, 13 | seeds: torch.LongTensor, 14 | offsets: torch.LongTensor, 15 | indices: torch.Tensor = None): 16 | """forward.""" 17 | sampled_index = torch.multinomial(scores, num_samples=1, replacement=True) 18 | outputs = torch.gather(indices, dim=1, index=sampled_index) 19 | return outputs.view(-1) 20 | 21 | 22 | class DefaultMultinomialSamplingBuilder(MultinomialSamplingBuilder): 23 | """Multinomial sampling implementation builder.""" 24 | 25 | def build(): 26 | """build.""" 27 | return DefaultMultinomialSamplingImpl() 28 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .ascend import AscendOpsBackend # noqa: F401 3 | from .camb import CambOpsBackend # noqa: F401 4 | from .maca import MacaOpsBackend # noqa: F401 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from lmdeploy.pytorch.kernels.dlinfer.activation import silu_and_mul 3 | 4 | from ..activation import SiluAndMulBuilder, SiluAndMulImpl 5 | 6 | 7 | class DlinferSiluAndMulImpl(SiluAndMulImpl): 8 | """Silu + multiple fused implementation.""" 9 | 10 | def forward(self, x): 11 | """forward.""" 12 | return silu_and_mul(x) 13 | 14 | 15 | class DlinferSiluAndMulBuilder(SiluAndMulBuilder): 16 | """Silu and mul implementation builder.""" 17 | 18 | @staticmethod 19 | def build(inplace: bool = False): 20 | """build.""" 21 | return DlinferSiluAndMulImpl() 22 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/apply_rotary_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from torch import Tensor 3 | 4 | from lmdeploy.pytorch.kernels.dlinfer import apply_rotary_pos_emb 5 | 6 | from ..apply_rotary_emb import ApplyRotaryEmbBuilder, ApplyRotaryEmbImpl 7 | 8 | 9 | class DlinferApplyRotaryEmbImpl(ApplyRotaryEmbImpl): 10 | """Apply rotary embedding implementation.""" 11 | 12 | def forward(self, query: Tensor, key: Tensor, cos: Tensor, sin: Tensor, inplace: bool = True): 13 | """forward.""" 14 | if inplace: 15 | q_embed = None 16 | k_embed = None 17 | else: 18 | q_embed = query.new_empty(query.shape) 19 | k_embed = key.new_empty(key.shape) 20 | return apply_rotary_pos_emb(query, key, cos, sin, q_embed, k_embed) 21 | 22 | 23 | class DlinferApplyRotaryEmbBuilder(ApplyRotaryEmbBuilder): 24 | """Apply rotary embedding implementation builder.""" 25 | 26 | @staticmethod 27 | def build(): 28 | """Build implementation.""" 29 | return DlinferApplyRotaryEmbImpl() 30 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/ascend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .op_backend import AscendOpsBackend, SocVersion # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/camb/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .op_backend import CambOpsBackend # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/maca/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .op_backend import MacaOpsBackend # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/dlinfer/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from lmdeploy.pytorch.kernels.dlinfer import rms_norm 5 | 6 | from ..norm import RMSNormBuilder, RMSNormImpl 7 | 8 | 9 | class DlinferRMSNormImpl(RMSNormImpl): 10 | """Dlinfer RMS norm implementation.""" 11 | 12 | def __init__(self, hidden_size: int, eps: float = 1e-6): 13 | self.hidden_size = hidden_size 14 | self.eps = eps 15 | 16 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None): 17 | """forward.""" 18 | if residual is None: 19 | x = rms_norm(x, weight, self.eps) 20 | return x 21 | else: 22 | x, residual = rms_norm(x, weight, self.eps, residual=residual) 23 | return x, residual 24 | 25 | 26 | class DlinferRMSNormBuilder(RMSNormBuilder): 27 | """Dlinfer RMS norm implementation builder.""" 28 | 29 | @staticmethod 30 | def build(weight: torch.Tensor, eps: float = 1e-6): 31 | """build.""" 32 | return DlinferRMSNormImpl(weight, eps) 33 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/flash_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | 4 | from torch import Tensor 5 | 6 | 7 | class FlashAttentionImpl(ABC): 8 | """FlashAttention implementation.""" 9 | 10 | def forward(self, 11 | query: Tensor, 12 | key: Tensor, 13 | value: Tensor, 14 | q_start_loc: Tensor, 15 | q_seqlens: Tensor, 16 | kv_start_loc: Tensor, 17 | kv_seqlens: Tensor, 18 | max_q_seqlen: int = None): 19 | """forward.""" 20 | raise NotImplementedError 21 | 22 | 23 | class FlashAttentionBuilder(ABC): 24 | """FlashAttention implementation builder.""" 25 | 26 | @staticmethod 27 | @abstractmethod 28 | def build( 29 | num_heads: int, 30 | head_dim: int, 31 | scale: float = None, 32 | num_kv_heads: int = None, 33 | v_head_dim: int = None, 34 | causal: bool = True, 35 | sliding_window: int = None, 36 | logical_softcapping: float = None, 37 | **kwargs, 38 | ) -> FlashAttentionImpl: 39 | """build.""" 40 | raise NotImplementedError 41 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
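# NOTE: illustrative sketch only -- an assumption for documentation, not
# upstream lmdeploy code. The `FlashAttentionImpl` interface above is filled
# in differently per backend, but the quantity every implementation computes
# is plain scaled dot-product attention, shown here naively for a single
# causal sequence.
def _naive_causal_attention(query, key, value, scale=None):
    # query/key/value: [seq_len, num_heads, head_dim]
    import math
    import torch
    scale = scale or 1.0 / math.sqrt(query.shape[-1])
    scores = torch.einsum('qhd,khd->hqk', query, key) * scale
    causal = torch.ones(scores.shape[-2:], dtype=torch.bool).tril()
    scores = scores.masked_fill(~causal, float('-inf'))
    return torch.einsum('hqk,khd->qhd', scores.softmax(dim=-1), value)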
2 | from abc import ABC, abstractmethod 3 | from typing import List, Optional 4 | 5 | import torch 6 | 7 | 8 | class LinearImpl(ABC): 9 | """Linear implementation api.""" 10 | 11 | def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): 12 | """Update weights.""" 13 | return weight, bias 14 | 15 | @abstractmethod 16 | def forward(self, 17 | x, 18 | weight: torch.Tensor, 19 | bias: Optional[torch.Tensor] = None, 20 | all_reduce: bool = False, 21 | rank: int = 0, 22 | scatter_size: List[int] = None): 23 | """forward.""" 24 | raise NotImplementedError 25 | 26 | 27 | class LinearBuilder(ABC): 28 | """Linear implementation builder.""" 29 | 30 | @staticmethod 31 | @abstractmethod 32 | def build(in_features: int, out_features: int, bias: bool = True, dtype: torch.dtype = None): 33 | """build.""" 34 | raise NotImplementedError 35 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | 4 | import torch 5 | 6 | 7 | class MultinomialSamplingImpl(ABC): 8 | """Multinomial sampling implementation api.""" 9 | 10 | @abstractmethod 11 | def forward(scores: torch.Tensor, seeds: torch.LongTensor, offsets: torch.LongTensor, indices: torch.Tensor = None): 12 | """forward.""" 13 | raise NotImplementedError 14 | 15 | 16 | class MultinomialSamplingBuilder(ABC): 17 | """Multinomial sampling implementation builder.""" 18 | 19 | @staticmethod 20 | @abstractmethod 21 | def build(): 22 | """build.""" 23 | raise NotImplementedError 24 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/backends/norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABC, abstractmethod 3 | 4 | import torch 5 | 6 | 7 | class RMSNormImpl(ABC): 8 | """RMS norm implementation api.""" 9 | 10 | @abstractmethod 11 | def forward(self, x: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor = None): 12 | """forward.""" 13 | raise NotImplementedError 14 | 15 | 16 | class RMSNormBuilder(ABC): 17 | """RMS norm implementation builder.""" 18 | 19 | @staticmethod 20 | @abstractmethod 21 | def build(hidden_size: int, eps: float = 1e-6): 22 | """build.""" 23 | raise NotImplementedError 24 | 25 | 26 | class LayerNormImpl(ABC): 27 | """Layer norm implementation api.""" 28 | 29 | @abstractmethod 30 | def forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor = None, residual: torch.Tensor = None): 31 | """forward.""" 32 | raise NotImplementedError 33 | 34 | 35 | class LayerNormBuilder(ABC): 36 | """Layer norm implementation builder.""" 37 | 38 | @staticmethod 39 | @abstractmethod 40 | def build(normalized_shape: int, eps: float = 1e-6): 41 | """build.""" 42 | raise NotImplementedError 43 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/adapter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .base import BaseChecker 3 | 4 | 5 | class AdapterChecker(BaseChecker): 6 | """Check adapter is available.""" 7 | 8 | def __init__(self, adapter_path: str, logger=None): 9 | super().__init__(logger) 10 | self.adapter_path = adapter_path 11 | 12 | def check(self): 13 | """check.""" 14 | path = self.adapter_path 15 | 16 | try: 17 | import peft # noqa: F401 18 | except Exception as e: 19 | self.log_and_exit(e, 'Adapter', message='Failed to import peft.') 20 | 21 | try: 22 | from peft import PeftConfig 23 | PeftConfig.from_pretrained(path) 24 | except Exception as e: 25 | message = ('Please make sure the adapter can be loaded with ' 26 | '`peft.PeftConfig.from_pretrained`\n') 27 | err_msg = '' if len(e.args) == 0 else e.args[0] 28 | if 'got an unexpected keyword argument' in err_msg: 29 | message += ('Or try remove all unexpected keywords ' 30 | 'in `adapter_config.json`.') 31 | self.log_and_exit(e, 'Adapter', message=message) 32 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/deeplink.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from lmdeploy.utils import try_import_deeplink 3 | 4 | from .base import BaseChecker 5 | 6 | 7 | class DeeplinkChecker(BaseChecker): 8 | """Check pytorch is available.""" 9 | 10 | def __init__(self, device_type: str, logger=None) -> None: 11 | super().__init__(logger=logger) 12 | self.device_type = device_type 13 | 14 | def check(self): 15 | """check.""" 16 | try_import_deeplink(self.device_type) 17 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/torch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseChecker 3 | 4 | 5 | class TorchChecker(BaseChecker): 6 | """Check pytorch is available.""" 7 | 8 | def __init__(self, device: str = 'cuda', logger=None) -> None: 9 | super().__init__(logger=logger) 10 | self.device = device 11 | 12 | def check(self): 13 | """check.""" 14 | try: 15 | import torch 16 | a = torch.tensor([1, 2], device=self.device) 17 | b = a.new_tensor([3, 4], device=self.device) 18 | c = a + b 19 | torch.testing.assert_close(c, a.new_tensor([4, 6])) 20 | except Exception as e: 21 | self.log_and_exit(e, 'PyTorch', 'PyTorch is not available.') 22 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/transformers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from packaging import version 3 | 4 | from .base import BaseChecker 5 | 6 | MIN_TRANSFORMERS_VERSION = '4.33.0' 7 | MAX_TRANSFORMERS_VERSION = '4.49.0' 8 | 9 | 10 | class TransformersChecker(BaseChecker): 11 | """Check transformers is available.""" 12 | 13 | def check(self): 14 | """check.""" 15 | import transformers 16 | logger = self.get_logger() 17 | try: 18 | trans_version = version.parse(transformers.__version__) 19 | min_version = version.parse(MIN_TRANSFORMERS_VERSION) 20 | max_version = version.parse(MAX_TRANSFORMERS_VERSION) 21 | if trans_version < min_version or trans_version > max_version: 22 | logger.warning('LMDeploy requires transformers version: ' 23 | f'[{MIN_TRANSFORMERS_VERSION} ~ ' 24 | f'{MAX_TRANSFORMERS_VERSION}], ' 25 | 'but found version: ' 26 | f'{transformers.__version__}') 27 | except Exception as e: 28 | self.log_and_exit(e, 'transformers', 'transformers is not available.') 29 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/check_env/triton_custom_add.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import triton 4 | import triton.language as tl 5 | 6 | 7 | @triton.jit 8 | def _add_kernel(A, B, C, size, BLOCK: tl.constexpr): 9 | """Add kernel.""" 10 | prog_id = tl.program_id(0) 11 | offs = prog_id * BLOCK + tl.arange(0, BLOCK) 12 | a = tl.load(A + offs, mask=offs < size) 13 | b = tl.load(B + offs, mask=offs < size) 14 | tl.store(C + offs, a + b, mask=offs < size) 15 | 16 | 17 | def custom_add(a, b): 18 | """Custom add one.""" 19 | c = torch.empty_like(a) 20 | size = c.size(0) 21 | BLOCK = 16 22 | 23 | grid = (triton.cdiv(size, BLOCK), ) 24 | _add_kernel[grid](a, b, c, size, BLOCK=BLOCK) 25 | return c 26 | 27 | 28 | if __name__ == '__main__': 29 | a = torch.tensor([1, 2], device='cuda') 30 | b = a.new_tensor([3, 4], device='cuda') 31 | c = custom_add(a, b) 32 | torch.testing.assert_close(c, a + b) 33 | print('Done.') 34 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import importlib 3 | import pkgutil 4 | 5 | from .builder import AutoModelConfigBuilder 6 | 7 | __all__ = [] 8 | 9 | # load all submodule 10 | for loader, module_name, is_pkg in pkgutil.walk_packages(__path__): 11 | __all__.append(module_name) 12 | _module = importlib.import_module('{}.{}'.format(__name__, module_name)) 13 | globals()[module_name] = _module 14 | 15 | __all__ += ['AutoModelConfigBuilder'] 16 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/cogvlm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class CogVLMModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | model_arch = hf_config.architectures[0] if hf_config.architectures else None 12 | return model_arch == 'CogVLMForCausalLM' 13 | 14 | @classmethod 15 | def build(cls, hf_config, model_path: str = None, **kwargs): 16 | """build.""" 17 | from lmdeploy.utils import is_bf16_supported 18 | if getattr(hf_config, 'num_multi_query_heads', None): 19 | hf_config.num_key_value_heads = hf_config.num_multi_query_heads 20 | else: 21 | hf_config.num_key_value_heads = hf_config.num_attention_heads 22 | 23 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs) 24 | cfg.cogvlm_style = True 25 | torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16' 26 | hf_config.torch_dtype = torch_dtype 27 | return cfg 28 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/deepseek_vl2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class DeepseekVLV2ModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.model_type in ['deepseek_vl_v2'] 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """Build deepseek-vl2.""" 16 | 17 | if hf_config.language_config.use_mla: 18 | from .deepseek_v2 import DeepseekV2ModelConfigBuilder 19 | cfg = DeepseekV2ModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs) 20 | cfg.hf_config = hf_config 21 | else: 22 | # deepseek-vl2-tiny uses MHA, rather than MLA 23 | # in this case, we use DefaultModelConfigBuilder 24 | cfg = DefaultModelConfigBuilder.build(hf_config.language_config, model_path, **kwargs) 25 | cfg.hf_config = hf_config 26 | 27 | return cfg 28 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/gemma.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class GemmaModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.model_type in ['gemma', 'gemma2', 'gemma3_text'] 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """Build gemma.""" 16 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs) 17 | cfg.head_dim = hf_config.head_dim 18 | return cfg 19 | 20 | 21 | class GemmaVLModelConfigBuilder(AutoModelConfigBuilder): 22 | 23 | @classmethod 24 | def condition(cls, hf_config): 25 | """config.""" 26 | model_arch = hf_config.architectures[0] if hf_config.architectures else None 27 | return model_arch == 'Gemma3ForConditionalGeneration' 28 | 29 | @classmethod 30 | def build(cls, hf_config, model_path: str = None, **kwargs): 31 | """Build gemma.""" 32 | hf_config.text_config.architectures = ['Gemma3ForCausalLM'] 33 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs) 34 | cfg.hf_config = hf_config 35 | return cfg 36 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/internvl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class InternVLModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.architectures[0] == 'InternVLChatModel' 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """Build llava hf.""" 16 | cfg = DefaultModelConfigBuilder.build(hf_config.llm_config, model_path, **kwargs) 17 | cfg.hf_config = hf_config 18 | return cfg 19 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/llama4.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class Llama4ModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.model_type in ['llama4'] 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """Build llama4.""" 16 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs) 17 | cfg.hf_config = hf_config 18 | 19 | return cfg 20 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/minicpm3.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | from .builder import AutoModelConfigBuilder 4 | from .default import DefaultModelConfigBuilder 5 | 6 | 7 | class MiniCPM3ModelConfigBuilder(AutoModelConfigBuilder): 8 | 9 | @classmethod 10 | def condition(cls, hf_config): 11 | """config.""" 12 | return hf_config.architectures[0] in ['MiniCPM3ForCausalLM'] 13 | 14 | @classmethod 15 | def build(cls, hf_config, model_path: str = None, **kwargs): 16 | """build.""" 17 | head_dim = (hf_config.qk_nope_head_dim + hf_config.qk_rope_head_dim) 18 | 19 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs) 20 | cfg.head_dim = head_dim 21 | cfg.k_head_dim = head_dim 22 | cfg.v_head_dim = head_dim 23 | 24 | return cfg 25 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/mllama.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class MLlamaModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.architectures[0] == 'MllamaForConditionalGeneration' 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """Build llava hf.""" 16 | cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs) 17 | cfg.hf_config = hf_config 18 | return cfg 19 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import AutoModelConfigBuilder 3 | from .default import DefaultModelConfigBuilder 4 | 5 | 6 | class QwenModelConfigBuilder(AutoModelConfigBuilder): 7 | 8 | @classmethod 9 | def condition(cls, hf_config): 10 | """config.""" 11 | return hf_config.model_type == 'qwen' 12 | 13 | @classmethod 14 | def build(cls, hf_config, model_path: str = None, **kwargs): 15 | """build.""" 16 | from lmdeploy.utils import is_bf16_supported 17 | cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs) 18 | if cfg.bos_token_id is None: 19 | cfg.bos_token_id = 151644 20 | if cfg.eos_token_id is None: 21 | cfg.eos_token_id = 151645 22 | 23 | torch_dtype = 'bfloat16' if is_bf16_supported() else 'float16' 24 | if hf_config.bf16 and is_bf16_supported(): 25 | torch_dtype = 'bfloat16' 26 | elif hf_config.fp16: 27 | torch_dtype = 'float16' 28 | hf_config.torch_dtype = torch_dtype 29 | return cfg 30 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/configurations/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | 4 | from lmdeploy.utils import get_logger 5 | 6 | logger = get_logger('lmdeploy') 7 | 8 | 9 | def flash_mla_available(): 10 | """Check if flash mla is available.""" 11 | # use flash_mla by default if it is installed 12 | use_flash_mla = False 13 | try: 14 | # torch_npu device_properties doesn't have 'major' attribute 15 | device_properties = torch.cuda.get_device_properties(0) 16 | if hasattr(device_properties, 'major') and device_properties.major >= 9: 17 | import flash_mla_cuda # noqa 18 | use_flash_mla = True 19 | except ImportError: 20 | logger.warning('For higher performance, please install flash_mla https://github.com/deepseek-ai/FlashMLA') 21 | return use_flash_mla 22 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/devices/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .device_manager import DefaultContext, DeviceContext, get_device_manager 3 | 4 | __all__ = ['DeviceContext', 'DefaultContext', 'get_device_manager'] 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/disagg/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/disagg/backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from lmdeploy.logger import get_logger 3 | 4 | logger = get_logger('lmdeploy') 5 | 6 | try: 7 | logger.debug('Registering DLSlime Backend') 8 | from .dlslime import DLSlimeBackend 9 | except ImportError: 10 | logger.warning('Disable DLSlime Backend') 11 | 12 | try: 13 | logger.debug('Registering Mooncake Backend') 14 | from .mooncake import MooncakeBackend 15 | except ImportError: 16 | logger.warning('Disable Mooncake Backend') 17 | 18 | try: 19 | logger.debug('Registering InfiniStoreBackend Backend') 20 | from .infinistore import InfiniStoreBackend 21 | except ImportError: 22 | logger.warning('Disable InfiniStoreBackend Backend') 23 | 24 | __all__ = ['DLSlimeBackend', 'MooncakeBackend', 'InfiniStoreBackend'] 25 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/disagg/backend/backend.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from mmengine.registry import Registry 3 | 4 | MIGRATION_BACKENDS = Registry('migration_backend', locations=['lmdeploy.pytorch.disagg.backend.backend']) 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/disagg/request.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
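# NOTE: illustrative sketch only -- an assumption for documentation, not
# upstream lmdeploy code. `MIGRATION_BACKENDS` above is an mmengine Registry
# that the DLSlime/Mooncake/InfiniStore backends register themselves against;
# the toy backend below (name and class are hypothetical) shows the pattern.
def _migration_registry_demo():
    from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS

    @MIGRATION_BACKENDS.register_module(name='DummyBackend')
    class DummyBackend:
        """Placeholder backend used only by this sketch."""

    return MIGRATION_BACKENDS.get('DummyBackend') is DummyBackend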
2 | from typing import List, Optional 3 | 4 | from pydantic import BaseModel 5 | 6 | from lmdeploy.pytorch.disagg.config import (DistServeEngineConfig, DistServeNVLinkConfig, DistServeRDMAConfig, 7 | DistServeTCPConfig, MigrationProtocol) 8 | 9 | 10 | class DistServeConnectionRequest(BaseModel): 11 | protocol: MigrationProtocol 12 | remote_engine_id: str 13 | remote_endpoint_info: str 14 | 15 | 16 | class DistServeInitRequest(BaseModel): 17 | local_engine_id: str 18 | local_engine_config: DistServeEngineConfig 19 | 20 | remote_engine_id: str 21 | remote_engine_config: DistServeEngineConfig 22 | 23 | protocol: MigrationProtocol 24 | 25 | rank: Optional[int] = None 26 | 27 | tcp_config: Optional[DistServeTCPConfig] = None 28 | rdma_config: Optional[DistServeRDMAConfig] = None 29 | nvlink_config: Optional[DistServeNVLinkConfig] = None 30 | 31 | 32 | class MigrationRequest(BaseModel): 33 | protocol: MigrationProtocol 34 | 35 | remote_engine_id: str 36 | remote_session_id: int 37 | remote_token_id: int 38 | remote_block_ids: List[int] 39 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .engine import Engine 3 | from .engine_instance import EngineInstance 4 | 5 | __all__ = ['Engine', 'EngineInstance'] 6 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | from .alibi_pagedattention import alibi_paged_attention_fwd 4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb 5 | from .fill_kv_cache import fill_kv_cache 6 | from .fused_moe import fused_moe 7 | from .fused_rotary_emb import fused_rotary_emb 8 | from .multinomial_sampling import multinomial_sampling 9 | from .pagedattention import paged_attention_fwd 10 | from .rms_norm import rms_norm 11 | from .w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8, 12 | rms_norm_dynamic_quant) 13 | 14 | __all__ = [ 15 | 'apply_rotary_pos_emb', 16 | 'fused_moe', 17 | 'fused_rotary_emb', 18 | 'paged_attention_fwd', 19 | 'alibi_paged_attention_fwd', 20 | 'fill_kv_cache', 21 | 'multinomial_sampling', 22 | 'rms_norm', 23 | 'matmul_kernel_dynamic_quant', 24 | 'per_channel_quant', 25 | 'per_token_quant_int8', 26 | 'rms_norm_dynamic_quant', 27 | ] 28 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/alibi_pagedattention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | alibi_paged_attention_fwd = FunctionDispatcher('alibi_paged_attention_fwd').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/apply_rotary_pos_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .dispatcher import FunctionDispatcher 3 | 4 | apply_rotary_pos_emb = FunctionDispatcher('apply_rotary_pos_emb').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/cuda/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ..default.w8a8_kernels import per_channel_quant 3 | from .alibi_pagedattention import alibi_paged_attention_fwd 4 | from .apply_rotary_pos_emb import apply_rotary_pos_emb 5 | from .fill_kv_cache import fill_kv_cache 6 | from .flash_mla import flash_mla_fwd 7 | from .flashattention import flash_attention_fwd 8 | from .flatten_kv_cache import flatten_kv_cache 9 | from .fused_moe import fused_moe 10 | from .fused_rotary_emb import fused_rotary_emb 11 | from .multinomial_sampling import multinomial_sampling 12 | from .pagedattention import paged_attention_fwd 13 | from .rms_norm import rms_norm 14 | from .w8a8_fused_moe import fused_moe_w8a8 15 | from .w8a8_triton_kernels import matmul_kernel_dynamic_quant, per_token_quant_int8, rms_norm_dynamic_quant 16 | 17 | __all__ = [ 18 | 'apply_rotary_pos_emb', 19 | 'fused_moe', 20 | 'fused_rotary_emb', 21 | 'paged_attention_fwd', 22 | 'alibi_paged_attention_fwd', 23 | 'fill_kv_cache', 24 | 'multinomial_sampling', 25 | 'rms_norm', 26 | 'matmul_kernel_dynamic_quant', 27 | 'per_channel_quant', 28 | 'per_token_quant_int8', 29 | 'rms_norm_dynamic_quant', 30 | 'flash_attention_fwd', 31 | 'flatten_kv_cache', 32 | 'fused_moe_w8a8', 33 | 'flash_mla_fwd', 34 | ] 35 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/cuda/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import functools 3 | 4 | import torch 5 | 6 | WARPS_PER_SM = { 7 | (8, 0): 64, 8 | (8, 6): 48, 9 | (8, 7): 48, 10 | (8, 9): 48, 11 | (9, 0): 64, 12 | (10, 0): 64, 13 | (10, 1): 48, 14 | (12, 0): 48, 15 | } 16 | 17 | 18 | @functools.lru_cache 19 | def get_device_props(device=None): 20 | if device is None: 21 | device = torch.cuda.current_device() 22 | 23 | props = torch.cuda.get_device_properties(device) 24 | 25 | warps_per_sm = WARPS_PER_SM.get((props.major, props.minor), 32) 26 | out = dict( 27 | multi_processor_count=props.multi_processor_count, 28 | warps_per_sm=warps_per_sm, 29 | ) 30 | return out 31 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/default/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .multinomial_sampling import multinomial_sampling 3 | from .w8a8_kernels import per_channel_quant 4 | 5 | __all__ = [ 6 | 'multinomial_sampling', 7 | 'per_channel_quant', 8 | ] 9 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/default/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import torch 3 | from torch import LongTensor, Tensor 4 | 5 | 6 | def multinomial_sampling(scores: Tensor, seeds: LongTensor, offsets: LongTensor, indices: Tensor = None): 7 | sampled_index = torch.multinomial(scores, num_samples=1, replacement=True) 8 | outputs = torch.gather(indices, dim=1, index=sampled_index) 9 | return outputs.view(-1) 10 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/default/w8a8_kernels.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | 5 | def per_channel_quant(x: torch.Tensor, dtype: torch.dtype): 6 | """Quantize the input tensor 'x' channel-wise to the given data 7 | type. 8 | 9 | Args: 10 | x (torch.Tensor): The input tensor to be quantized. Must be a 11 | 2-dimensional tensor. 12 | dtype (torch.dtype): The data type to which the quantized tensor should 13 | be converted. 14 | 15 | Returns: 16 | tuple: A tuple containing two items -- the quantized tensor and 17 | the scale used for quantization. 18 | """ 19 | assert x.ndim == 2 20 | x = x.to(torch.float32) 21 | x_absmax = x.view(x.shape[0], -1).abs().max(dim=1, keepdim=True)[0] 22 | qtype_info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype) 23 | q_max = qtype_info.max 24 | q_min = qtype_info.min 25 | scale = x_absmax / q_max 26 | x_q = x / scale 27 | if not dtype.is_floating_point: 28 | x_q = torch.round(x_q) 29 | x_q = x_q.clamp(q_min, q_max).to(dtype) 30 | return x_q, scale 31 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ..default import multinomial_sampling, per_channel_quant 3 | from .apply_rotary_pos_emb import apply_rotary_pos_emb 4 | from .awq_kernels import awq_linear 5 | from .fill_kv_cache import fill_kv_cache 6 | from .flash_attention import flash_attention_fwd 7 | from .fused_moe import fused_moe 8 | from .linear import linear 9 | from .moe_gating_topk_softmax import moe_gating_topk_softmax 10 | from .pagedattention import paged_attention_fwd 11 | from .rms_norm import rms_norm 12 | 13 | __all__ = [ 14 | 'rms_norm', 15 | 'apply_rotary_pos_emb', 16 | 'awq_linear', 17 | 'fill_kv_cache', 18 | 'fused_moe', 19 | 'paged_attention_fwd', 20 | 'flash_attention_fwd', 21 | 'linear', 22 | 'moe_gating_topk_softmax', 23 | 'multinomial_sampling', 24 | 'per_channel_quant', 25 | ] 26 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import dlinfer.ops as ext_ops 3 | from torch import Tensor 4 | 5 | 6 | def silu_and_mul(input_tensor: Tensor, ) -> Tensor: 7 | return ext_ops.silu_and_mul(input_tensor) 8 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
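A quick usage sketch for the per_channel_quant reference kernel listed above in lmdeploy/pytorch/kernels/default/w8a8_kernels.py; the tensor shape and the int8 target dtype here are illustrative assumptions, not values taken from the repository.

import torch

from lmdeploy.pytorch.kernels.default.w8a8_kernels import per_channel_quant

w = torch.randn(4, 8)           # (out_channels, in_features), shape chosen for illustration
w_q, scale = per_channel_quant(w, torch.int8)
# w_q is an int8 tensor with the same shape as w; scale has shape (4, 1)
w_dq = w_q.float() * scale      # rough dequantization for a sanity check
assert w_dq.shape == w.shape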
2 | from typing import Optional, Tuple 3 | 4 | import dlinfer.ops as ext_ops 5 | from torch import Tensor 6 | 7 | 8 | def apply_rotary_pos_emb( 9 | query_states: Tensor, 10 | key_states: Tensor, 11 | cos: Tensor, 12 | sin: Tensor, 13 | q_embed: Optional[Tensor], 14 | k_embed: Optional[Tensor], 15 | ) -> Tuple[Tensor, Tensor]: 16 | query_states_embed, key_states_embed = \ 17 | ext_ops.apply_rotary_pos_emb(query_states, 18 | key_states, 19 | cos, sin) 20 | if q_embed is None: 21 | q_embed = query_states_embed.view(query_states.shape) 22 | elif q_embed is not query_states: 23 | q_embed.copy_(query_states_embed.view(query_states.shape)) 24 | 25 | if k_embed is None: 26 | k_embed = key_states_embed.view(key_states.shape) 27 | elif k_embed is not key_states: 28 | k_embed.copy_(key_states_embed.view(key_states.shape)) 29 | 30 | return q_embed, k_embed 31 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Optional 3 | 4 | import dlinfer.ops as ext_ops 5 | from torch import Tensor 6 | 7 | 8 | def awq_linear(x: Tensor, 9 | qweight: Tensor, 10 | scales: Tensor, 11 | qzeros: Tensor, 12 | bias: Optional[Tensor] = None, 13 | all_reduce: bool = False, 14 | group_size: int = 0): 15 | return ext_ops.weight_quant_matmul(x.squeeze(0), 16 | qweight, 17 | scales, 18 | offset=qzeros, 19 | bias=bias, 20 | all_reduce=all_reduce, 21 | group_size=group_size).unsqueeze(0) 22 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Optional, Sequence 3 | 4 | import dlinfer.ops as ext_ops 5 | from torch import Tensor 6 | 7 | 8 | def fill_kv_cache( 9 | key_states: Tensor, 10 | value_states: Tensor, 11 | key_caches: Tensor, 12 | value_caches: Tensor, 13 | kv_start_indices: Tensor, 14 | k_scales_zeros: Sequence[Optional[Tensor]], 15 | v_scales_zeros: Sequence[Optional[Tensor]], 16 | quant_bits: int = 0, 17 | ): 18 | """Fill key/value state to cache for paged attention.""" 19 | return ext_ops.fill_kv_cache(key_states, 20 | value_states, 21 | key_caches, 22 | value_caches, 23 | kv_start_indices, 24 | k_scales_zeros=k_scales_zeros, 25 | v_scales_zeros=v_scales_zeros, 26 | quant_bits=quant_bits) 27 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/flash_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import dlinfer.ops as ext_ops 3 | from dlinfer.utils.type_annotation import Tensor 4 | 5 | 6 | def flash_attention_fwd( 7 | query_states: Tensor, 8 | key_states: Tensor, 9 | value_states: Tensor, 10 | attn_output: Tensor, 11 | q_start_loc: Tensor, 12 | q_seqlens: Tensor, 13 | kv_start_loc: Tensor, 14 | kv_seqlens: Tensor, 15 | num_heads: int, 16 | num_kv_heads: int, 17 | max_q_seqlen: int = None, 18 | window_size: int = None, 19 | sm_scale: float = None, 20 | logit_softcapping: float = None, 21 | causal: bool = True, 22 | ): 23 | return ext_ops.prefill_attention( 24 | query_states, 25 | key_states, 26 | value_states, 27 | None, 28 | None, 29 | q_start_loc, 30 | q_seqlens, 31 | kv_seqlens, 32 | max_q_seqlen, 33 | num_heads, 34 | num_kv_heads, 35 | attn_mask=[], 36 | softmax_scale=sm_scale, 37 | attn_output=attn_output, 38 | ) 39 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/fused_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import dlinfer.ops as ext_ops 3 | from torch import Tensor 4 | 5 | 6 | def fused_moe( 7 | hidden_states: Tensor, 8 | gate_up_weights: Tensor, 9 | down_weights: Tensor, 10 | topk_weights: Tensor, 11 | topk_ids: Tensor, 12 | topk: int, 13 | renormalize: bool, 14 | ): 15 | """Dlinfer fused moe.""" 16 | return ext_ops.fused_moe(hidden_states, gate_up_weights, down_weights, topk_weights, topk_ids, topk, renormalize) 17 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/linear.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Optional 3 | 4 | import dlinfer.ops as ext_ops 5 | from torch import Tensor 6 | 7 | 8 | def linear(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, all_reduce: bool = False, group: str = ''): 9 | return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce, group=group) 10 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import dlinfer.ops as ext_ops 3 | from torch import Tensor 4 | 5 | 6 | def moe_gating_topk_softmax(router_logits: Tensor, topk: int): 7 | routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk) 8 | return routing_weights, selected_experts 9 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/dlinfer/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import dlinfer.ops as ext_ops 3 | from torch import Tensor 4 | 5 | 6 | def rms_norm(hidden_states: Tensor, weight: Tensor, epsilon: float = 1e-6, residual: Tensor = None, out: Tensor = None): 7 | if residual is None: 8 | rms_norm_out = ext_ops.rms_norm(hidden_states, weight, epsilon) 9 | if out is None: 10 | out = rms_norm_out 11 | else: 12 | out.copy_(rms_norm_out) 13 | return out 14 | else: 15 | return ext_ops.add_rms_norm(hidden_states, residual, weight, epsilon) 16 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/fill_kv_cache.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | fill_kv_cache = FunctionDispatcher('fill_kv_cache').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/flash_mla.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | flash_mla_fwd = FunctionDispatcher('flash_mla_fwd').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/fused_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | fused_moe = FunctionDispatcher('fused_moe').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/fused_rotary_emb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | fused_rotary_emb = FunctionDispatcher('fused_rotary_emb').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/moe_gating_topk_softmax.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | moe_gating_topk_softmax = FunctionDispatcher('moe_gating_topk_softmax').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | multinomial_sampling = FunctionDispatcher('multinomial_sampling').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/pagedattention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | paged_attention_fwd = FunctionDispatcher('paged_attention_fwd').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
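For orientation, the rms_norm kernels in this package (the dlinfer implementation above and the dispatcher in this file) implement the standard RMSNorm computation. The following plain-PyTorch function is a reference sketch of that math only; epsilon, dtypes, and shapes are illustrative, and this is not the dispatched kernel itself.

import torch

def rms_norm_reference(hidden_states: torch.Tensor, weight: torch.Tensor, epsilon: float = 1e-6) -> torch.Tensor:
    # normalize by the root mean square over the last dimension, then apply the learned scale
    variance = hidden_states.float().pow(2).mean(dim=-1, keepdim=True)
    normed = hidden_states.float() * torch.rsqrt(variance + epsilon)
    return (normed * weight.float()).to(hidden_states.dtype)

x = torch.randn(2, 16, dtype=torch.float16)
w = torch.ones(16, dtype=torch.float16)
y = rms_norm_reference(x, w)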
2 | from .dispatcher import FunctionDispatcher 3 | 4 | rms_norm = FunctionDispatcher('rms_norm').make_caller() 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/kernels/w8a8_triton_kernels.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .dispatcher import FunctionDispatcher 3 | 4 | per_channel_quant = FunctionDispatcher('per_channel_quant').make_caller() 5 | 6 | matmul_kernel_dynamic_quant = FunctionDispatcher('matmul_kernel_dynamic_quant').make_caller() 7 | 8 | per_token_quant_int8 = FunctionDispatcher('per_token_quant_int8').make_caller() 9 | 10 | rms_norm_dynamic_quant = FunctionDispatcher('rms_norm_dynamic_quant').make_caller() 11 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .q_modules import QLinear, QRMSNorm 3 | 4 | __all__ = ['QLinear', 'QRMSNorm'] 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/models/utils/multimodal.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import List, Tuple 3 | 4 | from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs 5 | 6 | PreparedInputs = Tuple[List[int], MultiModalInputs] 7 | 8 | 9 | class MultiModalMixin: 10 | 11 | def prepare_multimodal_input(self, input_ids, input_multimodals, **kwargs) -> PreparedInputs: 12 | """Prepare multimodals inputs.""" 13 | raise NotImplementedError('prepare input not implemented.') 14 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .data_type import MultiModalData, MultiModalTensor 3 | 4 | __all__ = ['MultiModalData', 'MultiModalTensor'] 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/multimodal/image_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from dataclasses import dataclass 3 | from typing import Any, ClassVar, Dict 4 | 5 | from PIL import Image 6 | 7 | from .data_type import MultiModalData 8 | 9 | 10 | @dataclass 11 | class ImageData(MultiModalData): 12 | data: Image 13 | loc: int 14 | meta: Dict[str, Any] = None 15 | type: ClassVar[str] = 'image' 16 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | # attention module is modified from: 3 | # https://github.com/vllm-project/vllm/blob/main/vllm/attention/ 4 | from .activation import GeluAndMul, SiluAndMul  # noqa: F401 5 | from .attention import Attention, FlashAttention  # noqa: F401 6 | from .norm import LayerNorm, RMSNorm  # noqa: F401 7 | from .rotary_embedding import ApplyRotaryEmb  # noqa: F401 8 | from .rotary_embedding import RopeType  # noqa: F401 9 | from .rotary_embedding import YarnParameters  # noqa: F401 10 | from .rotary_embedding import build_rotary_embedding  # noqa: F401 11 | from .rotary_embedding import build_rotary_params  # noqa: F401 12 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/nn/activation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from torch import Tensor, nn 3 | 4 | from ..backends import OpType, get_backend 5 | 6 | 7 | class SiluAndMul(nn.Module): 8 | """SiLU activation followed by elementwise multiplication.""" 9 | 10 | def __init__(self, inplace: bool = True): 11 | super().__init__() 12 | backend = get_backend() 13 | builder = backend.get_layer_impl_builder(OpType.SiluAndMul) 14 | self.impl = builder.build(inplace) 15 | 16 | def forward(self, x: Tensor): 17 | """forward.""" 18 | return self.impl.forward(x) 19 | 20 | 21 | class GeluAndMul(nn.Module): 22 | """GELU activation followed by elementwise multiplication.""" 23 | 24 | def __init__(self, approximate: str = 'none'): 25 | super().__init__() 26 | backend = get_backend() 27 | builder = backend.get_layer_impl_builder(OpType.GeluAndMul) 28 | self.impl = builder.build(approximate) 29 | 30 | def forward(self, x: Tensor): 31 | """forward.""" 32 | return self.impl.forward(x) 33 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/nn/multinomial_sampling.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from ..backends import OpType, get_backend 5 | 6 | 7 | def multinomial_sampling(scores: torch.Tensor, 8 | seeds: torch.LongTensor, 9 | offsets: torch.LongTensor, 10 | indices: torch.Tensor = None): 11 | """Multinomial sampling op.""" 12 | impl_builder = get_backend().get_layer_impl_builder(OpType.MultinomialSampling) 13 | return impl_builder.build().forward(scores, seeds, offsets, indices) 14 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/nn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
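As a reference for what the SiluAndMul and GeluAndMul modules above compute, here is a plain-PyTorch sketch. It assumes the common convention that the input packs the gate and up projections along the last dimension; that convention is an assumption here, not something stated in the file itself.

import torch
import torch.nn.functional as F

def silu_and_mul_reference(gate_up: torch.Tensor) -> torch.Tensor:
    # split the packed [gate, up] tensor and combine: silu(gate) * up
    gate, up = gate_up.chunk(2, dim=-1)
    return F.silu(gate) * up

x = torch.randn(2, 8)                  # last dim is 2 * intermediate size
out = silu_and_mul_reference(x)        # shape (2, 4)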
2 | import torch 3 | 4 | 5 | def div_up(a: int, b: int): 6 | """Div up.""" 7 | return (a + b - 1) // b 8 | 9 | 10 | def get_distribute_size(feature_size: int, world_size: int, rank: int, align: int = 1): 11 | """Get the feature size assigned to the given rank, respecting alignment.""" 12 | assert feature_size % align == 0 13 | aligned_size = feature_size // align 14 | # try to give every rank the same number of aligned groups 15 | updated_aligned_size = aligned_size // world_size 16 | # if there is a remainder, give one extra group 17 | # to each of the leading ranks 18 | if rank < aligned_size % world_size: 19 | updated_aligned_size += 1 20 | return updated_aligned_size * align 21 | 22 | 23 | def chunk_aligned(weight: torch.Tensor, chunks: int, dim: int, align: int): 24 | """Chunk aligned.""" 25 | if align == 1: 26 | return weight.chunk(chunks, dim=dim) 27 | size = weight.size(dim) 28 | assert size % align == 0 29 | aligned_size = size // align 30 | 31 | # try best to evenly split chunks 32 | align_per_chunk = aligned_size // chunks 33 | remain = aligned_size % chunks 34 | sections = [align_per_chunk + int(c < remain) for c in range(chunks)] 35 | sections = [sec * align for sec in sections] 36 | return weight.split(sections, dim=dim) 37 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/paging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .scheduler import Scheduler 3 | 4 | __all__ = ['Scheduler'] 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/paging/block_manager/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ...config import CacheConfig 3 | from .base_block_manager import BaseBlockManager 4 | from .default_block_manager import DefaultBlockManager 5 | from .window_block_manager import WindowBlockManager 6 | 7 | 8 | def build_block_manager(cache_config: CacheConfig) -> BaseBlockManager: 9 | """Build block manager. 10 | 11 | Args: 12 | cache_config (CacheConfig): The cache config. 13 | """ 14 | 15 | num_cpu_blocks = cache_config.num_cpu_blocks 16 | num_gpu_blocks = cache_config.num_gpu_blocks 17 | window_size = cache_config.window_size 18 | 19 | if window_size < 0: 20 | return DefaultBlockManager(num_gpu_blocks, num_cpu_blocks) 21 | else: 22 | return WindowBlockManager(num_gpu_blocks, num_cpu_blocks, window_size=window_size) 23 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/paging/eviction_helper/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .recompute_eviction_helper import RecomputeEvictionHelper 3 | 4 | __all__ = ['RecomputeEvictionHelper'] 5 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
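A small usage sketch for get_distribute_size and chunk_aligned from lmdeploy/pytorch/nn/utils.py above; the feature size, world size, and alignment are illustrative assumptions.

import torch

from lmdeploy.pytorch.nn.utils import chunk_aligned, get_distribute_size

feature_size, world_size, align = 96, 4, 32
sizes = [get_distribute_size(feature_size, world_size, rank, align) for rank in range(world_size)]
# 96 features form three 32-wide aligned groups shared by four ranks -> [32, 32, 32, 0]
weight = torch.randn(10, feature_size)
shards = chunk_aligned(weight, world_size, dim=1, align=align)
assert [s.shape[1] for s in shards] == sizes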
2 | from typing import List 3 | 4 | from ...messages import SchedulerSequence 5 | from ..scheduler import Scheduler 6 | 7 | SeqList = List[SchedulerSequence] 8 | 9 | 10 | class BaseEvictionHelper: 11 | """Base eviction helper.""" 12 | 13 | def __init__(self, scheduler: Scheduler): 14 | self.scheduler = scheduler 15 | self.block_manager = scheduler.block_manager 16 | self.block_trie = scheduler.block_trie 17 | 18 | def need_swap_in(self, seq: SchedulerSequence): 19 | """Sequence need swap in.""" 20 | raise NotImplementedError('Not implemented.') 21 | 22 | def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int): 23 | """Evict seqs.""" 24 | raise NotImplementedError('Not implemented.') 25 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .utils import Timer # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # modify from: https://github.com/vllm-project/vllm 3 | import inspect 4 | from inspect import Parameter, Signature 5 | from typing import Dict, Sequence 6 | 7 | import psutil 8 | 9 | 10 | def get_gpu_memory(device_id: int = None) -> int: 11 | """Returns the free and total physical memory of the GPU in bytes.""" 12 | import torch 13 | if device_id is None: 14 | device_id = torch.cuda.current_device() 15 | return torch.cuda.mem_get_info(device_id) 16 | 17 | 18 | def get_cpu_memory() -> int: 19 | """Returns the total CPU memory of the node in bytes.""" 20 | return psutil.virtual_memory().total 21 | 22 | 23 | def bind_sigature(input_names: str, args: Sequence, kwargs: Dict): 24 | """Bind args and kwargs to given input names.""" 25 | kind = inspect._ParameterKind.POSITIONAL_OR_KEYWORD 26 | 27 | sig = Signature([Parameter(name, kind) for name in input_names]) 28 | bind = sig.bind(*args, **kwargs) 29 | return bind.arguments 30 | -------------------------------------------------------------------------------- /lmdeploy/pytorch/weight_loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/serve/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/serve/gradio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/serve/gradio/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | import gradio as gr 4 | 5 | CSS = """ 6 | #container { 7 | width: 95%; 8 | margin-left: auto; 9 | margin-right: auto; 10 | } 11 | 12 | #chatbot { 13 | height: 500px; 14 | overflow: auto; 15 | } 16 | 17 | .chat_wrap_space { 18 | margin-left: 0.5em 19 | } 20 | """ 21 | 22 | THEME = gr.themes.Soft(primary_hue=gr.themes.colors.blue, 23 | secondary_hue=gr.themes.colors.sky, 24 | font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif']) 25 | 26 | enable_btn = gr.update(interactive=True) 27 | disable_btn = gr.update(interactive=False) 28 | -------------------------------------------------------------------------------- /lmdeploy/serve/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/serve/openai/reasoning_parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser 3 | from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser 4 | from .reasoning_parser import ReasoningParser, ReasoningParserManager 5 | 6 | __all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser'] 7 | -------------------------------------------------------------------------------- /lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser 3 | from .reasoning_parser import ReasoningParserManager 4 | 5 | 6 | @ReasoningParserManager.register_module(name='qwen-qwq') 7 | class QwenQwQReasoningParser(DeepSeekR1ReasoningParser): 8 | """Reasoning parser for Qwen QwQ model. 9 | 10 | The Qwen QwQ model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning 11 | content from the model output. 12 | """ 13 | -------------------------------------------------------------------------------- /lmdeploy/serve/openai/tool_parser/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .internlm2_parser import Internlm2ToolParser 3 | from .llama3_parser import Llama3JsonToolParser 4 | from .qwen2d5_parser import Qwen2d5ToolParser 5 | from .tool_parser import ToolParser, ToolParserManager 6 | 7 | __all__ = ['Internlm2ToolParser', 'Qwen2d5ToolParser', 'ToolParser', 'ToolParserManager', 'Llama3JsonToolParser'] 8 | -------------------------------------------------------------------------------- /lmdeploy/serve/proxy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/serve/turbomind/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | 3 | 4 | def bootstrap(): 5 | import os 6 | import sys 7 | 8 | has_turbomind = False 9 | pwd = os.path.dirname(__file__) 10 | if os.path.exists(os.path.join(pwd, '..', 'lib')): 11 | has_turbomind = True 12 | if os.name == 'nt' and has_turbomind: 13 | if sys.version_info[:2] >= (3, 8): 14 | CUDA_PATH = os.getenv('CUDA_PATH') 15 | assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH' 16 | dll_path = os.path.join(CUDA_PATH, 'bin') 17 | print(f'Add dll path {dll_path}, please note cuda version ' 18 | 'should >= 11.3 when compiled with cuda 11') 19 | os.add_dll_directory(dll_path) 20 | 21 | 22 | bootstrap() 23 | 24 | from .turbomind import TurboMind, update_parallel_config # noqa: E402 25 | 26 | __all__ = ['TurboMind', 'update_parallel_config'] 27 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/source_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 3 | from .deepseek2 import DeepSeek2Model # noqa: F401 4 | from .deepseek_vl import DeepSeekVLModel # noqa: F401 5 | from .glm4 import Glm4Model # noqa: F401 6 | from .internlm2 import InternLM2Model # noqa: F401 7 | from .internvl import InternVLModel # noqa: F401 8 | from .llama import LlamaModel # noqa: F401 9 | from .llava import LlavaModel # noqa: F401 10 | from .minicpmv import MiniCPMVModel # noqa: F401 11 | from .mixtral import MixtralModel # noqa: F401 12 | from .molmo import MolmoModel # noqa: F401 13 | from .qwen import QwenModel # noqa: F401 14 | from .xcomposer2 import Xcomposer2Model # noqa: F401 15 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/source_model/minicpmv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | import json 4 | import os.path as osp 5 | 6 | from .base import INPUT_MODELS 7 | from .llama import LlamaModel, LlamaReader 8 | 9 | 10 | class MiniCPMVReader(LlamaReader): 11 | """MiniCPMVReader for llama model.""" 12 | 13 | attn_layer_prefix = 'llm.model.layers' 14 | attn_layer_patten = r'llm.model.layers.([0-9]+).' 15 | tok_embeddings_key = 'llm.model.embed_tokens.weight' 16 | norm_weight_key = 'llm.model.norm.weight' 17 | output_weight_key = 'llm.lm_head.weight' 18 | 19 | 20 | @INPUT_MODELS.register_module(name='minicpmv') 21 | class MiniCPMVModel(LlamaModel): 22 | """MiniCPMV model in hf format.""" 23 | Reader = MiniCPMVReader 24 | 25 | def model_info(self): 26 | info = super().model_info() 27 | with open(osp.join(self.model_path, 'config.json')) as f: 28 | config = json.load(f) 29 | if str(config.get('version')) == '2.6': 30 | info['attn_bias'] = True 31 | return info 32 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/source_model/mixtral.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | from .base import INPUT_MODELS 4 | from .llama import LlamaModel, LlamaReader 5 | 6 | 7 | class MixtralReader(LlamaReader): 8 | 9 | def moe_ffn_expert(self, e=None, i=None, kind=None): 10 | if not kind: 11 | return self.filter(r'experts') 12 | result = [] 13 | for x in ['w1', 'w2', 'w3']: 14 | name = f'model.layers.{i}.block_sparse_moe.experts.{e}.{x}.{kind}' 15 | tensor = self.params.get(name) 16 | tensor = self.transform(tensor, kind) 17 | result.append(tensor) 18 | return (*result, ) 19 | 20 | def moe_ffn_gate(self, i): 21 | return self.params.get(f'model.layers.{i}.block_sparse_moe.gate.weight') 22 | 23 | 24 | @INPUT_MODELS.register_module(name='mixtral') 25 | class MixtralModel(LlamaModel): 26 | 27 | Reader = MixtralReader 28 | 29 | def model_info(self): 30 | cfg = self.model_config 31 | info = super().model_info() 32 | info['expert_num'] = cfg['num_local_experts'] 33 | info['expert_inter_size'] = cfg['intermediate_size'] 34 | info['experts_per_token'] = cfg['num_experts_per_tok'] 35 | info['norm_topk_prob'] = True 36 | info['inter_size'] = 0 37 | return info 38 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/target_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .fp import TurbomindModel # noqa: F401 3 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/deploy/target_model/fp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | from .base import OUTPUT_MODELS, BaseOutputModel 4 | 5 | 6 | @OUTPUT_MODELS.register_module(name='tm') 7 | class TurbomindModel(BaseOutputModel): 8 | """Export to turbomind fp16 format.""" 9 | pass 10 | -------------------------------------------------------------------------------- /lmdeploy/turbomind/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from transformers.utils import ExplicitEnum 5 | 6 | from lmdeploy.utils import get_logger 7 | 8 | logger = get_logger('lmdeploy') 9 | 10 | 11 | class ModelSource(ExplicitEnum): 12 | """Turbomind model source.""" 13 | WORKSPACE = 'workspace' 14 | HF_MODEL = 'hf_model' 15 | 16 | 17 | def get_model_source(pretrained_model_name_or_path: str, **kwargs) -> ModelSource: 18 | """Get model source.""" 19 | triton_model_path = os.path.join(pretrained_model_name_or_path, 'triton_models') 20 | if os.path.exists(triton_model_path): 21 | return ModelSource.WORKSPACE 22 | return ModelSource.HF_MODEL 23 | -------------------------------------------------------------------------------- /lmdeploy/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from typing import Tuple 3 | 4 | __version__ = '0.8.0' 5 | short_version = __version__ 6 | 7 | 8 | def parse_version_info(version_str: str) -> Tuple: 9 | """Parse version from a string. 10 | 11 | Args: 12 | version_str (str): A string represents a version info. 13 | 14 | Returns: 15 | tuple: A sequence of integer and string represents version. 
16 | """ 17 | _version_info = [] 18 | for x in version_str.split('.'): 19 | if x.isdigit(): 20 | _version_info.append(int(x)) 21 | elif x.find('rc') != -1: 22 | patch_version = x.split('rc') 23 | _version_info.append(int(patch_version[0])) 24 | _version_info.append(f'rc{patch_version[1]}') 25 | return tuple(_version_info) 26 | 27 | 28 | version_info = parse_version_info(__version__) 29 | -------------------------------------------------------------------------------- /lmdeploy/vl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .utils import load_image 3 | 4 | __all__ = ['load_image'] 5 | -------------------------------------------------------------------------------- /lmdeploy/vl/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | IMAGE_DUMMY_TOKEN_INDEX = 0 3 | IMAGE_TOKEN = '<IMAGE_TOKEN>' 4 | -------------------------------------------------------------------------------- /lmdeploy/vl/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /lmdeploy/vl/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | pybind11<=2.13.1 2 | setuptools 3 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | markdown>=3.4.0 2 | myst-parser 3 | sphinx==8.0.2 4 | sphinx-book-theme 5 | sphinx-copybutton 6 | sphinx-tabs 7 | sphinxcontrib-mermaid 8 | -------------------------------------------------------------------------------- /requirements/lite.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | transformers_stream_generator 4 | -------------------------------------------------------------------------------- /requirements/readthedocs.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | mmengine-lite 3 | pillow 4 | pydantic 5 | torch 6 | transformers 7 | urllib3<2.0.0 8 | -------------------------------------------------------------------------------- /requirements/runtime_ascend.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.29.3 2 | dlinfer-ascend>=0.1.3 3 | einops 4 | fastapi 5 | fire 6 | mmengine-lite 7 | numpy<2.0.0 8 | openai 9 | outlines<0.1.0 10 | partial_json_parser 11 | peft<=0.11.1 12 | pillow 13 | protobuf 14 | pydantic>2.0.0 15 | safetensors 16 | sentencepiece 17 | shortuuid 18 | tiktoken 19 | torch<=2.4.0,>=2.3.1 20 | torch-npu==2.3.1 21 | torchvision<=0.19.0,>=0.18.1 22 | transformers 23 | uvicorn 24 | -------------------------------------------------------------------------------- /requirements/runtime_camb.txt: -------------------------------------------------------------------------------- 1 | accelerate==1.2.0 2 | einops 3 | fastapi 4 | fire 5 | mmengine-lite 6 | numpy<2.0.0 7 | openai 8 | outlines<0.1.0 9 |
partial_json_parser 10 | peft<=0.11.1 11 | pillow 12 | protobuf 13 | pydantic>2.0.0 14 | safetensors 15 | sentencepiece 16 | shortuuid 17 | tiktoken 18 | torch==2.4.0 19 | torchvision<=0.19.0,>=0.15.0 20 | transformers 21 | uvicorn 22 | -------------------------------------------------------------------------------- /requirements/runtime_cuda.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.29.3 2 | einops 3 | fastapi 4 | fire 5 | mmengine-lite 6 | numpy<2.0.0 7 | openai 8 | outlines 9 | partial_json_parser 10 | peft<=0.14.0 11 | pillow 12 | protobuf 13 | pydantic>2.0.0 14 | pynvml 15 | ray 16 | safetensors 17 | sentencepiece 18 | shortuuid 19 | tiktoken 20 | torch<=2.6.0,>=2.0.0 21 | torchvision<=0.21.0,>=0.15.0 22 | transformers 23 | triton<=3.2.0,>=3.0.0; sys_platform == "linux" 24 | uvicorn 25 | -------------------------------------------------------------------------------- /requirements/runtime_maca.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.32.1 2 | einops 3 | fastapi 4 | fire 5 | mmengine-lite 6 | numpy<2.0.0 7 | openai 8 | outlines<0.1.0 9 | partial_json_parser 10 | peft<=0.11.1 11 | pillow 12 | protobuf 13 | pydantic>2.0.0 14 | safetensors 15 | sentencepiece 16 | shortuuid 17 | tiktoken 18 | torch<=2.4.0,>=2.0.0 19 | torchvision<=0.19.0,>=0.15.0 20 | transformers 21 | triton>=2.1.0; sys_platform == "linux" 22 | uvicorn 23 | -------------------------------------------------------------------------------- /requirements/serve.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | protobuf 3 | tritonclient[grpc] 4 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | allure-pytest 2 | coverage 3 | nvidia-ml-py 4 | pytest 5 | pytest-assume 6 | pytest-cov 7 | pytest-order 8 | pytest-rerunfailures 9 | pytest-sugar 10 | pytest-xdist 11 | pyyaml 12 | -------------------------------------------------------------------------------- /requirements_ascend.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/runtime_ascend.txt 3 | -r requirements/lite.txt 4 | -r requirements/serve.txt 5 | -------------------------------------------------------------------------------- /requirements_camb.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/runtime_camb.txt 3 | -r requirements/lite.txt 4 | -r requirements/serve.txt 5 | -------------------------------------------------------------------------------- /requirements_cuda.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/runtime_cuda.txt 3 | -r requirements/lite.txt 4 | -r requirements/serve.txt 5 | -------------------------------------------------------------------------------- /requirements_maca.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/runtime_maca.txt 3 | -r requirements/lite.txt 4 | -r requirements/serve.txt 5 | -------------------------------------------------------------------------------- /resources/batch_memory.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/InternLM/lmdeploy/c63db2b8a0b57ef732fc5ed1e7c2e0eefdfb76de/resources/batch_memory.png -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(turbomind) 16 | -------------------------------------------------------------------------------- /src/turbomind/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | add_subdirectory(utils) 16 | add_subdirectory(core) 17 | add_subdirectory(kernels) 18 | add_subdirectory(layers) 19 | add_subdirectory(comm) 20 | add_subdirectory(models) 21 | add_subdirectory(engine) 22 | if(BUILD_PYT) 23 | add_subdirectory(th_op) 24 | endif() 25 | if(BUILD_PY_FFI) 26 | add_subdirectory(python) 27 | endif() 28 | add_subdirectory(triton_backend) 29 | -------------------------------------------------------------------------------- /src/turbomind/comm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | add_library(host_comm STATIC host_comm.cc thread_comm.cc) 6 | target_link_libraries(host_comm PRIVATE core logger) 7 | set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON) 8 | 9 | add_library(device_comm STATIC device_comm.cc) 10 | target_link_libraries(device_comm PRIVATE core logger) 11 | set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 13 | 14 | if (BUILD_MULTI_GPU) 15 | add_subdirectory(cuda_ipc) 16 | target_link_libraries(device_comm INTERFACE cuda_ipc_comm) 17 | 18 | if (USE_NCCL) 19 | add_subdirectory(nccl) 20 | target_link_libraries(device_comm INTERFACE nccl_comm) 21 | endif () 22 | 23 | if (BUILD_TEST) 24 | add_executable(test_comm test_comm.cu) 25 | target_link_libraries(test_comm PRIVATE device_comm host_comm core pthread nvtx_utils) 26 | target_compile_options(test_comm PRIVATE -O3 -march=native -mtune=native) 27 | endif () 28 | endif () 29 | -------------------------------------------------------------------------------- /src/turbomind/comm/barrier.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::comm { 10 | 11 | class Barrier { 12 | public: 13 | explicit Barrier(int count): threshold_{count}, count_{count} {} 14 | 15 | void arrive_and_wait() 16 | { 17 | std::unique_lock lock{mutex_}; 18 | auto phase = phase_; 19 | if (--count_ == 0) { 20 | ++phase_; 21 | count_ = threshold_; 22 | cv_.notify_all(); 23 | } 24 | else { 25 | cv_.wait(lock, [this, phase] { return phase_ != phase; }); 26 | } 27 | } 28 | 29 | private: 30 | std::mutex mutex_; 31 | std::condition_variable cv_; 32 | 33 | int threshold_; 34 | int count_; 35 | 36 | uint32_t phase_{}; 37 | }; 38 | 39 | } // namespace turbomind::comm 40 | -------------------------------------------------------------------------------- /src/turbomind/comm/cuda_ipc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | add_library(cuda_ipc_comm STATIC 6 | cuda_ipc_comm.cu 7 | allreduce.cu 8 | allgather.cu 9 | fused_allreduce.cu 10 | fused_allreduce_ex.cu) 11 | 12 | target_link_libraries(cuda_ipc_comm PRIVATE 13 | rms_norm 14 | host_comm 15 | core 16 | cuda_utils 17 | CUDA::cuda_driver 18 | logger) 19 | 20 | set_property(TARGET cuda_ipc_comm PROPERTY POSITION_INDEPENDENT_CODE ON) 21 | set_property(TARGET cuda_ipc_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 22 | -------------------------------------------------------------------------------- /src/turbomind/comm/cuda_ipc/group_sum.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/core/common.h" 6 | 7 | namespace turbomind::comm { 8 | 9 | namespace detail { 10 | 11 | template 12 | __device__ float GroupSum(const float val, int warps, Syncgroup syncgroup) 13 | { 14 | const int warp_id = threadIdx.x / WARP_SIZE; 15 | const int lane_id = threadIdx.x % WARP_SIZE; 16 | float sum = val; 17 | PRAGMA_UNROLL 18 | for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { 19 | sum += __shfl_xor_sync((uint32_t)-1, sum, mask); 20 | } 21 | __shared__ float smem[32]; 22 | // syncgroup(); 23 | if (lane_id == 0) { 24 | smem[warp_id] = sum; 25 | } 26 | syncgroup(); 27 | for (int i = 1; i < warps; ++i) { 28 | sum += smem[warp_id / warps * warps + i]; 29 | } 30 | // sum = {}; 31 | // for (int i = 0; i < warps; ++i) { 32 | // sum += smem[warp_id / warps * warps + i]; 33 | // } 34 | return sum; 35 | } 36 | 37 | } // namespace detail 38 | 39 | } // namespace turbomind::comm 40 | -------------------------------------------------------------------------------- /src/turbomind/comm/device_comm.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/comm/device_comm.h" 4 | #include "src/turbomind/utils/cuda_utils.h" 5 | 6 | namespace turbomind::comm { 7 | 8 | DeviceCommImpl::~DeviceCommImpl() = default; 9 | 10 | DeviceComm CreateNcclCommunicator(int n_ranks, int rank, HostComm h_comm); 11 | 12 | DeviceComm CreateCudaIpcCommunicator(int n_ranks, int rank, HostComm h_comm); 13 | 14 | DeviceComm CreateDeviceCommunicator(const std::string& backend, int n_ranks, int rank, HostComm h_comm) 15 | { 16 | #if BUILD_MULTI_GPU && USE_NCCL 17 | if (backend == "nccl") { 18 | return CreateNcclCommunicator(n_ranks, rank, h_comm); 19 | } 20 | #endif 21 | 22 | #if BUILD_MULTI_GPU 23 | if (backend == "native" || backend == "cuda-ipc") { 24 | return CreateCudaIpcCommunicator(n_ranks, rank, h_comm); 25 | } 26 | #endif 27 | 28 | TM_CHECK(0) << "Unknown communication backend: " << backend; 29 | return {}; 30 | } 31 | 32 | } // namespace turbomind::comm 33 | -------------------------------------------------------------------------------- /src/turbomind/comm/host_comm.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/comm/host_comm.h" 4 | 5 | namespace turbomind::comm { 6 | 7 | HostCommImpl::~HostCommImpl() = default; 8 | 9 | std::unique_ptr CreateThreadGroupId(); 10 | 11 | std::unique_ptr CreateHostGroupId(const std::string& backend) 12 | { 13 | return CreateThreadGroupId(); 14 | } 15 | 16 | } // namespace turbomind::comm 17 | -------------------------------------------------------------------------------- /src/turbomind/comm/nccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | add_library(nccl_comm STATIC nccl.cu) 6 | target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger) 7 | target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS}) 8 | 9 | set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON) 10 | set_property(TARGET nccl_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 11 | -------------------------------------------------------------------------------- /src/turbomind/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | add_library(core STATIC 6 | check.cc 7 | allocator.cc 8 | stream.cc 9 | context.cc 10 | buffer.cc 11 | layout.cc 12 | tensor.cc 13 | tensor.cu 14 | module.cc) 15 | 16 | target_link_libraries(core PUBLIC cuda_utils CUDA::cudart CUDA::cuda_driver) 17 | 18 | set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON) 19 | set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 20 | 21 | target_compile_options(core PRIVATE $<$:-Xptxas=-v>) 22 | 23 | if (BUILD_TEST) 24 | add_executable(test_core test_core.cc) 25 | target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain) 26 | endif () 27 | -------------------------------------------------------------------------------- /src/turbomind/core/common.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | /// TODO: remove this dependency 9 | #include "src/turbomind/utils/cuda_utils.h" 10 | 11 | namespace turbomind::core { 12 | 13 | class Allocator; 14 | class Buffer; 15 | class Stream; 16 | class Event; 17 | class Context; 18 | 19 | using std::shared_ptr; 20 | using std::vector; 21 | 22 | using ssize_t = std::ptrdiff_t; 23 | 24 | } // namespace turbomind::core 25 | -------------------------------------------------------------------------------- /src/turbomind/core/context.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "src/turbomind/core/allocator.h" 4 | #include "src/turbomind/core/common.h" 5 | #include "src/turbomind/core/stream.h" 6 | 7 | namespace turbomind::core { 8 | 9 | class Context { 10 | public: 11 | static Stream& stream(); 12 | static Allocator& host_alloc(); 13 | static Allocator& device_alloc(); 14 | static Allocator& pinned_alloc(); 15 | static Allocator& alloc(Device device); 16 | 17 | private: 18 | friend class ContextGuard; 19 | static void push(const Stream& stream); 20 | static void push(const Allocator& alloc); 21 | static void pop(); 22 | }; 23 | 24 | class ContextGuard { 25 | public: 26 | template 27 | explicit ContextGuard(Args&&... 
args): n_{} 28 | { 29 | (Context::push((Args &&) args), ...); 30 | n_ = sizeof...(Args); 31 | } 32 | ~ContextGuard() 33 | { 34 | for (int i = 0; i < n_; ++i) { 35 | Context::pop(); 36 | } 37 | } 38 | 39 | private: 40 | int n_; 41 | }; 42 | 43 | } // namespace turbomind::core 44 | -------------------------------------------------------------------------------- /src/turbomind/core/core.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "src/turbomind/core/allocator.h" 4 | #include "src/turbomind/core/buffer.h" 5 | #include "src/turbomind/core/check.h" 6 | #include "src/turbomind/core/context.h" 7 | #include "src/turbomind/core/data_type.h" 8 | #include "src/turbomind/core/layout.h" 9 | #include "src/turbomind/core/stream.h" 10 | #include "src/turbomind/core/tensor.h" 11 | 12 | namespace turbomind { 13 | 14 | using core::ssize_t; 15 | using core::Buffer; 16 | using core::Buffer_; 17 | using core::Tensor; 18 | using core::Tensor_; 19 | using core::TensorMap; 20 | using core::Ref; 21 | using core::Layout; 22 | using core::Allocator; 23 | using core::Stream; 24 | using core::Event; 25 | 26 | } // namespace turbomind 27 | -------------------------------------------------------------------------------- /src/turbomind/core/module.h: -------------------------------------------------------------------------------- 1 | 2 | #include "src/turbomind/core/tensor.h" 3 | 4 | namespace turbomind::core { 5 | 6 | class Module { 7 | public: 8 | virtual ~Module(); 9 | 10 | Module(); 11 | 12 | Module(const Module&) = delete; 13 | Module& operator=(const Module&) = delete; 14 | 15 | Module(Module&&) noexcept = delete; 16 | Module& operator=(Module&&) noexcept = delete; 17 | 18 | void register_module(std::string name, Module& module, std::optional index = {}); 19 | void register_parameter(std::string name, Tensor& param); 20 | 21 | void remove_module(Module& module); 22 | void remove_parameter(Tensor& param); 23 | 24 | TensorMap get_parameters() const; 25 | 26 | private: 27 | void get_parameters_impl(std::string prefix, TensorMap& m) const; 28 | 29 | protected: 30 | Module* parent_; 31 | 32 | std::vector> modules_; 33 | std::vector> params_; 34 | }; 35 | 36 | } // namespace turbomind::core 37 | -------------------------------------------------------------------------------- /src/turbomind/core/stream.cc: -------------------------------------------------------------------------------- 1 | 2 | #include "src/turbomind/core/stream.h" 3 | #include 4 | 5 | namespace turbomind::core { 6 | 7 | Stream Stream::create(int priority) 8 | { 9 | Stream stream; 10 | stream.impl_ = std::make_shared(priority); 11 | return stream; 12 | } 13 | 14 | void StreamImpl::Wait(const Event& event) 15 | { 16 | check_cuda_error(cudaStreamWaitEvent(stream_, event)); 17 | } 18 | 19 | } // namespace turbomind::core 20 | -------------------------------------------------------------------------------- /src/turbomind/engine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) 6 | target_link_libraries(engine PRIVATE core) 7 | set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) 8 | set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 9 | -------------------------------------------------------------------------------- /src/turbomind/engine/request_queue.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/engine/request_queue.h" 4 | #include "src/turbomind/engine/gateway.h" 5 | 6 | #include "src/turbomind/engine/request.h" 7 | 8 | namespace turbomind { 9 | 10 | } // namespace turbomind 11 | -------------------------------------------------------------------------------- /src/turbomind/kernels/activation_kernels.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | #include "src/turbomind/core/core.h" 22 | 23 | namespace turbomind { 24 | 25 | // clang-format off 26 | template struct GeluActivation; 27 | template struct ReluActivation; 28 | template struct SiluActivation; 29 | template struct IdentityActivation; 30 | // clang-format on 31 | 32 | template class Activation> 33 | void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream); 34 | 35 | } // namespace turbomind 36 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/arch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::arch { 6 | 7 | // tags for dispatching & conditional codegen 8 | 9 | template 10 | struct Arch { 11 | static constexpr bool is_compatible(int arch) 12 | { 13 | return Begin <= arch && (End == -1 || arch < End); 14 | } 15 | }; 16 | 17 | struct Sm70: Arch<700, 750> { 18 | }; 19 | 20 | struct Sm75: Arch<750, 800> { 21 | }; 22 | 23 | struct Sm80: Arch<800> { 24 | }; 25 | 26 | } // namespace turbomind::arch 27 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/attention.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #pragma once 4 | 5 | #include "attention_params.h" 6 | 7 | namespace turbomind { 8 | 9 | constexpr int MAX_CTA_S = 64; 10 | 11 | template 12 | void dispatchAttention(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm70_128_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm70_64_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm75_128_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | // ! register spill 14 | // template void invokeAttention::Kernel>( 15 | // const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm75_64_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | // ! register spill 14 | // template void invokeAttention::Kernel>( 15 | // const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm80_128_bf16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
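// Each codegen/attention_*.cu file above holds the explicit template instantiations of
// invokeAttention for one (arch, head-dim, dtype) combination, so the heavy kernels
// compile in separate translation units (the "! register spill" comments mark
// combinations deliberately left out). A standalone sketch of the pattern with a
// hypothetical invoke_sketch() template; the names are illustrative only:

#include <cstdio>

// Primary template: in the real layout this definition lives in a *_template.h that
// only the per-combination codegen files include.
template<int HeadDim, class T>
void invoke_sketch(const T& x)
{
    std::printf("head_dim=%d value=%f\n", HeadDim, static_cast<double>(x));
}

// Explicit instantiations: one (head-dim, dtype) combination per codegen file.
template void invoke_sketch<128, float>(const float&);
template void invoke_sketch<64, float>(const float&);

int main()
{
    invoke_sketch<128, float>(1.0f);
    invoke_sketch<64, float>(0.5f);
    return 0;
}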
2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm80_128_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm80_192.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm80_64_bf16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/attention_sm80_64_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_config.h" 4 | #include "../attention_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template void invokeAttention::Kernel>( 11 | const AttentionParams& params); 12 | 13 | template void invokeAttention::Kernel>( 14 | const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | template bool invokeDecoding>(const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_params.h" 4 | #include "../decoding_config.h" 5 | #include "../decoding_template.h" 6 | 7 | namespace turbomind { 8 | 9 | using namespace attention; 10 | 11 | template bool invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool invokeDecoding>(const AttentionParams& params); 14 | 15 | template bool invokeDecoding>(const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_128_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_params.h" 4 | #include "../decoding_config.h" 5 | #include "../decoding_template.h" 6 | 7 | namespace turbomind { 8 | 9 | using namespace attention; 10 | 11 | template bool invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool invokeDecoding>(const AttentionParams& params); 14 | 15 | template bool invokeDecoding>(const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | template bool invokeDecoding>(const AttentionParams& params); 15 | 16 | } // namespace turbomind 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../attention_params.h" 4 | #include "../decoding_config.h" 5 | #include "../decoding_template.h" 6 | 7 | namespace turbomind { 8 | 9 | using namespace attention; 10 | 11 | template bool invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool invokeDecoding>(const AttentionParams& params); 14 | 15 | template bool invokeDecoding>(const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm70_64_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #include "../attention_params.h" 4 | #include "../decoding_config.h" 5 | #include "../decoding_template.h" 6 | 7 | namespace turbomind { 8 | 9 | using namespace attention; 10 | 11 | template bool invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool invokeDecoding>(const AttentionParams& params); 14 | 15 | template bool invokeDecoding>(const AttentionParams& params); 16 | 17 | } // namespace turbomind 18 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_128_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm75_64_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool 11 | invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool 14 | invokeDecoding>(const AttentionParams& params); 15 | 16 | template bool 17 | invokeDecoding>(const AttentionParams& params); 18 | 19 | template bool 20 | invokeDecoding>(const AttentionParams& params); 21 | 22 | } // namespace turbomind 23 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_bf16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | template bool invokeDecoding>(const AttentionParams& params); 15 | 16 | template bool invokeDecoding>(const AttentionParams& params); 17 | 18 | } // namespace turbomind 19 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_128_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_192.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool 11 | invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool invokeDecoding>(const AttentionParams& params); 14 | 15 | template bool 16 | invokeDecoding>(const AttentionParams& params); 17 | 18 | template bool invokeDecoding>(const AttentionParams& params); 19 | 20 | } // namespace turbomind 21 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_bf16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool 11 | invokeDecoding>(const AttentionParams& params); 12 | 13 | template bool 14 | invokeDecoding>(const AttentionParams& params); 15 | 16 | template bool 17 | invokeDecoding>(const AttentionParams& params); 18 | 19 | template bool 20 | invokeDecoding>(const AttentionParams& params); 21 | 22 | } // namespace turbomind 23 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_bf16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_f16.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams& params); 11 | 12 | template bool invokeDecoding>(const AttentionParams& params); 13 | 14 | template bool invokeDecoding>(const AttentionParams& params); 15 | 16 | template bool invokeDecoding>(const AttentionParams& params); 17 | 18 | } // namespace turbomind 19 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u4.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/codegen/decoding_sm80_64_f16_u8.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "../decoding_config.h" 4 | #include "../decoding_template.h" 5 | 6 | namespace turbomind { 7 | 8 | using namespace attention; 9 | 10 | template bool invokeDecoding>(const AttentionParams&); 11 | 12 | template bool invokeDecoding>(const AttentionParams&); 13 | 14 | } // namespace turbomind 15 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/decoding.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "attention_params.h" 6 | 7 | namespace turbomind { 8 | 9 | template 10 | void dispatchDecoding(const AttentionParams& params); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/impl.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
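// The decoding codegen files above instantiate invokeDecoding for fixed (arch,
// head-dim, KV dtype) combinations, and each instantiation returns bool. One plausible
// reading -- an assumption, not the library's documented behaviour -- is that the
// dispatchDecoding front end tries candidates until one accepts the runtime
// configuration. A standalone sketch of that fall-through style:

#include <cstdio>

// Hypothetical per-configuration entry point; returns true iff it handled the call.
template<int HeadDim>
bool invoke_decoding_sketch(int head_dim)
{
    if (head_dim != HeadDim) {
        return false;  // not this configuration
    }
    std::printf("decoding with head_dim=%d\n", HeadDim);
    return true;
}

// dispatchDecoding-style front end: try the known head dims in order.
bool dispatch_decoding_sketch(int head_dim)
{
    return invoke_decoding_sketch<64>(head_dim)
           || invoke_decoding_sketch<128>(head_dim)
           || invoke_decoding_sketch<192>(head_dim);
}

int main()
{
    dispatch_decoding_sketch(128);  // handled by the 128 instantiation
    if (!dispatch_decoding_sketch(96)) {
        std::printf("no kernel for head_dim=96\n");
    }
    return 0;
}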
2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | namespace attention { 8 | 9 | struct MMA_16816 { 10 | }; 11 | 12 | struct MMA_81616 { 13 | }; // MMA_16816 transposed 14 | 15 | struct MMA_1688 { 16 | }; 17 | 18 | struct MMA_884 { 19 | }; 20 | 21 | struct MMA_SIMT { 22 | }; 23 | 24 | template 35 | struct Impl { 36 | }; 37 | 38 | } // namespace attention 39 | 40 | } // namespace turbomind 41 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/mainloop.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::attention { 6 | 7 | template 8 | struct Mainloop { 9 | }; 10 | 11 | } // namespace turbomind::attention 12 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/reduce.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "cta_map.h" 6 | #include "src/turbomind/kernels/core/array_ops.h" 7 | #include "src/turbomind/kernels/core/thread_map.h" 8 | #include 9 | #include 10 | #include 11 | 12 | namespace turbomind::attention { 13 | 14 | template 15 | void invokeReduce(T* out, 16 | float* partial_M, 17 | float* partial_L, 18 | float* partial_O, 19 | const int* split_cnt, 20 | int partial_len, 21 | int max_split_cnt, 22 | int query_num, 23 | int head_num, 24 | float exp_scale, 25 | cudaStream_t stream); 26 | 27 | } // namespace turbomind::attention 28 | -------------------------------------------------------------------------------- /src/turbomind/kernels/attention/utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | int GetSplitCount(int max_split_cnt, 8 | int grid_size, 9 | int max_active_ctas, 10 | int sm_count, 11 | int max_wave_cnt, 12 | float alpha = 1, 13 | float beta = 1e-3); 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/data_type.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #if ENABLE_BF16 7 | #include 8 | #endif 9 | 10 | #include 11 | 12 | #include "src/turbomind/core/data_type.h" 13 | 14 | namespace turbomind { 15 | 16 | namespace detail { 17 | 18 | struct __uint4_t { 19 | uint32_t x; 20 | }; 21 | 22 | } // namespace detail 23 | 24 | template 25 | struct get_pointer_type_t { 26 | using type = T*; 27 | }; 28 | 29 | template 30 | using get_pointer_type = typename get_pointer_type_t::type; 31 | 32 | } // namespace turbomind 33 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/meta.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
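// core/data_type.h above declares a get_pointer_type trait (default: T -> T*) next to a
// detail::__uint4_t word wrapper, which suggests sub-byte element types map to a packed
// word pointer instead. That specialization is an assumption; the sketch below only
// illustrates how such a trait is customized, using hypothetical names.

#include <cstdint>
#include <type_traits>

namespace ptr_sketch {

struct uint4_t {};  // hypothetical 4-bit "element" tag with no storage of its own

template<class T>
struct get_pointer_type_t {
    using type = T*;
};

template<>
struct get_pointer_type_t<uint4_t> {
    using type = std::uint32_t*;  // eight 4-bit values packed into each 32-bit word
};

template<class T>
using get_pointer_type = typename get_pointer_type_t<T>::type;

static_assert(std::is_same_v<get_pointer_type<float>, float*>);
static_assert(std::is_same_v<get_pointer_type<uint4_t>, std::uint32_t*>);

}  // namespace ptr_sketch

int main()
{
    return 0;
}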
2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | template 8 | struct basic_type { 9 | using type = T; 10 | }; 11 | 12 | template 13 | constexpr basic_type type_c{}; 14 | 15 | template 16 | struct constant { 17 | using type = constant; 18 | using value_type = decltype(v); 19 | 20 | static constexpr value_type value = v; 21 | 22 | constexpr value_type operator()() const noexcept 23 | { 24 | return v; 25 | } 26 | constexpr operator value_type() const noexcept 27 | { 28 | return v; 29 | } 30 | }; 31 | 32 | template 33 | struct pair { 34 | }; 35 | 36 | template 37 | constexpr auto first(pair) 38 | { 39 | return u; 40 | } 41 | 42 | template 43 | constexpr auto second(pair) 44 | { 45 | return v; 46 | } 47 | 48 | template 49 | struct triplet { 50 | }; 51 | 52 | } // namespace turbomind 53 | -------------------------------------------------------------------------------- /src/turbomind/kernels/core/pipe_iter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | template 8 | struct PipeIter { 9 | static constexpr int kMaxStep = Stages * Step; 10 | 11 | int r = 0; 12 | int w = kMaxStep - Step; 13 | 14 | __inline__ __device__ PipeIter& operator++() 15 | { 16 | w = r; 17 | r += Step; 18 | if (r == kMaxStep) { 19 | r -= kMaxStep; 20 | } 21 | return *this; 22 | } 23 | }; 24 | 25 | } // namespace turbomind 26 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | add_subdirectory(fused_multi_head_attention) 4 | 5 | add_library(flash_attention STATIC flash_attention.cu) 6 | set_property(TARGET flash_attention PROPERTY POSITION_INDEPENDENT_CODE ON) 7 | set_property(TARGET flash_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 8 | target_link_libraries(flash_attention PRIVATE llama_fmha) 9 | 10 | if (NOT MSVC) 11 | add_subdirectory(flash_attention2) 12 | target_link_libraries(flash_attention PRIVATE flash_attention2) 13 | endif() 14 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.8) 3 | project(flash_attention2) 4 | 5 | add_library(${PROJECT_NAME} STATIC 6 | flash_api.cpp 7 | # flash_fwd_hdim32_fp16_sm80.cu 8 | # flash_fwd_hdim64_fp16_sm80.cu 9 | flash_fwd_hdim128_fp16_sm80.cu 10 | flash_fwd_hdim128_bf16_sm80.cu 11 | flash_fwd_hdim256_bf16_sm80.cu 12 | flash_fwd_hdim256_fp16_sm80.cu 13 | ) 14 | target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR} / include) 15 | target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass) 16 | 17 | set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) 18 | set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 19 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/README.md: -------------------------------------------------------------------------------- 1 | #Flash Attention 2 2 | 3 | This is flash attention2 implementation modified from https://github.com/Dao-AILab/flash-attention 4 | 5 | - remove dropout 6 | - remove backward 7 | - 
cutlass 3.1.0 8 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_bf16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | #ifdef ENABLE_BF16 8 | template<> 9 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 10 | { 11 | run_mha_fwd_hdim128(params, stream); 12 | } 13 | #endif 14 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 9 | { 10 | run_mha_fwd_hdim128(params, stream); 11 | } 12 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_bf16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | #ifdef ENABLE_BF16 8 | template<> 9 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 10 | { 11 | run_mha_fwd_hdim256(params, stream); 12 | } 13 | #endif 14 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 9 | { 10 | run_mha_fwd_hdim256(params, stream); 11 | } 12 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_bf16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | #ifdef ENABLE_BF16 8 | template<> 9 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 10 | { 11 | run_mha_fwd_hdim32(params, stream); 12 | } 13 | #endif 14 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 
4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 9 | { 10 | run_mha_fwd_hdim32(params, stream); 11 | } 12 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_bf16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | #ifdef ENABLE_BF16 8 | template<> 9 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 10 | { 11 | run_mha_fwd_hdim64(params, stream); 12 | } 13 | #endif 14 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/flash_attention2/flash_fwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | 3 | // Splitting the different head dimensions to different files to speed up compilation. 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream) 9 | { 10 | run_mha_fwd_hdim64(params, stream); 11 | } 12 | -------------------------------------------------------------------------------- /src/turbomind/kernels/flash_attention/fused_multi_head_attention/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.8) 3 | 4 | add_library(llama_fmha STATIC llama_flash_attention_kernel.cu) 5 | target_include_directories(llama_fmha PRIVATE ${CUTLASS_DIR}/examples) 6 | target_link_libraries(llama_fmha PRIVATE nvidia::cutlass::cutlass) 7 | set_property(TARGET llama_fmha PROPERTY POSITION_INDEPENDENT_CODE ON) 8 | set_property(TARGET llama_fmha PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 9 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/arch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::gemm { 6 | 7 | // tags for dispatching & conditional codegen 8 | 9 | template 10 | struct Arch { 11 | static constexpr bool is_compatible(int arch) 12 | { 13 | return Begin <= arch && (End == -1 || arch < End); 14 | } 15 | }; 16 | 17 | struct Sm70: Arch<700, 750> { 18 | static constexpr int value = 700; 19 | }; 20 | 21 | struct Sm75: Arch<750, 800> { 22 | static constexpr int value = 750; 23 | }; 24 | 25 | struct Sm80: Arch<800, 900> { 26 | static constexpr int value = 800; 27 | }; 28 | 29 | struct Sm90: Arch<900> { 30 | static constexpr int value = 900; 31 | }; 32 | 33 | inline bool is_arch_compatible(int karch, int darch) 34 | { 35 | switch (karch) { 36 | case 700: 37 | return Sm70::is_compatible(darch); 38 | case 750: 39 | return Sm75::is_compatible(darch); 40 | case 800: 41 | return Sm80::is_compatible(darch); 42 | case 900: 43 | return Sm90::is_compatible(darch); 44 | default: 45 | return false; 46 | } 47 | } 48 | 49 | } // namespace turbomind::gemm 50 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/dispatch_cache.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. 
All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/desc.h" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | class DispatchCache { 12 | public: 13 | DispatchCache(std::vector kernels); 14 | 15 | ~DispatchCache(); 16 | 17 | std::optional LowerBound(const GemmDesc& desc) const; 18 | 19 | std::optional Find(const GemmDesc& desc) const; 20 | 21 | bool Insert(const GemmDesc& desc, const LaunchSpec& spec); 22 | 23 | int Export(std::ostream& os) const; 24 | 25 | int Import(std::istream& is); 26 | 27 | private: 28 | struct Impl; 29 | std::unique_ptr impl_; 30 | }; 31 | 32 | } // namespace turbomind::gemm 33 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/gpu_metric.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | 7 | namespace turbomind::gemm { 8 | 9 | // bytes / second 10 | float MeasureL2CacheThroughput(); 11 | 12 | // fused multiply-add / second 13 | float MeasureMmaThroughput(int proble_size = 16384); 14 | 15 | } // namespace turbomind::gemm 16 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/predicate.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | 8 | namespace turbomind::gemm { 9 | 10 | template 11 | struct Predicate { 12 | 13 | static constexpr int kSizeC = AlignedC ? 1 : C; 14 | 15 | static_assert(S * kSizeC <= 32); 16 | 17 | static constexpr bool is_active = true; 18 | 19 | uint32_t pred_{}; 20 | 21 | __device__ int operator()(int s, int c) const 22 | { 23 | return (pred_ & (1 << (s * kSizeC + c))) != 0; 24 | } 25 | 26 | __device__ void set(int s, int c) 27 | { 28 | pred_ |= (1 << (s * kSizeC + c)); 29 | } 30 | 31 | __device__ void clear() 32 | { 33 | pred_ = 0; 34 | } 35 | }; 36 | 37 | template 38 | struct Predicate { 39 | 40 | static constexpr bool is_active = false; 41 | 42 | __device__ constexpr std::integral_constant operator()(int, int) const 43 | { 44 | return {}; 45 | } 46 | 47 | __device__ void set(int, int) {} 48 | 49 | __device__ void clear() 50 | { 51 | // pred_ = 0; 52 | } 53 | }; 54 | 55 | } // namespace turbomind::gemm 56 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/simt.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind::gemm::simt { 6 | 7 | // constexpr int OP_M = 2; 8 | // constexpr int OP_N = 16; 9 | // constexpr int OP_K = 4; 10 | 11 | // constexpr int OP_M = 4; 12 | // constexpr int OP_N = 8; 13 | // constexpr int OP_K = 8; 14 | 15 | constexpr int OP_M = 1; 16 | constexpr int OP_N = 32; 17 | constexpr int OP_K = 8; 18 | 19 | } // namespace turbomind::gemm::simt 20 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/quantization.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
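// The Predicate template above packs an S x C grid of guard bits into a single 32-bit
// mask, with bit (s, c) at position s * C + c (kSizeC collapses to 1 when the C axis is
// known to be aligned). Its full template parameter list is not visible here, so this
// host-side sketch keeps only <int S, int C>:

#include <cassert>
#include <cstdint>

template<int S, int C>
struct PredicateSketch {
    static_assert(S * C <= 32, "mask must fit in one 32-bit word");

    std::uint32_t pred_{};

    int  operator()(int s, int c) const { return (pred_ >> (s * C + c)) & 1; }
    void set(int s, int c) { pred_ |= 1u << (s * C + c); }
    void clear() { pred_ = 0; }
};

int main()
{
    PredicateSketch<4, 8> p;  // 32 guard bits
    p.set(1, 3);
    p.set(3, 7);
    assert(p(1, 3) && p(3, 7));
    assert(!p(0, 0) && !p(2, 5));
    p.clear();
    assert(!p(1, 3));
    return 0;
}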
2 | 3 | #include "src/turbomind/kernels/gemm/types.h" 4 | #include 5 | #include 6 | 7 | #pragma once 8 | 9 | namespace turbomind::gemm { 10 | 11 | template 12 | void Quantize(const thrust::universal_vector& x, 13 | int m, 14 | int k, 15 | Order order, 16 | int group_size, 17 | thrust::universal_vector& x_p, // pseudo-quantized 18 | thrust::universal_vector& x_q, // quantized ushort 19 | thrust::universal_vector& x_u, // scales & zeros (always m-major) 20 | cudaStream_t stream); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/test/reference.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/types.h" 6 | 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | class Reference { 12 | public: 13 | Reference(); 14 | ~Reference(); 15 | 16 | void set_stream(cudaStream_t stream); 17 | 18 | void gemm(const void* A, MatrixLayout Adesc, const void* B, MatrixLayout Bdesc, void* C, MatrixLayout Cdesc); 19 | 20 | private: 21 | cublasHandle_t handle_; 22 | }; 23 | 24 | } // namespace turbomind::gemm 25 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/cache_utils.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/cache_utils.h" 4 | 5 | namespace turbomind::gemm { 6 | 7 | CacheFlushing::CacheFlushing() 8 | { 9 | cudaDeviceProp props{}; 10 | cudaGetDeviceProperties(&props, 0); 11 | 12 | size_ = props.l2CacheSize; 13 | 14 | cudaMalloc(&buffer_, size_); 15 | } 16 | 17 | void CacheFlushing::flush(cudaStream_t stream) 18 | { 19 | thread_local CacheFlushing inst{}; 20 | inst(stream); 21 | } 22 | 23 | void CacheFlushing::operator()(cudaStream_t stream) const 24 | { 25 | cudaMemsetAsync(buffer_, 0, size_, stream); 26 | } 27 | 28 | } // namespace turbomind::gemm 29 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/cache_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | 7 | namespace turbomind::gemm { 8 | 9 | class CacheFlushing { 10 | public: 11 | static void flush(cudaStream_t stream = {}); 12 | 13 | private: 14 | CacheFlushing(); 15 | void operator()(cudaStream_t stream) const; 16 | 17 | uint32_t* buffer_; 18 | size_t size_; 19 | }; 20 | 21 | } // namespace turbomind::gemm 22 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/measurer.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
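// CacheFlushing above clears an L2-sized buffer so every timed sample starts from a
// cold cache, and Measurer (declared below) brackets each launch with a cudaEvent pair.
// A standalone sketch of how the two fit together; 'launch_candidate' stands in for the
// Launcher callback and its signature here is an assumption:

#include <cuda_runtime.h>

#include <cstdio>

int main()
{
    cudaDeviceProp props{};
    cudaGetDeviceProperties(&props, 0);

    void* flush_buf{};
    cudaMalloc(&flush_buf, props.l2CacheSize);

    cudaEvent_t beg, end;
    cudaEventCreate(&beg);
    cudaEventCreate(&end);

    auto launch_candidate = [](cudaStream_t /*stream*/) { /* kernel launch goes here */ };

    float     total_ms = 0.f;
    const int iters    = 10;
    for (int i = 0; i < iters; ++i) {
        cudaMemsetAsync(flush_buf, 0, props.l2CacheSize, nullptr);  // flush L2
        cudaEventRecord(beg);
        launch_candidate(nullptr);
        cudaEventRecord(end);
        cudaEventSynchronize(end);
        float ms = 0.f;
        cudaEventElapsedTime(&ms, beg, end);
        total_ms += ms;
    }
    std::printf("mean time per sample: %f ms\n", total_ms / iters);

    cudaEventDestroy(beg);
    cudaEventDestroy(end);
    cudaFree(flush_buf);
    return 0;
}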
2 | 3 | #include "src/turbomind/kernels/gemm/desc.h" 4 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace turbomind::gemm { 11 | 12 | struct Measurement { 13 | cudaError_t status; 14 | int sample_count; 15 | float mean; 16 | float variance; 17 | }; 18 | 19 | using Launcher = std::function; 20 | 21 | class Measurer { 22 | public: 23 | Measurer(std::unique_ptr stop_criterion); 24 | 25 | ~Measurer(); 26 | 27 | std::vector 28 | Measure(const std::vector& specs, const Launcher& launcher, cudaStream_t stream); 29 | 30 | private: 31 | Measurement MeasureOne(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); 32 | 33 | std::pair ColdRun(LaunchSpec spec, const Launcher& launcher, cudaStream_t stream); 34 | 35 | private: 36 | cudaEvent_t ev_beg_; 37 | cudaEvent_t ev_end_; 38 | std::unique_ptr stop_criterion_; 39 | }; 40 | 41 | } // namespace turbomind::gemm 42 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/params.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace turbomind::gemm { 10 | 11 | struct TuningParams { 12 | // Split-k params 13 | int max_splits = 8; 14 | int max_waves = 10; 15 | 16 | // Swizzling params 17 | std::vector swizzle{3}; 18 | 19 | // Sampling params 20 | float top_k = 0; 21 | int clusters = 5; 22 | int min_iter = 1; 23 | int max_iter = 10; 24 | float max_time = 1.f; 25 | 26 | std::vector seq; 27 | }; 28 | 29 | // example 30 | // max_splits=8,top_splits=5,max_waves=16,top_k=10,swizzle=[2,3,4],clusters=5,max_iter=10,min_iter=1,max_time=10.0 31 | void ParseTuningParams(TuningParams& params, const std::string& str); 32 | 33 | // example 34 | // 16-16-128,256-128-1024,8192 35 | std::vector ParseTuningSequence(const std::string& str); 36 | 37 | std::vector GenerateTuningSequence(const std::vector>& generators); 38 | 39 | std::vector> GetDefaultTuningGenerators(); 40 | 41 | } // namespace turbomind::gemm 42 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/sampler.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/kernels/gemm/desc.h" 6 | #include "src/turbomind/kernels/gemm/tuner/measurer.h" 7 | 8 | #include 9 | 10 | namespace turbomind::gemm { 11 | 12 | class Sampler { 13 | public: 14 | explicit Sampler(Measurer& measurer, int k_clusters): measurer_{measurer}, k_clusters_{k_clusters} {} 15 | 16 | std::vector Run(std::vector specs, const Launcher& launcher, cudaStream_t stream); 17 | 18 | private: 19 | Measurer& measurer_; 20 | int k_clusters_; 21 | }; 22 | 23 | } // namespace turbomind::gemm 24 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stats.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
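// Usage sketch for the Stats accumulator defined just below: it maintains a running
// mean and population variance with Welford's online update, so timing samples can be
// folded in one at a time without being stored. Assumes the repository root is on the
// include path, as in the other headers here.

#include "src/turbomind/kernels/gemm/tuner/stats.h"

#include <cassert>
#include <cmath>

int main()
{
    turbomind::gemm::Stats s;
    const float xs[] = {2.f, 4.f, 4.f, 4.f, 5.f, 5.f, 7.f, 9.f};
    for (float x : xs) {
        s.add_sample(x);
    }
    assert(s.count() == 8);
    assert(std::fabs(s.mean() - 5.f) < 1e-4f);          // sample mean is 5
    assert(std::fabs(s.get_variance() - 4.f) < 1e-4f);  // population variance is 4
    return 0;
}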
2 | 3 | #include 4 | 5 | namespace turbomind::gemm { 6 | 7 | class Stats { 8 | public: 9 | Stats(): count_{}, mean_{}, m2_{} {} 10 | 11 | float mean() const noexcept 12 | { 13 | return mean_; 14 | } 15 | 16 | float sum() const noexcept 17 | { 18 | return mean_ * count_; 19 | } 20 | 21 | int count() const noexcept 22 | { 23 | return count_; 24 | } 25 | 26 | float get_variance() const noexcept 27 | { 28 | return count_ < 2 ? std::numeric_limits::quiet_NaN() : m2_ / count_; 29 | } 30 | 31 | void add_sample(float x) noexcept 32 | { 33 | ++count_; 34 | float delta = x - mean_; 35 | mean_ += delta / count_; 36 | float delta2 = x - mean_; 37 | m2_ += delta * delta2; 38 | } 39 | 40 | private: 41 | int count_; 42 | float mean_; 43 | float m2_; 44 | }; 45 | 46 | } // namespace turbomind::gemm 47 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stopping_criterion.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/stopping_criterion.h" 4 | #include 5 | 6 | namespace turbomind::gemm { 7 | 8 | namespace stopping_criterions { 9 | 10 | class Optimistic: public StoppingCriterion { 11 | public: 12 | Optimistic(int min_iter, int max_iter, float max_ms) 13 | { 14 | min_iter_ = std::max(min_iter, 1); 15 | max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits::max(); 16 | max_ms_ = max_ms > 0 ? max_ms : std::numeric_limits::infinity(); 17 | } 18 | bool should_stop(const Stats& stats) override 19 | { 20 | return stats.count() >= min_iter_ && (stats.count() >= max_iter_ || stats.sum() >= max_ms_); 21 | } 22 | 23 | private: 24 | int min_iter_; 25 | int max_iter_; 26 | float max_ms_; 27 | }; 28 | 29 | } // namespace stopping_criterions 30 | 31 | std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms) 32 | { 33 | return std::make_unique(min_iter, max_iter, max_ms); 34 | } 35 | 36 | } // namespace turbomind::gemm 37 | -------------------------------------------------------------------------------- /src/turbomind/kernels/gemm/tuner/stopping_criterion.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #include "src/turbomind/kernels/gemm/tuner/stats.h" 4 | #include 5 | 6 | namespace turbomind::gemm { 7 | 8 | class StoppingCriterion { 9 | public: 10 | virtual ~StoppingCriterion() = default; 11 | virtual bool should_stop(const Stats& stats) = 0; 12 | }; 13 | 14 | std::unique_ptr CreateStoppingCriterion(int min_iter, int max_iter, float max_ms); 15 | 16 | } // namespace turbomind::gemm 17 | -------------------------------------------------------------------------------- /src/turbomind/kernels/norm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | add_library(rms_norm rms_norm.cu) 4 | set_property(TARGET rms_norm PROPERTY POSITION_INDEPENDENT_CODE ON) 5 | set_property(TARGET rms_norm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 6 | -------------------------------------------------------------------------------- /src/turbomind/kernels/norm/rms_norm.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
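// The kernels declared below normalize with RMSNorm. As a plain CPU reference for the
// math they presumably implement (the standard formula, not the kernel's code): for a
// row x of length d, y[i] = x[i] / sqrt(mean(x^2) + eps) * w[i].

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

void rms_norm_ref(const std::vector<float>& x, const std::vector<float>& w, float eps,
                  std::vector<float>& y)
{
    const std::size_t d = x.size();
    float sum_sq = 0.f;
    for (float v : x) {
        sum_sq += v * v;
    }
    const float inv_rms = 1.f / std::sqrt(sum_sq / d + eps);
    y.resize(d);
    for (std::size_t i = 0; i < d; ++i) {
        y[i] = x[i] * inv_rms * w[i];  // scale by 1/rms, then by the learned weight
    }
}

int main()
{
    std::vector<float> x{1.f, 2.f, 3.f, 4.f}, w{1.f, 1.f, 1.f, 1.f}, y;
    rms_norm_ref(x, w, 1e-6f, y);
    for (float v : y) {
        std::printf("%f ", v);  // x scaled by 1/sqrt(30/4 + eps)
    }
    std::printf("\n");
    return 0;
}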
2 | 3 | #include 4 | 5 | #include "src/turbomind/core/core.h" 6 | 7 | namespace turbomind { 8 | 9 | void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st); 10 | 11 | void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st); 12 | 13 | template 14 | void invokeBiasResidualRMSNorm( 15 | T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st); 16 | 17 | void invokeResidualBiasRMSNorm(void* hidden_states, 18 | void* residual, 19 | const void* weights, 20 | const void* bias, 21 | DataType dtype, 22 | int dims, 23 | int num, 24 | float eps, 25 | cudaStream_t st); 26 | 27 | } // namespace turbomind 28 | -------------------------------------------------------------------------------- /src/turbomind/layers/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_minimum_required(VERSION 3.8) 16 | 17 | add_subdirectory(sampling_layers) 18 | 19 | find_package(CUDAToolkit REQUIRED) 20 | add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc) 21 | set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) 22 | set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 23 | target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart 24 | LogitsProcessorLayer SamplingLayer StopCriteriaLayer 25 | gpt_kernels nvtx_utils) 26 | -------------------------------------------------------------------------------- /src/turbomind/macro.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__) 4 | 5 | #define __PRETTY_FUNCTION__ __FUNCSIG__ 6 | 7 | #endif 8 | 9 | typedef unsigned int uint; 10 | -------------------------------------------------------------------------------- /src/turbomind/models/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | add_subdirectory(llama) 16 | -------------------------------------------------------------------------------- /src/turbomind/models/llama/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | 5 | 6 | find_package(CUDAToolkit REQUIRED) 7 | 8 | add_library(Llama STATIC 9 | LlamaV2.cc 10 | LlamaBatch.cc 11 | LlamaLinear.cu 12 | BlockManager.cc 13 | BlockTrie.cc 14 | SequenceManager.cc 15 | LlamaWeight.cc 16 | LlamaDenseWeight.cc 17 | LlamaDecoderLayerWeight.cc 18 | LlamaFfnLayer.cc 19 | moe_ffn_layer.cc 20 | unified_decoder.cc 21 | unified_attention_layer.cc 22 | llama_kernels.cu 23 | llama_utils.cu 24 | mla_utils.cu) 25 | set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) 26 | set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) 27 | target_link_libraries(Llama PUBLIC CUDA::cudart 28 | engine 29 | core 30 | gemm2 31 | CUDA::cublas 32 | rms_norm 33 | DynamicDecodeLayer 34 | activation_kernels 35 | attention 36 | decoding_kernels 37 | unfused_attention_kernels 38 | gpt_kernels 39 | memory_utils 40 | cuda_utils 41 | logger 42 | anomaly_handler) 43 | -------------------------------------------------------------------------------- /src/turbomind/models/llama/copy.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | #include "src/turbomind/models/llama/llama_kernels.h" 6 | #include "src/turbomind/utils/cuda_utils.h" 7 | 8 | namespace turbomind { 9 | 10 | class BatchedCopy { 11 | public: 12 | template = 0> 13 | T* Add(const T* src, int size, T* dst) 14 | { 15 | src_.push_back((void*)src); 16 | dst_.push_back((void*)dst); 17 | size_.push_back(sizeof(T) * size); 18 | return dst + size; 19 | } 20 | 21 | void Submit(cudaStream_t stream) 22 | { 23 | if (size_.empty()) { 24 | return; 25 | } 26 | 27 | invokeBatchedCopy(src_.data(), dst_.data(), size_.data(), size_.size(), stream); 28 | sync_check_cuda_error(); 29 | 30 | src_.clear(); 31 | dst_.clear(); 32 | size_.clear(); 33 | } 34 | 35 | private: 36 | std::vector src_; 37 | std::vector dst_; 38 | std::vector size_; 39 | }; 40 | 41 | } // namespace turbomind 42 | -------------------------------------------------------------------------------- /src/turbomind/models/llama/mla_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | #pragma once 3 | 4 | #include 5 | 6 | #include "src/turbomind/core/data_type.h" 7 | 8 | namespace turbomind { 9 | 10 | void MLACopyQKV(DataType dtype, 11 | void* qkv, 12 | const void* q, 13 | const void* kv_a, 14 | const void* kv_b, 15 | int token_num, 16 | int head_num, 17 | int nope_dim, 18 | int rope_dim, 19 | int kv_lora_rank, 20 | int v_head_dim, 21 | cudaStream_t stream); 22 | 23 | } // namespace turbomind 24 | -------------------------------------------------------------------------------- /src/turbomind/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | 3 | cmake_minimum_required(VERSION 3.8) 4 | project(_turbomind) 5 | 6 | find_package(pybind11 CONFIG) 7 | if(NOT pybind11_FOUND) 8 | execute_process(COMMAND "pybind11-config" "--cmakedir" 9 | RESULT_VARIABLE _COMMAND_SUCCESS 10 | OUTPUT_VARIABLE pybind11_DIR 11 | OUTPUT_STRIP_TRAILING_WHITESPACE) 12 | find_package(pybind11 CONFIG) 13 | endif() 14 | 15 | pybind11_add_module(${PROJECT_NAME} bind.cpp) 16 | target_link_libraries(${PROJECT_NAME} PRIVATE LlamaTritonBackend) 17 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14) 18 | 19 | set(_INSTALL_CUDA_RPATH 20 | "\$ORIGIN" 21 | "\$ORIGIN/../../nvidia/nccl/lib/" 22 | "\$ORIGIN/../../nvidia/cuda_runtime/lib/" 23 | "\$ORIGIN/../../nvidia/cublas/lib/" 24 | "\$ORIGIN/../../nvidia/curand/lib/" 25 | ) 26 | set_target_properties(${PROJECT_NAME} PROPERTIES 27 | BUILD_RPATH "\$ORIGIN" 28 | INSTALL_RPATH "${_INSTALL_CUDA_RPATH}" 29 | ) 30 | -------------------------------------------------------------------------------- /src/turbomind/triton_backend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_subdirectory(llama) 3 | -------------------------------------------------------------------------------- /src/turbomind/utils/constant.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 2 | 3 | #pragma once 4 | 5 | namespace turbomind { 6 | 7 | const int kMaxLogProb = 1024; 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/turbomind/utils/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #ifdef ENABLE_BF16 20 | #include 21 | #endif 22 | -------------------------------------------------------------------------------- /src/turbomind/utils/debug_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if __has_include("3rdparty/dbg.h") 4 | #include "3rdparty/dbg.h" 5 | #else 6 | #define dbg(...) 7 | #endif 8 | -------------------------------------------------------------------------------- /src/turbomind/utils/dispatch.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved. 
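// dispatch(), defined below in this header, maps a runtime value onto one element of a
// compile-time integer list and hands it to a functor as a std::integral_constant, so
// the callee can use it as a template argument. A simplified standalone sketch (the
// original also supports forcing the last candidate via its G parameter, omitted here;
// the names below are illustrative):

#include <cstdio>
#include <utility>

template<class F, class P, class T, T... Xs>
bool dispatch_sketch(std::integer_sequence<T, Xs...>, P&& pred, F&& func)
{
    // Short-circuiting fold: stop at the first value the predicate accepts.
    return ((pred(std::integral_constant<T, Xs>{})
             && (func(std::integral_constant<T, Xs>{}), true))
            || ...);
}

int main()
{
    const int runtime_tile = 64;

    const bool hit = dispatch_sketch(
        std::integer_sequence<int, 16, 32, 64, 128>{},
        [&](auto c) { return c.value == runtime_tile; },                   // predicate
        [](auto c) { std::printf("instantiated for tile=%d\n", c.value); });

    if (!hit) {
        std::printf("no matching tile size\n");
    }
    return 0;
}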
2 | 3 | #pragma once 4 | 5 | #include <utility> 6 | 7 | namespace turbomind { 8 | 9 | namespace detail { 10 | 11 | template<class T, T v> 12 | inline constexpr std::integral_constant<T, v> _Int{}; 13 | 14 | template<class F, class P, class G, class T, T... Xs, size_t... Is> 15 | bool dispatch_impl(F&& f, P&& p, G g, std::integer_sequence<T, Xs...>, std::index_sequence<Is...>) 16 | { 17 | constexpr int N = sizeof...(Xs); 18 | return (((((P &&) p)(_Int<T, Xs>) || (g && Is == N - 1)) && (((F &&) f)(_Int<T, Xs>), 1)) || ...); 19 | } 20 | 21 | } // namespace detail 22 | 23 | template<class T, T... Xs, class P, class F, class G = std::false_type> 24 | bool dispatch(std::integer_sequence<T, Xs...> seq, P&& p, F&& f, G g = {}) 25 | { 26 | return detail::dispatch_impl((F &&) f, (P &&) p, g, seq, std::make_index_sequence<sizeof...(Xs)>{}); 27 | } 28 | 29 | template<class T, T... Xs, class F> 30 | bool dispatch(std::integer_sequence<T, Xs...> seq, F&& f) 31 | { 32 | return (((F &&) f)(detail::_Int<T, Xs>) || ...); 33 | } 34 | 35 | } // namespace turbomind 36 | -------------------------------------------------------------------------------- /src/turbomind/utils/memory_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include <cuda_runtime.h> 20 | 21 | namespace turbomind { 22 | 23 | template<typename T> 24 | void invokeInPlaceTranspose102( 25 | T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0); 26 | 27 | } // namespace turbomind 28 | -------------------------------------------------------------------------------- /src/turbomind/utils/monotonic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstddef> 4 | #include <cstdint> 5 | #include <utility> 6 | 7 | namespace turbomind { 8 | 9 | class Monotonic { 10 | public: 11 | Monotonic(void* base, size_t alignment = 256): ptr_{base}, alignment_{alignment} 12 | { 13 | ptr_ = align(ptr_); 14 | } 15 | 16 | template<class T> 17 | void operator()(T** ptr, size_t numel) noexcept 18 | { 19 | *ptr = (T*)std::exchange(ptr_, align((T*)ptr_ + numel)); 20 | } 21 | 22 | void* ptr() const noexcept 23 | { 24 | return ptr_; 25 | } 26 | 27 | private: 28 | template<class T> 29 | void* align(T* p) 30 | { 31 | static_assert(sizeof(T*) == sizeof(uintptr_t)); 32 | auto x = reinterpret_cast<uintptr_t>(p); 33 | if (auto remainder = x % alignment_) { 34 | x += alignment_ - remainder; 35 | } 36 | return reinterpret_cast<void*>(x); 37 | } 38 | 39 | void* ptr_; 40 | size_t alignment_; 41 | }; 42 | 43 | } // namespace turbomind 44 | -------------------------------------------------------------------------------- /src/turbomind/utils/parser.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) OpenMMLab. All rights reserved.
2 | 3 | #include <algorithm> 4 | #include <iterator> 5 | #include <regex> 6 | #include <vector> 7 | 8 | namespace turbomind { 9 | 10 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str) 11 | { 12 | const std::regex regex(R"((\w+)=([^,\[\(]+|\[.*\]|\(.*\)))"); 13 | 14 | std::sregex_iterator beg(str.begin(), str.end(), regex); 15 | std::sregex_iterator end{}; 16 | 17 | std::vector<std::pair<std::string, std::string>> ret; 18 | for (auto it = beg; it != end; ++it) { 19 | std::smatch match = *it; 20 | ret.emplace_back(match[1], match[2]); 21 | } 22 | 23 | return ret; 24 | } 25 | 26 | std::vector<std::string> ParseListOrTuple(const std::string& str) 27 | { 28 | const std::regex regex(R"([,\[\]\(\)]+)"); 29 | 30 | std::vector<std::string> ret; 31 | std::copy_if(std::sregex_token_iterator(str.begin(), str.end(), regex, -1), 32 | std::sregex_token_iterator{}, 33 | std::back_inserter(ret), 34 | [](const std::string& s) { return !s.empty(); }); 35 | 36 | return ret; 37 | } 38 | 39 | } // namespace turbomind 40 | -------------------------------------------------------------------------------- /src/turbomind/utils/parser.h: -------------------------------------------------------------------------------- 1 | #include <string> 2 | #include <vector> 3 | 4 | namespace turbomind { 5 | 6 | std::vector<std::pair<std::string, std::string>> ParseArgsList(const std::string& str); 7 | 8 | std::vector<std::string> ParseListOrTuple(const std::string& str); 9 | 10 | inline void Parse(int& value, const std::string& str) 11 | { 12 | value = std::stoi(str); 13 | } 14 | 15 | inline void Parse(float& value, const std::string& str) 16 | { 17 | value = std::stof(str); 18 | } 19 | 20 | template<class T> 21 | void Parse(std::vector<T>& xs, const std::string& str) 22 | { 23 | const auto ss = ParseListOrTuple(str); 24 | for (const auto& s : ss) { 25 | xs.emplace_back(); 26 | Parse(xs.back(), s); 27 | } 28 | } 29 | 30 | } // namespace turbomind 31 | -------------------------------------------------------------------------------- /tests/csrc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | add_subdirectory(unittests) 16 | -------------------------------------------------------------------------------- /tests/pytorch/kernel/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | 5 | class TestSiluAndMul: 6 | 7 | @pytest.fixture 8 | def seqlen(self, request): 9 | yield request.param 10 | 11 | @pytest.fixture 12 | def feat_size(self, request): 13 | yield request.param 14 | 15 | @pytest.fixture 16 | def x(self, seqlen, feat_size): 17 | yield torch.rand(seqlen, feat_size, dtype=torch.float16, device='cuda') 18 | 19 | @pytest.fixture 20 | def gt(self, x): 21 | gate, up = x.chunk(2, -1) 22 | gate = torch.nn.functional.silu(gate) 23 | yield gate * up 24 | 25 | @pytest.mark.parametrize('seqlen', [65536, 256], indirect=True) 26 | @pytest.mark.parametrize('feat_size', [4096, 768], indirect=True) 27 | def test_silu_and_mul(self, x, gt): 28 | from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul 29 | 30 | out = silu_and_mul(x) 31 | torch.testing.assert_close(out, gt) 32 | -------------------------------------------------------------------------------- /tests/pytorch/tools/test_layout_convert.py: -------------------------------------------------------------------------------- 1 | # yapf: disable 2 | import pytest 3 | import torch 4 | 5 | from lmdeploy.pytorch.tools.layout_convert import batch_tensor, continuous_tensor 6 | 7 | # yapf: enable 8 | 9 | 10 | class TestContinuous: 11 | 12 | @pytest.fixture 13 | def batched_tensor(self): 14 | yield torch.tensor([[1, 2, 3, 0, 0], [4, 5, 6, 7, 8], [9, 10, 0, 0, 0]]) 15 | 16 | @pytest.fixture 17 | def seq_len(self): 18 | yield torch.tensor([3, 5, 2]) 19 | 20 | @pytest.fixture 21 | def conti_tensor(self): 22 | yield torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) 23 | 24 | def test_conti_tensor(self, batched_tensor, seq_len, conti_tensor): 25 | conti_out = continuous_tensor(batched_tensor, seq_len) 26 | torch.testing.assert_close(conti_out, conti_tensor) 27 | 28 | batched_out = batch_tensor(conti_tensor, seq_len) 29 | torch.testing.assert_close(batched_out, batched_tensor) 30 | -------------------------------------------------------------------------------- /tests/test_lmdeploy/test_async_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from lmdeploy.serve.async_engine import get_names_from_model 5 | 6 | 7 | def test_get_names_from_hf_model(): 8 | cases = [ 9 | # model repo_id from huggingface hub, model_name, chat_template_name 10 | ('InternLM/internlm2_5-7b-chat', 'internlm2.5-7b-chat', 'internlm2'), 11 | ('InternLM/internlm2_5-7b-chat', None, 'internlm2'), 12 | ] 13 | for model_path, model_name, chat_template in cases: 14 | _model_name, _chat_template = get_names_from_model(model_path=model_path, model_name=model_name) 15 | assert _chat_template == chat_template 16 | assert _model_name == (model_name if model_name else model_path) 17 | 18 | 19 | def test_get_names_from_turbomind_model(): 20 | workspace = tempfile.TemporaryDirectory('internlm2_5-7b-chat').name 21 | os.makedirs(os.path.join(workspace, 'triton_models', 'weights'), exist_ok=True) 22 | 23 | import yaml 24 | 25 | expected_chat_template = 'internlm2' 26 | config = dict(model_config=dict(chat_template=expected_chat_template)) 27 | with open(f'{workspace}/triton_models/weights/config.yaml', 'w') as f: 28 | yaml.safe_dump(config, f) 29 | 30 | _, chat_template = get_names_from_model(workspace) 31 |
assert chat_template == expected_chat_template 32 | -------------------------------------------------------------------------------- /tests/test_lmdeploy/test_messages.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | 5 | from lmdeploy import GenerationConfig, Tokenizer 6 | from lmdeploy.utils import get_hf_gen_cfg 7 | 8 | 9 | def test_engine_generation_config(): 10 | tokenizer = Tokenizer('internlm/internlm-chat-7b') 11 | config = GenerationConfig(n=3, stop_words=['<eoa>']) 12 | stop_token_ids = tokenizer.encode('<eoa>', add_bos=False) 13 | config.convert_stop_bad_words_to_ids(tokenizer) 14 | assert stop_token_ids == config.stop_token_ids 15 | assert isinstance(config.stop_token_ids, List) and \ 16 | isinstance(config.stop_token_ids[0], int) 17 | 18 | 19 | @pytest.mark.parametrize('model_path', [ 20 | 'deepseek-ai/DeepSeek-V3', 21 | 'Qwen/Qwen2.5-32B-Instruct', 22 | 'internlm/internlm3-8b-instruct', 23 | ]) 24 | def test_update_from_hf_gen_cfg(model_path): 25 | tokenizer = Tokenizer(model_path) 26 | model_cfg = get_hf_gen_cfg(model_path) 27 | 28 | generation_config = GenerationConfig() 29 | generation_config.update_from_hf_gen_cfg(model_cfg, tokenizer.eos_token_id) 30 | assert generation_config.stop_token_ids is not None 31 | -------------------------------------------------------------------------------- /tests/test_lmdeploy/test_utils.py: -------------------------------------------------------------------------------- 1 | # yapf: disable 2 | from transformers import AutoConfig 3 | 4 | from lmdeploy.turbomind.deploy.config import ModelConfig, TurbomindModelConfig, config_from_dict 5 | from lmdeploy.utils import _get_and_verify_max_len 6 | 7 | # yapf: enable 8 | 9 | 10 | def test_get_and_verify_max_len(): 11 | # with PretrainedConfig 12 | config = AutoConfig.from_pretrained('OpenGVLab/InternVL-Chat-V1-5-AWQ', trust_remote_code=True) 13 | assert (_get_and_verify_max_len(config, None) == 32768) 14 | assert (_get_and_verify_max_len(config, 1024) == 1024) 15 | assert (_get_and_verify_max_len(config, 102400) == 102400) 16 | 17 | # with PretrainedConfig 18 | config = AutoConfig.from_pretrained('internlm/internlm2-chat-7b', trust_remote_code=True) 19 | assert (_get_and_verify_max_len(config, None) == 32768) 20 | assert (_get_and_verify_max_len(config, 1024) == 1024) 21 | assert (_get_and_verify_max_len(config, 102400) == 102400) 22 | 23 | # with TurbomindModelConfig 24 | config = config_from_dict(TurbomindModelConfig, {}) 25 | config.model_config = config_from_dict(ModelConfig, dict(session_len=4096)) 26 | assert (_get_and_verify_max_len(config, None) == config.session_len) 27 | assert (_get_and_verify_max_len(config, 1024) == 1024) 28 | -------------------------------------------------------------------------------- /tests/test_lmdeploy/test_vl/test_vl_encode.py: -------------------------------------------------------------------------------- 1 | # yapf: disable 2 | from lmdeploy.vl.utils import encode_image_base64, load_image, load_image_from_base64 3 | 4 | # yapf: enable 5 | 6 | 7 | def test_encode_image_base64(): 8 | url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 9 | im1 = load_image(url) 10 | base64 = encode_image_base64(url) 11 | im2 = load_image_from_base64(base64) 12 | assert im1 == im2.convert('RGB') 13 | 14 | 15 | def test_load_truncated_image(): 16 | url = 'https://github.com/irexyc/lmdeploy/releases/download/v0.0.1/tr.jpeg' 17 | im = load_image(url) 18
| assert im.width == 1638 19 | assert im.height == 2048 20 | 21 | 22 | def test_load_invalid_url(): 23 | url = ('https://raw.githubusercontent.com/open-mmlab/' 24 | 'mmdeploy/main/tests/data/tiger.jpeg') 25 | # invalid 26 | im1 = load_image(url[:-1]) 27 | assert im1.width == 32 28 | assert im1.height == 32 29 | # valid 30 | im2 = load_image(url) 31 | assert im2.height == 182 32 | assert im2.width == 278 33 | 34 | 35 | def test_load_invalid_base64(): 36 | base64 = 'data:image/jpeg;base64,xxx' 37 | im = load_image(base64) 38 | assert im.width == 32 39 | assert im.height == 32 40 | --------------------------------------------------------------------------------
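
Usage note (illustrative sketch, not a file in the repository): dispatch.h and monotonic.h above are small header-only utilities that ship without accompanying documentation. The standalone C++ program below shows how they are typically combined: dispatch selects a compile-time specialization that matches a runtime value, while Monotonic carves aligned, typed sub-buffers out of one pre-allocated workspace. It relies on the template signatures as reconstructed above, and the file name, candidate head_dim values, buffer size, and messages are hypothetical.

// illustrative_dispatch_monotonic.cc: hypothetical example, not part of the repository
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

#include "src/turbomind/utils/dispatch.h"
#include "src/turbomind/utils/monotonic.h"

using namespace turbomind;

int main()
{
    // dispatch: walk the compile-time candidates and stop at the first callback that returns true.
    const int  head_dim = 128;  // value only known at runtime
    const bool hit      = dispatch(std::integer_sequence<int, 64, 96, 128>{}, [&](auto dim) {
        if (dim.value != head_dim) {
            return false;  // not this candidate, try the next one
        }
        // `dim` is a std::integral_constant, so decltype(dim)::value can parameterize a kernel template
        std::printf("would launch the kernel specialized for head_dim=%d\n", dim.value);
        return true;
    });

    // Monotonic: bump-allocate typed, 256-byte-aligned views over a single pre-allocated workspace.
    std::vector<char> workspace(1 << 20);
    Monotonic         alloc(workspace.data());
    float* logits{};
    int*   ids{};
    alloc(&logits, 1024);  // reserve 1024 floats, advance the cursor to the next aligned address
    alloc(&ids, 256);      // reserve 256 ints right after the logits block
    std::printf("dispatch hit: %d, workspace bytes consumed: %zu\n",
                hit ? 1 : 0,
                static_cast<std::size_t>(static_cast<char*>(alloc.ptr()) - workspace.data()));
    return 0;
}

Both headers include only standard library headers, so compiling this sketch should require nothing beyond a C++17 compiler and the repository root on the include path.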