├── .clang-format ├── .flake8 ├── .github ├── actions │ └── rerun-workflow │ │ ├── action.yml │ │ └── rerun.sh ├── copilot-instructions.md ├── pull_request_template.md └── workflows │ ├── CheckPRTemplate.yml │ ├── Codestyle-Check.yml │ ├── _accuracy_test.yml │ ├── _base_test.yml │ ├── _build_linux.yml │ ├── _ci_gcu.yml │ ├── _ci_image_build.yml │ ├── _clone_linux.yml │ ├── _logprob_test_linux.yml │ ├── _pre_ce_test.yml │ ├── _stable_test.yml │ ├── _unit_test_coverage.yml │ ├── approve.yml │ ├── ce_job.yml │ ├── check-bypass.yml │ ├── ci_hpu.yml │ ├── ci_iluvatar.yml │ ├── ci_image_update.yml │ ├── ci_xpu.yml │ ├── gh-pages.yml │ ├── pr_build_and_test.yml │ ├── publish_job.yml │ ├── remove-skip-ci-labels.yml │ └── rerun.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── README_CN.md ├── README_EN.md ├── benchmarks ├── README.md ├── backend_request_func.py ├── benchmark_dataset.py ├── benchmark_mtp.py ├── benchmark_serving.py ├── benchmark_utils.py ├── paddleocr_vl │ ├── PaddleOCR-VL.yaml │ ├── README.md │ └── benchmark.py ├── quick_benchmark.py ├── requirements.txt └── yaml │ ├── GLM45-air-32k-bf16.yaml │ ├── GLM45-air-32k-wfp8afp8.yaml │ ├── deepseek-32k-tp8-wint4.yaml │ ├── eb45-128k-wint4-a800-tp8.yaml │ ├── eb45-128k-wint4-p800-tp8.yaml │ ├── eb45-128k-wint4-tp1-plas.yaml │ ├── eb45-128k-wint8-a800-tp8.yaml │ ├── eb45-21B-vl-128k-wint4-h800-tp1.yaml │ ├── eb45-21b-a3b-32k-bf16.yaml │ ├── eb45-21b-a3b-32k-wint4-a10.yaml │ ├── eb45-21b-a3b-32k-wint4.yaml │ ├── eb45-21b-a3b-32k-wint8.yaml │ ├── eb45-32k-bf16-a30-tp1.yaml │ ├── eb45-32k-blockwise-fp8-h800-tp8.yaml │ ├── eb45-32k-tensorwise-fp8-h800-tp8.yaml │ ├── eb45-32k-w4a8c8-a800-tp4.yaml │ ├── eb45-32k-w4a8c8-tp4_decode.yaml │ ├── eb45-32k-w4a8c8-tp4_prefill.yaml │ ├── eb45-32k-wint2-h20-tp1.yaml │ ├── eb45-32k-wint2-tp4.yaml │ ├── eb45-32k-wint4-a800-tp4.yaml │ ├── eb45-32k-wint4-h800-dp8_decode.yaml │ ├── eb45-32k-wint4-h800-dp8_prefill.yaml │ ├── eb45-32k-wint4-mtp-h800-tp4.yaml │ ├── eb45-32k-wint4-mtp-tp4-decode.yaml │ ├── eb45-32k-wint4-mtp-tp4-prefill.yaml │ ├── eb45-32k-wint4-p800-tp4.yaml │ ├── eb45-32k-wint4-p800-tp8.yaml │ ├── eb45-32k-wint4-prefixcache-a800-tp4.yaml │ ├── eb45-32k-wint4-tp4_decode.yaml │ ├── eb45-32k-wint4-tp4_prefill.yaml │ ├── eb45-32k-wint8-a800-tp8.yaml │ ├── eb45-32k-wint8-p800-tp8.yaml │ ├── eb45-32k-wint8-prefixcache-a800-tp8.yaml │ ├── eb45-8k-fp8-tp1-dp8_ep.yaml │ ├── eb45-vl-128k-wint4-h800-tp8.yaml │ ├── eb45-vl-28b-thinking-128k-wint8.yaml │ ├── eb45-vl-28b-thinking-32k-wint8.yaml │ ├── eb45-vl-32k-wint4-a800-tp8.yaml │ ├── eb45-vl-32k-wint4-h800-tp8.yaml │ ├── eb45-vl-32k-wint4-tp4.yaml │ ├── eb45-vl-32k-wint8-a800-tp8.yaml │ ├── eb45-vl-32k-wint8-h800-tp8.yaml │ ├── eb45-vl-32k-wint8-tp4.yaml │ ├── eb45-vl-lite-32k-bf16-a800-tp1.yaml │ ├── eb45-vl-lite-32k-wint4-a800-tp1.yaml │ ├── eb45-vl-lite-32k-wint8-a800-tp1.yaml │ ├── eb45t_0dot3b-32k-bf16-a30-tp1-static.yaml │ ├── eb45t_0dot3b-32k-bf16-h800-tp1-static.yaml │ ├── eb45t_0dot3b-32k-wint8-a30-tp1-static.yaml │ ├── eb45t_0dot3b-32k-wint8-h800-tp1-static.yaml │ ├── eb45t_21b-32k-bf16-h800-tp1-static.yaml │ ├── eb45t_21b-32k-wint4-h800-tp1-static.yaml │ ├── eb45t_300b-32k-wint4-h800-tp4-static.yaml │ ├── qwen25_7b-vl-32k-bf16.yaml │ ├── qwen2_7b-32k-bf16-a30-tp1-static.yaml │ ├── qwen2_7b-32k-bf16-h800-tp1-static.yaml │ ├── qwen2_7b-32k-bf16-h800-tp1.yaml │ ├── qwen2_7b-32k-fp8-h800-tp1-static.yaml │ ├── qwen2_7b-32k-fp8-h800-tp1.yaml │ ├── qwen2_7b-32k-wint8-h800-tp1.yaml │ ├── qwen3-235b-32k-fp8-tp1-dp4_decode.yaml │ ├── qwen3-235b-32k-fp8-tp1-dp4_prefill.yaml │ ├── qwen3_0dot6b-32k-bf16-a30-tp1-static.yaml │ ├── qwen3_0dot6b-32k-bf16-h800-tp1-static.yaml │ ├── qwen3_0dot6b-32k-wint8-a30-tp1-static.yaml │ ├── qwen3_0dot6b-32k-wint8-h800-tp1-static.yaml │ ├── qwen3_30b-32k-bf16-h800-tp1-static.yaml │ ├── qwen3_30b-32k-wint4-h800-tp1-static.yaml │ ├── qwen3dot6b-32k-bf16-a30-tp1.yaml │ ├── qwen3dot6b-32k-bf16-a800-tp1.yaml │ ├── qwen3dot6b-32k-bf16-h800-tp1.yaml │ ├── qwen3dot6b-32k-wint8-a30-tp1.yaml │ ├── qwen3dot6b-32k-wint8-a800-tp1.yaml │ ├── qwen3dot6b-32k-wint8-h800-tp1.yaml │ ├── qwen3moe235b-32k-wint4-h800-tp4.yaml │ ├── qwen3moe235b-32k-wint8-h800-tp4.yaml │ ├── qwen3moe30b-32k-bf16-a800-tp1.yaml │ ├── qwen3moe30b-32k-bf16-h800-tp1.yaml │ ├── qwen3moe30b-32k-wint4-a800-tp1.yaml │ ├── qwen3moe30b-32k-wint4-h800-tp1.yaml │ ├── request_yaml │ ├── GLM-32k.yaml │ ├── deepseek-32k.yaml │ ├── eb45-128k.yaml │ ├── eb45-32k.yaml │ ├── eb45-vl-128k.yaml │ ├── eb45-vl-32k.yaml │ ├── quick_benchmark.yaml │ ├── qwen2-32k.yaml │ ├── qwen25-vl-32k.yaml │ ├── qwen3-32k.yaml │ ├── request.yaml │ ├── x1-128k.yaml │ └── x1-32k.yaml │ ├── x1-32k-wint4-h800-tp8.yaml │ ├── x1-32k-wint4-p800-tp4.yaml │ ├── x1-32k-wint4-p800-tp8.yaml │ ├── x1-32k-wint4-prefixcache-h800-tp8.yaml │ ├── x1-32k-wint8-h800-tp8.yaml │ ├── x1-32k-wint8-p800-tp4.yaml │ ├── x1-32k-wint8-p800-tp8.yaml │ ├── x1-32k-wint8-prefixcache-h800-tp8.yaml │ ├── x1-64k-w4a8c8-tp4.yaml │ └── x1-a3b-128k-wint8-h800-tp1.yaml ├── custom_ops ├── 0001-DeepGEMM-95e81b3.patch ├── MANIFEST.in ├── cpu_ops │ ├── avx_weight_only_fake.cc │ ├── get_padding_offset.cc │ ├── rebuild_padding.cc │ ├── set_value_by_flags.cc │ ├── simd_sort.cc │ ├── simd_sort_fake.cc │ ├── stop_generation_multi_ends.cc │ ├── token_penalty_multi_scores.cc │ ├── update_inputs.cc │ ├── xft_all_layer_fake.cc │ └── xft_greedy_search_fake.cc ├── gpu_ops │ ├── append_attention.cu │ ├── append_attn │ │ ├── append_attention_c16_impl.cuh │ │ ├── append_attention_c4_impl.cuh │ │ ├── append_attention_c8_impl.cuh │ │ ├── append_attention_func.cuh │ │ ├── append_attention_kernel.h │ │ ├── decode_attention_func.cuh │ │ ├── decoder_mla_attention_kernel.cu │ │ ├── decoder_mla_attention_kernel.h │ │ ├── decoder_write_cache_with_rope_impl.cuh │ │ ├── decoder_write_cache_with_rope_kernel.cu │ │ ├── decoder_write_cache_with_rope_kernel.h │ │ ├── encoder_write_cache_with_rope_impl.cuh │ │ ├── encoder_write_cache_with_rope_kernel.h │ │ ├── get_block_shape_and_split_kv_block.cu │ │ ├── gqa_rope_write_cache.cu │ │ ├── mem_util.cuh │ │ ├── mla_cache_kernel.cu │ │ ├── mla_cache_kernel.cuh │ │ ├── mma_tensor_op.cuh │ │ ├── multiquery_attention_c16_impl.cuh │ │ ├── multiquery_attention_c16_kernel.h │ │ ├── multiquery_attention_c4_impl.cuh │ │ ├── multiquery_attention_c4_kernel.h │ │ ├── multiquery_attention_c8_impl.cuh │ │ ├── multiquery_attention_c8_kernel.h │ │ ├── multiquery_decoder_attention_impl.cuh │ │ ├── multiquery_decoder_attention_kernel.h │ │ ├── pre_cache_len_concat.cu │ │ ├── speculate_write_cache_with_rope_impl.cuh │ │ ├── speculate_write_cache_with_rope_kernel.cu │ │ ├── speculate_write_cache_with_rope_kernel.h │ │ ├── template_config.json │ │ ├── template_instantiation │ │ │ ├── encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu │ │ │ ├── encoder_write_cache_with_rope_bfloat16_int_kernel.cu │ │ │ ├── encoder_write_cache_with_rope_float16_float16_kernel.cu │ │ │ └── encoder_write_cache_with_rope_float16_int_kernel.cu │ │ └── utils.cuh │ ├── beam_search_softmax.cu │ ├── common │ │ ├── configManager.h │ │ ├── cudaUtils.h │ │ └── quantization.h │ ├── cpp_extensions.cc │ ├── cuda_multiprocess.h │ ├── custom_all_reduce │ │ ├── all_reduce.cu │ │ └── all_reduce.cuh │ ├── cutlass_extensions │ │ ├── arch │ │ │ ├── copy_red_global.hpp │ │ │ ├── memory_copy_sm80.h │ │ │ └── mma.h │ │ ├── compute_occupancy.h │ │ ├── epilogue │ │ │ ├── broadcast_load_epilogue_array_c3x.hpp │ │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ │ ├── scaled_mm_epilogues_c3x.hpp │ │ │ ├── thread │ │ │ │ └── fused_activations.h │ │ │ └── threadblock │ │ │ │ ├── epilogue_per_row_per_col_scale.h │ │ │ │ └── epilogue_tensor_op_int32.h │ │ ├── epilogue_helpers.h │ │ ├── gemm │ │ │ ├── collective │ │ │ │ ├── builders │ │ │ │ │ └── sm90_gmma_builder_gated.inl │ │ │ │ ├── collective_builder.hpp │ │ │ │ ├── collective_builder_gated.hpp │ │ │ │ ├── collective_mma_gated.hpp │ │ │ │ ├── fp8_accumulation.hpp │ │ │ │ ├── sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp │ │ │ │ ├── sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp │ │ │ │ └── sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp │ │ │ ├── device │ │ │ │ ├── gemm_universal_base_compat.h │ │ │ │ └── splitk_gemm_grouped.h │ │ │ ├── dispatch_policy.hpp │ │ │ ├── kernel │ │ │ │ ├── default_fpA_intB_traits.h │ │ │ │ ├── default_int8_traits.h │ │ │ │ ├── fpA_intB_gemm.h │ │ │ │ ├── gemm_moe_problem_visitor.h │ │ │ │ ├── gemm_universal_gated.hpp │ │ │ │ ├── gemm_with_epilogue_visitor.h │ │ │ │ ├── mixed_gemm_B_layout.h │ │ │ │ ├── moe_problem_visitor.h │ │ │ │ ├── sm90_gemm_gated_tma_warpspecialized_cooperative.hpp │ │ │ │ ├── sm90_gemm_gated_tma_warpspecialized_pingpong.hpp │ │ │ │ └── splitk_gemm_grouped.h │ │ │ ├── threadblock │ │ │ │ ├── default_dq_mma.h │ │ │ │ ├── default_dq_mma_multistage.h │ │ │ │ ├── default_dq_mma_pipelined.h │ │ │ │ ├── default_mma.h │ │ │ │ ├── default_mma_bf16.h │ │ │ │ ├── default_mma_core.h │ │ │ │ ├── default_wint2x_mma.h │ │ │ │ ├── dq_mma_base.h │ │ │ │ ├── dq_mma_multistage.h │ │ │ │ ├── dq_mma_multistage_finegrained.h │ │ │ │ ├── dq_mma_multistage_percol.h │ │ │ │ ├── dq_mma_pipelined.h │ │ │ │ ├── dq_mma_pipelined_finegrained.h │ │ │ │ ├── dq_mma_pipelined_percol.h │ │ │ │ ├── wint2x_mma_base.h │ │ │ │ ├── wint2x_mma_multistage.h │ │ │ │ ├── wint2x_params_accessor.h │ │ │ │ └── wint2x_unzip.h │ │ │ └── warp │ │ │ │ ├── default_mma_tensor_op.h │ │ │ │ ├── mma_tensorop_compute_B_with_f16.h │ │ │ │ ├── mma_tensorop_dequantizer.h │ │ │ │ └── mma_tensorop_wint2x_dequantizer.h │ │ ├── gemm_configs.h │ │ ├── interleaved_numeric_conversion.h │ │ ├── tile_interleaved_layout.h │ │ ├── transform │ │ │ └── threadblock │ │ │ │ └── fine_grained_scale_zero_iterator.h │ │ ├── util │ │ │ └── gather_tensor.hpp │ │ ├── weight_only_quant_op.h │ │ └── wint_type_traits.h │ ├── cutlass_kernels │ │ ├── cutlass_helper.h │ │ ├── cutlass_heuristic.cu │ │ ├── cutlass_heuristic.h │ │ ├── cutlass_preprocessors.cu │ │ ├── cutlass_preprocessors.h │ │ ├── cutlass_type_conversion.h │ │ ├── fp8_gemm_fused │ │ │ ├── dual_gemm │ │ │ │ ├── device │ │ │ │ │ └── dual_gemm.h │ │ │ │ ├── dual_gemm_common.h │ │ │ │ ├── kernel │ │ │ │ │ └── dual_gemm.h │ │ │ │ ├── thread │ │ │ │ │ ├── left_gelu_and_mul.h │ │ │ │ │ └── left_silu_and_mul.h │ │ │ │ └── threadblock │ │ │ │ │ ├── dual_epilogue.h │ │ │ │ │ ├── dual_mma_base.h │ │ │ │ │ └── dual_mma_multistage.h │ │ │ ├── fp8_fp8_dual_gemm_scale_bias_act.h │ │ │ ├── fp8_fp8_gemm_scale_bias_act.h │ │ │ ├── fuse_block_gemm_act_template_3x.h │ │ │ ├── fuse_dual_gemm_act_template_3x.h │ │ │ ├── fuse_dual_gemm_geglu_template.h │ │ │ ├── fuse_dual_gemm_swiglu_template.h │ │ │ ├── fuse_gemm_act_template_3x.h │ │ │ ├── fuse_gemm_gelu_template.h │ │ │ ├── fuse_gemm_noact_template.h │ │ │ ├── fuse_gemm_relu_template.h │ │ │ ├── per_channel_fp8_fp8_half_gemm.h │ │ │ ├── visitor_fp8_gemm_fused.h │ │ │ └── visitor_fp8_gemm_fused_template.h │ │ ├── fpA_intB_gemm │ │ │ ├── fpA_intB_gemm.h │ │ │ └── fpA_intB_gemm_template.h │ │ ├── moe_gemm │ │ │ ├── fused_moe_cutlass_kernel.h │ │ │ ├── fused_moe_gemm_kernels.h │ │ │ ├── fused_moe_gemm_kernels_bf16_bf16.cu │ │ │ ├── fused_moe_gemm_kernels_bf16_int2.cu │ │ │ ├── fused_moe_gemm_kernels_bf16_int4.cu │ │ │ ├── fused_moe_gemm_kernels_bf16_int8.cu │ │ │ ├── fused_moe_gemm_kernels_fp16_fp16.cu │ │ │ ├── fused_moe_gemm_kernels_fp16_int2.cu │ │ │ ├── fused_moe_gemm_kernels_fp16_int4.cu │ │ │ ├── fused_moe_gemm_kernels_fp16_int8.cu │ │ │ └── fused_moe_gemm_kernels_template.h │ │ ├── w4a8_moe │ │ │ ├── base64_encode.h │ │ │ ├── compile_w4a8_moe.sh │ │ │ ├── cuda_utils.h │ │ │ ├── cutlass_extensions │ │ │ │ ├── arch │ │ │ │ │ ├── mma.h │ │ │ │ │ └── mma_sm80.h │ │ │ │ ├── compute_occupancy.h │ │ │ │ ├── epilogue │ │ │ │ │ ├── epilogue_quant_helper.h │ │ │ │ │ └── threadblock │ │ │ │ │ │ ├── epilogue_per_row_per_col_scale_nf4.h │ │ │ │ │ │ └── epilogue_tensor_op_int32.h │ │ │ │ ├── epilogue_helpers.h │ │ │ │ ├── ft_gemm_configs.h │ │ │ │ ├── gemm │ │ │ │ │ ├── kernel │ │ │ │ │ │ ├── default_dequant_gemm_nf4.h │ │ │ │ │ │ ├── default_intA_nf4B_traits.h │ │ │ │ │ │ ├── gemm_with_epilogue_visitor_interleaved_nf4.h │ │ │ │ │ │ └── mixed_gemm_B_layout.h │ │ │ │ │ ├── threadblock │ │ │ │ │ │ ├── default_mma_nf4_int8_interleaved.h │ │ │ │ │ │ ├── default_nf4_int8_interleaved_mma.h │ │ │ │ │ │ ├── default_nf4_int8_interleaved_mma_multistage.h │ │ │ │ │ │ ├── int8_mma_base.h │ │ │ │ │ │ ├── int8_mma_multistage.h │ │ │ │ │ │ ├── int8_mma_pipelined.h │ │ │ │ │ │ ├── nf4_int8_mma_base.h │ │ │ │ │ │ └── nf4_int8_mma_multistage.h │ │ │ │ │ └── warp │ │ │ │ │ │ ├── default_mma_tensor_op.h │ │ │ │ │ │ ├── mma_tensorop_compute_B_with_f16.h │ │ │ │ │ │ └── mma_tensorop_dequantizer.h │ │ │ │ ├── interleaved_numeric_conversion.h │ │ │ │ ├── interleaved_numeric_conversion_nf4.h │ │ │ │ └── tile_interleaved_layout.h │ │ │ ├── cutlass_heuristic_w4a4.h │ │ │ ├── w4a4_gemm_configs.h │ │ │ ├── w4a8_gemm_grouped.h │ │ │ ├── w4a8_moe_cutlass_kernel.h │ │ │ ├── w4a8_moe_cutlass_kernel_template.cu │ │ │ ├── w4a8_moe_gemm_config_search.sh │ │ │ ├── w4a8_moe_gemm_kernel.h │ │ │ ├── w4a8_moe_gemm_kernel_template.h │ │ │ ├── w4a8_moe_gemm_test.cu │ │ │ ├── w4a8_moe_gemm_with_epilogue_visitor.h │ │ │ └── weight_process_utils.h │ │ ├── w8a8 │ │ │ ├── c3x │ │ │ │ ├── cutlass_gemm_caller.cuh │ │ │ │ ├── scaled_mm.cuh │ │ │ │ ├── scaled_mm_azp_sm90_int8.cu │ │ │ │ ├── scaled_mm_helper.hpp │ │ │ │ ├── scaled_mm_kernels.hpp │ │ │ │ ├── scaled_mm_sm90_fp8.cu │ │ │ │ ├── scaled_mm_sm90_fp8_dispatch.cuh │ │ │ │ ├── scaled_mm_sm90_int8.cu │ │ │ │ └── scaled_mm_sm90_int8_dispatch.cuh │ │ │ ├── scaled_mm_c2x.cu │ │ │ ├── scaled_mm_c2x.cuh │ │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ │ ├── scaled_mm_c3x_sm90.cu │ │ │ └── scaled_mm_entry.cu │ │ └── weight_process_utils.h │ ├── dequant_int8.cu │ ├── enforce_generation.cu │ ├── env.h │ ├── flash_mask_attn │ │ ├── flash_mask_attn.cu │ │ ├── flash_mask_attn_kernel.hpp │ │ ├── kernel_traits.h │ │ ├── mainloop_attn.hpp │ │ ├── softmax.hpp │ │ └── utils.hpp │ ├── fp8_gemm_with_cutlass │ │ ├── fp8_common.h │ │ ├── fp8_fp8_fp8_dual_gemm.cu │ │ ├── fp8_fp8_half_block_gemm.cu │ │ ├── fp8_fp8_half_cuda_core_gemm.cu │ │ ├── fp8_fp8_half_cuda_core_gemm.h │ │ ├── fp8_fp8_half_gemm.cu │ │ └── per_channel_fp8_fp8_half_gemm.cu │ ├── fused_get_rotary_embedding.cu │ ├── fused_hadamard_quant_fp8.cu │ ├── fused_neox_rope_embedding.cu │ ├── fused_rotary_position_encoding.cu │ ├── gather_idx.cu │ ├── gelu_tanh.cu │ ├── get_data_ptr_ipc.cu │ ├── get_img_boundaries.cc │ ├── get_mm_split_fuse.cc │ ├── get_output.cc │ ├── get_output_ep.cc │ ├── get_output_msg_with_topk.cc │ ├── get_padding_offset.cu │ ├── get_padding_offset_system.cu │ ├── get_position_ids_and_mask_encoder_batch.cu │ ├── glog │ │ └── logging.h │ ├── helper.cu │ ├── helper.h │ ├── init_signal_layerwise.cc │ ├── int8_gemm_with_cutlass │ │ ├── epilogue_tensor_op_int32.h │ │ ├── gemm_dequant.cu │ │ └── gemm_dequant.h │ ├── ipc_sent_key_value_cache_by_remote_ptr.cu │ ├── limit_thinking_content_length_v1.cu │ ├── limit_thinking_content_length_v2.cu │ ├── machete │ │ ├── generate.py │ │ ├── machete_collective_builder.cuh │ │ ├── machete_cutlass_library_extension.py │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm.cu │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_B.cu │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ ├── machete_supported_schedules.cu │ │ └── utils │ │ │ ├── cute_utils.cuh │ │ │ ├── machete_collective_builder.cuh │ │ │ ├── machete_custom_types.cuh │ │ │ ├── machete_numeric_conversion.cuh │ │ │ ├── machete_type_utils.cuh │ │ │ ├── paddle_utils.hpp │ │ │ └── scalar_type.h │ ├── merge_prefill_decode_output.cu │ ├── mla_attn │ │ ├── attention_updater.cuh │ │ ├── batch_mla_with_paged_kv_cache.cu │ │ ├── batch_mla_with_paged_kv_cache.h │ │ ├── epilogue.cuh │ │ ├── kernel_traits.cuh │ │ ├── mainloop_load.cuh │ │ ├── mainloop_mma.cuh │ │ ├── mla_hopper.cuh │ │ ├── named_barrier.cuh │ │ └── utils.cuh │ ├── moba_attn │ │ ├── moba_attn.cu │ │ ├── moba_attn.h │ │ ├── moba_attn_utils.hpp │ │ ├── moba_decoder_attn │ │ │ ├── moba_decoder_attn.cu │ │ │ ├── moba_decoder_attn_kernel.h │ │ │ ├── moba_decoder_write_cache.cu │ │ │ └── moba_qk_sort_decoder.cu │ │ ├── moba_encoder_attn │ │ │ ├── kernel_traits.h │ │ │ ├── mainloop_attn.hpp │ │ │ ├── moba_encoder_attn.cu │ │ │ ├── moba_encoder_write_cache.cu │ │ │ ├── moba_qk_sort_encoder.cu │ │ │ └── softmax.hpp │ │ └── moba_process │ │ │ ├── moba_get_kv_from_cache.cu │ │ │ ├── moba_mlp_einsum.cu │ │ │ ├── moba_qk_gemm.cu │ │ │ └── split_qkv_and_rope.cu │ ├── moe │ │ ├── deepgemm_preprocess.cu │ │ ├── ep_moe_expert_dispatch.cu │ │ ├── fused_moe.cu │ │ ├── fused_moe_helper.h │ │ ├── fused_moe_imp_op.h │ │ ├── fused_moe_op.h │ │ ├── gptq_marlin_repack.cu │ │ ├── group_swiglu_with_masked.cu │ │ ├── group_swiglu_with_masked.h │ │ ├── moe_deepgemm_depermute.cu │ │ ├── moe_deepgemm_permute.cu │ │ ├── moe_dispatch.cu │ │ ├── moe_expert_ffn_wint2.cu │ │ ├── moe_fast_hardamard_impl.cuh │ │ ├── moe_fast_hardamard_impl_common.h │ │ ├── moe_fast_hardamard_kernel.cu │ │ ├── moe_fast_hardamard_kernel.h │ │ ├── moe_ffn.cu │ │ ├── moe_reduce.cu │ │ ├── moe_redundant_topk_select.cu │ │ ├── moe_topk_select.cu │ │ ├── moe_wna16_marlin_gemm.cu │ │ ├── moe_wna16_marlin_gemm.h │ │ ├── moe_wna16_marlin_utils │ │ │ ├── CUDAStream.h │ │ │ ├── ScalarType.h │ │ │ ├── dequant.h │ │ │ ├── generate_kernels.py │ │ │ ├── kernel.h │ │ │ ├── marlin.cuh │ │ │ ├── marlin_dtypes.cuh │ │ │ ├── marlin_template.h │ │ │ └── types.h │ │ ├── swigluoai.cu │ │ ├── swigluoai.h │ │ ├── template_config.json │ │ ├── tritonmoe_preprocess.cu │ │ └── winx_unzip.cu │ ├── msg_utils.h │ ├── multi_head_latent_attention.cu │ ├── ngram_mask.cu │ ├── noaux_tc.cu │ ├── noaux_tc_redundant.cu │ ├── noauxtc_kernel.h │ ├── open_shm_and_get_meta_signal.cc │ ├── per_token_quant_fp8.cu │ ├── quantization │ │ ├── common.cu │ │ └── common.cuh │ ├── read_data_ipc.cu │ ├── read_ids.py │ ├── read_temp_ids.py │ ├── rebuild_padding.cu │ ├── recover_decode_task.cu │ ├── remote_cache_kv_ipc.cc │ ├── remote_cache_kv_ipc.h │ ├── sample_kernels │ │ ├── air_top_p_sampling.cu │ │ ├── min_p_sampling_from_probs.cu │ │ ├── rejection_top_p_sampling.cu │ │ ├── sampling.cuh │ │ ├── top_k_renorm_probs.cu │ │ └── utils.cuh │ ├── save_output_msg_with_topk.cc │ ├── save_with_output.cc │ ├── save_with_output_msg.cc │ ├── save_with_output_msg.h │ ├── scaled_gemm_f8_i4_f16_gemm.cu │ ├── scaled_gemm_f8_i4_f16_weight_quantize.cu │ ├── seqs2seqs.cu │ ├── set_data_ipc.cu │ ├── set_flags.cu │ ├── set_mask_value.cu │ ├── set_value_by_flags_and_idx.cu │ ├── share_external_data.cu │ ├── speculate_decoding │ │ ├── draft_model │ │ │ ├── draft_model_postprocess.cu │ │ │ ├── draft_model_preprocess.cu │ │ │ ├── draft_model_set_value_by_flags.cu │ │ │ ├── draft_model_update.cu │ │ │ ├── eagle_get_hidden_states.cu │ │ │ ├── eagle_get_self_hidden_states.cu │ │ │ ├── hydra_fetch_hidden_states.cu │ │ │ ├── mtp_save_first_token.cc │ │ │ ├── mtp_step_paddle.cu │ │ │ └── ngram_match_mixed.cu │ │ ├── ngram_match.cc │ │ ├── speculate_calcu_accept_ratio.cu │ │ ├── speculate_clear_accept_nums.cu │ │ ├── speculate_get_output.cc │ │ ├── speculate_get_output_padding_offset.cu │ │ ├── speculate_get_output_with_topk.cc │ │ ├── speculate_get_padding_offset.cu │ │ ├── speculate_get_seq_lens_output.cu │ │ ├── speculate_get_token_penalty_multi_scores.cu │ │ ├── speculate_limit_thinking_content_length_v1.cu │ │ ├── speculate_limit_thinking_content_length_v2.cu │ │ ├── speculate_logprob_utils.cu │ │ ├── speculate_msg.h │ │ ├── speculate_save_output.cc │ │ ├── speculate_save_output_with_topk.cc │ │ ├── speculate_schedule_cache.cu │ │ ├── speculate_set_stop_value_multi_seqs.cu │ │ ├── speculate_set_value_by_flags_and_idx.cu │ │ ├── speculate_step.cu │ │ ├── speculate_step_reschedule.cu │ │ ├── speculate_step_system_cache.cu │ │ ├── speculate_update.cu │ │ ├── speculate_update_input_ids_cpu.cc │ │ ├── speculate_verify.cu │ │ └── top_p_candidates.cu │ ├── step.cu │ ├── step_reschedule.cu │ ├── step_system_cache.cu │ ├── stop_generation.cu │ ├── stop_generation_multi_ends.cu │ ├── swap_cache.cu │ ├── swap_cache_batch.cu │ ├── system2group.cu │ ├── text_image_gather_scatter.cu │ ├── text_image_index_out.cu │ ├── token_penalty_multi_scores.cu │ ├── token_penalty_only_once.cu │ ├── token_transfer.hpp │ ├── transfer_output.cc │ ├── tune_cublaslt_gemm.cu │ ├── unset_data_ipc.cu │ ├── update_attn_mask_offsets.cu │ ├── update_inputs.cu │ ├── update_inputs_beam.cu │ ├── update_inputs_v1.cu │ ├── update_split_fuse_input.cu │ ├── w4afp8_gemm │ │ ├── kernel_traits.h │ │ ├── mainloop_fwd.h │ │ ├── utils.hpp │ │ ├── w4afp8_gemm.cu │ │ ├── w4afp8_gemm.h │ │ ├── w4afp8_gemm_kernel.hpp │ │ ├── weight_kernel.hpp │ │ └── weight_scale_kernel.hpp │ └── wfp8afp8_sparse_gemm │ │ ├── kernel_traits.h │ │ ├── mainloop_fwd.h │ │ ├── utils.hpp │ │ ├── w8a8_sparse_gemm_kernel.hpp │ │ ├── wfp8afp8_sparse_gemm.cu │ │ └── wfp8afp8_sparse_gemm_weight.cu ├── iluvatar_ops │ ├── fused_moe_helper.h │ ├── fused_moe_imp_op.h │ ├── fused_moe_op.h │ ├── mixed_fused_attn.cu │ ├── moe_dispatch.cu │ ├── moe_reduce.cu │ ├── paged_attn.cu │ ├── prefill_fused_attn.cu │ ├── runtime │ │ ├── iluvatar_context.cc │ │ └── iluvatar_context.h │ └── w8a16_group_gemm.cu ├── metax_ops │ ├── apply_rope.cu │ ├── fused_moe.cu │ ├── fused_moe_helper.h │ ├── fused_moe_imp_op.h │ ├── fused_moe_op.h │ ├── mc_fused_moe_helper.h │ ├── moe_dispatch.cu │ ├── moe_ffn.cu │ └── moe_reduce.cu ├── setup_ops.py ├── setup_ops_cpu.py ├── utils │ ├── auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py │ ├── auto_gen_fp8_fp8_dual_gemm_fused_kernels.py │ ├── auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py │ ├── auto_gen_fp8_fp8_gemm_fused_kernels.py │ ├── auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py │ ├── auto_gen_template_instantiation.py │ ├── auto_gen_visitor_fp8_gemm_fused_kernels.py │ ├── auto_gen_w4afp8_gemm_kernel.py │ └── auto_gen_wfp8afp8_sparse_gemm_kernel.py └── xpu_ops │ ├── build.sh │ ├── download_dependencies.sh │ ├── setup_ops.py │ ├── src │ ├── ops │ │ ├── adjust_batch.cc │ │ ├── block_attn.cc │ │ ├── device │ │ │ ├── get_context_gm_max_mem_demand.cc │ │ │ ├── get_free_global_memory.cc │ │ │ ├── get_total_global_memory.cc │ │ │ └── get_used_global_memory.cc │ │ ├── fused_rms_norm.cc │ │ ├── gather_next_token.cc │ │ ├── get_img_boundaries.cc │ │ ├── get_infer_param.cc │ │ ├── get_output.cc │ │ ├── get_padding_offset.cc │ │ ├── get_token_penalty_multi_scores.cc │ │ ├── init_signal_layerwise.cc │ │ ├── limit_thinking_content_length_v1.cc │ │ ├── limit_thinking_content_length_v2.cc │ │ ├── moe_ep_combine.cc │ │ ├── moe_ep_dispatch.cc │ │ ├── moe_expert_ffn.cc │ │ ├── moe_layer.cc │ │ ├── moe_redundant_topk_select.cc │ │ ├── moe_topk_select.cc │ │ ├── msg_utils.h │ │ ├── mtp │ │ │ ├── draft_model_postprocess.cc │ │ │ ├── draft_model_preprocess.cc │ │ │ ├── draft_model_update.cc │ │ │ ├── eagle_get_hidden_states.cc │ │ │ ├── eagle_get_self_hidden_states.cc │ │ │ ├── mtp_save_first_token.cc │ │ │ ├── mtp_step_paddle.cc │ │ │ ├── speculate_clear_accept_nums.cc │ │ │ ├── speculate_get_output.cc │ │ │ ├── speculate_get_output_padding_offset.cc │ │ │ ├── speculate_get_padding_offset.cc │ │ │ ├── speculate_get_seq_lens_output.cc │ │ │ ├── speculate_msg.h │ │ │ ├── speculate_rebuild_append_padding.cc │ │ │ ├── speculate_save_output.cc │ │ │ ├── speculate_set_stop_value_multi_seqs.cc │ │ │ ├── speculate_set_value_by_flags.cc │ │ │ ├── speculate_step_paddle.cc │ │ │ ├── speculate_step_reschedule.cc │ │ │ ├── speculate_token_penalty_multi_scores.cc │ │ │ ├── speculate_update_input_ids_cpu.cc │ │ │ ├── speculate_update_v3.cc │ │ │ ├── speculate_verify.cc │ │ │ └── top_p_candidates.cc │ │ ├── open_shm_and_get_meta_signal.cc │ │ ├── pybind │ │ │ ├── alloc_cache_pinned.cc │ │ │ ├── cachekv_signal_thread_worker.cc │ │ │ ├── cachekv_signal_thread_worker.h │ │ │ ├── get_peermem_addr.cc │ │ │ ├── profiler.cc │ │ │ ├── pybind.cc │ │ │ └── pybind.h │ │ ├── read_data_ipc.cc │ │ ├── recover_decode_task.cc │ │ ├── remote_cache_kv_ipc.cc │ │ ├── remote_cache_kv_ipc.h │ │ ├── save_with_output_msg.cc │ │ ├── set_data_ipc.cc │ │ ├── set_value_by_flags_and_idx.cc │ │ ├── share_external_data.cc │ │ ├── step.cc │ │ ├── stop_generation_multi_ends.cc │ │ ├── swap_cache_batch.cc │ │ ├── text_image_gather_scatter.cc │ │ ├── text_image_index_out.cc │ │ ├── update_inputs.cc │ │ ├── update_inputs_v1.cc │ │ ├── utility │ │ │ ├── debug.cc │ │ │ ├── debug.h │ │ │ ├── env.cc │ │ │ ├── env.h │ │ │ ├── helper.h │ │ │ ├── logging.cc │ │ │ └── logging.h │ │ ├── weight_only_linear.cc │ │ ├── weight_quantize_xpu.cc │ │ └── xpu_multiprocess.h │ └── plugin │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── build.sh │ │ ├── include │ │ └── xpu │ │ │ └── plugin.h │ │ └── src │ │ ├── kernel │ │ └── kunlun3cpp │ │ │ ├── ban_bad_words.xpu │ │ │ ├── eb_adjust_batch.xpu │ │ │ ├── eb_gather_next_token.xpu │ │ │ ├── free_and_dispatch_block.xpu │ │ │ ├── get_padding_offset.xpu │ │ │ ├── limit_thinking_content_length_v1.xpu │ │ │ ├── limit_thinking_content_length_v2.xpu │ │ │ ├── min_length_logits_process.xpu │ │ │ ├── mtp_kernel │ │ │ ├── compute_order.xpu │ │ │ ├── compute_self_order.xpu │ │ │ ├── draft_model_postprocess.xpu │ │ │ ├── draft_model_preprocess.xpu │ │ │ ├── draft_model_update.xpu │ │ │ ├── eb_mtp_gather_next_token.xpu │ │ │ ├── mtp_free_and_dispatch_block.xpu │ │ │ ├── rebuild_append_padding.xpu │ │ │ ├── rebuild_hidde_states.xpu │ │ │ ├── rebuild_self_hidde_states.xpu │ │ │ ├── speculate_ban_bad_words.xpu │ │ │ ├── speculate_clear_accept_nums.xpu │ │ │ ├── speculate_free_and_dispatch_block.xpu │ │ │ ├── speculate_free_and_reschedule.xpu │ │ │ ├── speculate_get_output_padding_offset.xpu │ │ │ ├── speculate_get_padding_offset.xpu │ │ │ ├── speculate_get_seq_lens_output.xpu │ │ │ ├── speculate_min_length_logits_process.xpu │ │ │ ├── speculate_recover_block.xpu │ │ │ ├── speculate_set_stop_value_multi_seqs.xpu │ │ │ ├── speculate_set_value_by_flags.xpu │ │ │ ├── speculate_update_repeat_times.xpu │ │ │ ├── speculate_update_v3.xpu │ │ │ ├── speculate_update_value_by_repeat_times.xpu │ │ │ ├── speculate_verify.xpu │ │ │ └── top_p_candidates.xpu │ │ │ ├── quant2d_per_channel.xpu │ │ │ ├── recover_block.xpu │ │ │ ├── recover_decode_task.xpu │ │ │ ├── remove_padding.xpu │ │ │ ├── set_stop_value_multi_ends.xpu │ │ │ ├── set_value_by_flags_and_idx.xpu │ │ │ ├── text_image_gather_scatter.xpu │ │ │ ├── text_image_index_out.xpu │ │ │ ├── update_inputs.xpu │ │ │ ├── update_inputs_v1.xpu │ │ │ ├── update_repeat_times.xpu │ │ │ └── update_value_by_repeat_times.xpu │ │ ├── linker.specs │ │ └── wrapper │ │ ├── eb_adjust_batch.cpp │ │ ├── eb_gather_next_token.cpp │ │ ├── free_and_dispatch_block.cpp │ │ ├── get_padding_offset.cpp │ │ ├── limit_thinking_content_length_v1.cpp │ │ ├── limit_thinking_content_length_v2.cpp │ │ ├── mtp_wrapper │ │ ├── compute_order.cpp │ │ ├── compute_self_order.cpp │ │ ├── draft_model_postprocess.cpp │ │ ├── draft_model_preprocess.cpp │ │ ├── draft_model_update.cpp │ │ ├── eb_mtp_gather_next_token.cpp │ │ ├── mtp_free_and_dispatch_block.cpp │ │ ├── rebuild_hidden_states.cpp │ │ ├── rebuild_self_hidden_states.cpp │ │ ├── speculate_clear_accept_nums.cpp │ │ ├── speculate_free_and_dispatch_block.cpp │ │ ├── speculate_free_and_reschedule.cpp │ │ ├── speculate_get_output_padding_offset.cpp │ │ ├── speculate_get_padding_offset.cpp │ │ ├── speculate_get_seq_lens_output.cpp │ │ ├── speculate_rebuild_append_padding.cpp │ │ ├── speculate_recover_block.cpp │ │ ├── speculate_set_stop_value_multi_seqs.cpp │ │ ├── speculate_set_value_by_flags.cpp │ │ ├── speculate_token_penalty_multi_scores.cpp │ │ ├── speculate_update_v3.cpp │ │ ├── speculate_verify.cpp │ │ └── top_p_candidates.cpp │ │ ├── nn_set_stop_value_multi_ends.cpp │ │ ├── nn_set_value_by_flags_and_idx.cpp │ │ ├── nn_token_penalty_multi_scores.cpp │ │ ├── quant2d_per_channel.cpp │ │ ├── recover_block.cpp │ │ ├── recover_decode_task.cpp │ │ ├── text_image_gather_scatter.cpp │ │ ├── text_image_index_out.cpp │ │ ├── update_inputs.cpp │ │ └── update_inputs_v1.cpp │ └── test │ ├── test_adjust_batch_and_gather_next_token.py │ ├── test_block_attn_prefix_cache.py │ ├── test_draft_model_postprocess.py │ ├── test_draft_model_preprocess.py │ ├── test_draft_model_update.py │ ├── test_eagle_get_hidden_states.py │ ├── test_eagle_get_self_hidden_states.py │ ├── test_fused_rms_norm.py │ ├── test_get_infer_param.py │ ├── test_get_padding_offset.py │ ├── test_get_token_penalty_multi_scores.py │ ├── test_moe_ep_combine.py │ ├── test_moe_ep_dispatch.py │ ├── test_moe_expert_ffn.py │ ├── test_moe_redundant_topk_select.py │ ├── test_moe_topk_select.py │ ├── test_read_data_ipc.py │ ├── test_set_data_ipc.py │ ├── test_set_get_data_ipc.py │ ├── test_set_value_by_flags_and_idx.py │ ├── test_speculate_clear_accept_nums.py │ ├── test_speculate_get_output_padding_offset.py │ ├── test_speculate_get_padding_offset.py │ ├── test_speculate_get_seq_lens_output.py │ ├── test_speculate_get_token_penalty_multi_scores.py │ ├── test_speculate_rebuild_append_padding.py │ ├── test_speculate_set_stop_value_multi_seqs.py │ ├── test_speculate_set_value_by_flags.py │ ├── test_speculate_step.py │ ├── test_speculate_update_v3.py │ ├── test_speculate_verify.py │ ├── test_step.py │ ├── test_stop_generation_multi_ends.py │ ├── test_token_repetition_penalty.py │ ├── test_update_inputs.py │ ├── test_weight_only_linear.py │ └── test_weight_quantize_xpu.py ├── dockerfiles ├── Dockerfile.gpu └── Dockerfile.xpu ├── docs ├── assets │ └── images │ │ ├── favicon.ico │ │ └── logo.jpg ├── benchmark.md ├── best_practices │ ├── ERNIE-4.5-0.3B-Paddle.md │ ├── ERNIE-4.5-21B-A3B-Paddle.md │ ├── ERNIE-4.5-21B-A3B-Thinking.md │ ├── ERNIE-4.5-300B-A47B-Paddle.md │ ├── ERNIE-4.5-VL-28B-A3B-Paddle.md │ ├── ERNIE-4.5-VL-28B-A3B-Thinking.md │ ├── ERNIE-4.5-VL-424B-A47B-Paddle.md │ ├── FAQ.md │ ├── GLM-4-MoE-Text.md │ ├── PaddleOCR-VL-0.9B.md │ └── README.md ├── cli │ ├── README.md │ ├── bench.md │ ├── chat.md │ ├── collect-env.md │ ├── complete.md │ ├── run-batch.md │ ├── serve.md │ └── tokenizer.md ├── features │ ├── chunked_prefill.md │ ├── data_parallel_service.md │ ├── disaggregated.md │ ├── early_stop.md │ ├── graph_optimization.md │ ├── images │ │ ├── GlobalScheduler.png │ │ ├── GraphOptBackendArch.svg │ │ ├── LocalScheduler.png │ │ ├── disaggregated.png │ │ ├── no_scheduler_img.png │ │ ├── plas_inference_union.png │ │ ├── plas_training_distill.png │ │ └── scheduler_img.png │ ├── load_balance.md │ ├── logits_processor.md │ ├── multi-node_deployment.md │ ├── plas_attention.md │ ├── plugins.md │ ├── prefix_caching.md │ ├── reasoning_output.md │ ├── sampling.md │ ├── speculative_decoding.md │ ├── structured_outputs.md │ └── tool_calling.md ├── get_started │ ├── README.md │ ├── ernie-4.5-vl-thinking.md │ ├── ernie-4.5-vl.md │ ├── ernie-4.5.md │ ├── installation │ │ ├── Enflame_gcu.md │ │ ├── README.md │ │ ├── hygon_dcu.md │ │ ├── iluvatar_gpu.md │ │ ├── intel_gaudi.md │ │ ├── kunlunxin_xpu.md │ │ ├── metax_gpu.md │ │ └── nvidia_gpu.md │ ├── quick_start.md │ ├── quick_start_qwen.md │ ├── quick_start_qwen25_vl.md │ └── quick_start_vl.md ├── index.md ├── offline_inference.md ├── online_serving │ ├── README.md │ ├── graceful_shutdown_service.md │ ├── images │ │ └── graceful_shutdown.png │ ├── metrics.md │ └── scheduler.md ├── parameters.md ├── quantization │ ├── README.md │ ├── images │ │ └── wint2.png │ ├── online_quantization.md │ └── wint2.md ├── requirements.txt ├── supported_models.md ├── usage │ ├── code_overview.md │ ├── environment_variables.md │ ├── fastdeploy_unit_test_guide.md │ ├── kunlunxin_xpu_deployment.md │ └── log.md └── zh │ ├── benchmark.md │ ├── best_practices │ ├── ERNIE-4.5-0.3B-Paddle.md │ ├── ERNIE-4.5-21B-A3B-Paddle.md │ ├── ERNIE-4.5-21B-A3B-Thinking.md │ ├── ERNIE-4.5-300B-A47B-Paddle.md │ ├── ERNIE-4.5-VL-28B-A3B-Paddle.md │ ├── ERNIE-4.5-VL-28B-A3B-Thinking.md │ ├── ERNIE-4.5-VL-424B-A47B-Paddle.md │ ├── FAQ.md │ ├── GLM-4-MoE-Text.md │ ├── PaddleOCR-VL-0.9B.md │ └── README.md │ ├── cli │ ├── README.md │ ├── bench.md │ ├── chat.md │ ├── collect-env.md │ ├── complete.md │ ├── run-batch.md │ ├── serve.md │ └── tokenizer.md │ ├── features │ ├── chunked_prefill.md │ ├── data_parallel_service.md │ ├── disaggregated.md │ ├── early_stop.md │ ├── graph_optimization.md │ ├── images │ │ ├── GlobalScheduler.png │ │ ├── GraphOptBackendArch.svg │ │ ├── LocalScheduler.png │ │ ├── disaggregated.png │ │ ├── no_scheduler_img.png │ │ ├── plas_inference_union.png │ │ ├── plas_training_distill.png │ │ └── scheduler_img.png │ ├── load_balance.md │ ├── logits_processor.md │ ├── multi-node_deployment.md │ ├── plas_attention.md │ ├── plugins.md │ ├── prefix_caching.md │ ├── reasoning_output.md │ ├── sampling.md │ ├── speculative_decoding.md │ ├── structured_outputs.md │ └── tool_calling.md │ ├── get_started │ ├── README.md │ ├── ernie-4.5-vl-thinking.md │ ├── ernie-4.5-vl.md │ ├── ernie-4.5.md │ ├── installation │ │ ├── Enflame_gcu.md │ │ ├── README.md │ │ ├── hygon_dcu.md │ │ ├── iluvatar_gpu.md │ │ ├── intel_gaudi.md │ │ ├── kunlunxin_xpu.md │ │ ├── metax_gpu.md │ │ └── nvidia_gpu.md │ ├── quick_start.md │ ├── quick_start_qwen.md │ ├── quick_start_qwen25_vl.md │ └── quick_start_vl.md │ ├── index.md │ ├── offline_inference.md │ ├── online_serving │ ├── README.md │ ├── graceful_shutdown_service.md │ ├── images │ │ └── graceful_shutdown.png │ ├── metrics.md │ └── scheduler.md │ ├── parameters.md │ ├── quantization │ ├── README.md │ ├── images │ │ └── wint2.png │ ├── online_quantization.md │ └── wint2.md │ ├── supported_models.md │ └── usage │ ├── code_overview.md │ ├── environment_variables.md │ ├── fastdeploy_unit_test_guide.md │ ├── kunlunxin_xpu_deployment.md │ └── log.md ├── examples └── splitwise │ ├── README.md │ ├── start_mixed.sh │ ├── start_v0_tp1.sh │ ├── start_v1_tp1.sh │ ├── stop.sh │ ├── test.sh │ └── utils.sh ├── fastdeploy ├── __init__.py ├── benchmarks │ ├── __init__.py │ ├── datasets.py │ ├── latency.py │ ├── lib │ │ ├── __init__.py │ │ ├── endpoint_request_func.py │ │ └── utils.py │ ├── serve.py │ └── throughput.py ├── cache_manager │ ├── __init__.py │ ├── cache_data.py │ ├── cache_messager.py │ ├── cache_metrics.py │ ├── cache_transfer_manager.py │ ├── multimodal_cache_manager.py │ ├── ops.py │ ├── prefix_cache_manager.py │ └── transfer_factory │ │ ├── __init__.py │ │ ├── ipc_cache_transfer.py │ │ ├── kvcache_transfer │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── README_CN.md │ │ ├── include │ │ │ ├── kvcache_connection.h │ │ │ ├── kvcache_rdma.h │ │ │ ├── log.h │ │ │ └── util.h │ │ └── src │ │ │ ├── kvcache_connection.cpp │ │ │ ├── kvcache_rdma.cpp │ │ │ ├── log.cpp │ │ │ └── pybind.cpp │ │ └── rdma_cache_transfer.py ├── collect_env.py ├── config.py ├── demo │ ├── offline_demo.py │ ├── offline_prefix_caching_demo.py │ ├── openai_demo.py │ ├── openai_vl_demo.py │ └── tokenzier_client_demo.py ├── distributed │ ├── __init__.py │ ├── communication.py │ └── custom_all_reduce │ │ ├── __init__.py │ │ ├── cuda_wrapper.py │ │ └── custom_all_reduce.py ├── engine │ ├── __init__.py │ ├── args_utils.py │ ├── async_llm.py │ ├── common_engine.py │ ├── engine.py │ ├── expert_service.py │ ├── kv_cache_interface.py │ ├── pooling_params.py │ ├── request.py │ ├── resource_manager.py │ ├── sampling_params.py │ ├── sched │ │ ├── __init__.py │ │ └── resource_manager_v1.py │ └── tasks.py ├── entrypoints │ ├── __init__.py │ ├── api_server.py │ ├── chat_utils.py │ ├── cli │ │ ├── __init__.py │ │ ├── benchmark │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── eval.py │ │ │ ├── latency.py │ │ │ ├── main.py │ │ │ ├── serve.py │ │ │ └── throughput.py │ │ ├── collect_env.py │ │ ├── main.py │ │ ├── openai.py │ │ ├── run_batch.py │ │ ├── serve.py │ │ ├── tokenizer.py │ │ └── types.py │ ├── engine_client.py │ ├── llm.py │ └── openai │ │ ├── __init__.py │ │ ├── api_server.py │ │ ├── middleware.py │ │ ├── multi_api_server.py │ │ ├── protocol.py │ │ ├── response_processors.py │ │ ├── run_batch.py │ │ ├── serving_chat.py │ │ ├── serving_completion.py │ │ ├── serving_embedding.py │ │ ├── serving_engine.py │ │ ├── serving_models.py │ │ ├── serving_reward.py │ │ ├── test_openai.py │ │ ├── tool_parsers │ │ ├── __init__.py │ │ ├── abstract_tool_parser.py │ │ ├── ernie_45_vl_thinking_tool_parser.py │ │ ├── ernie_x1_tool_parser.py │ │ └── utils.py │ │ ├── usage_calculator.py │ │ └── utils.py ├── envs.py ├── eplb │ ├── __init__.py │ ├── async_expert_loader.py │ ├── eplb.py │ ├── experts_manager.py │ └── utils.py ├── import_ops.py ├── input │ ├── __init__.py │ ├── ernie4_5_processor.py │ ├── ernie4_5_tokenizer.py │ ├── ernie4_5_vl_processor │ │ ├── __init__.py │ │ ├── ernie4_5_vl_processor.py │ │ ├── image_preprocessor │ │ │ ├── __init__.py │ │ │ ├── get_image_preprocessor.py │ │ │ └── image_preprocessor_adaptive.py │ │ ├── process.py │ │ ├── process_video.py │ │ └── utils │ │ │ ├── Roboto-Regular.ttf │ │ │ ├── __init__.py │ │ │ ├── io_utils.py │ │ │ ├── render_timestamp.py │ │ │ └── video_utils.py │ ├── paddleocr_vl_processor │ │ ├── __init__.py │ │ ├── image_processor.py │ │ ├── paddleocr_vl_processor.py │ │ ├── process.py │ │ └── process_video.py │ ├── preprocess.py │ ├── qwen_vl_processor │ │ ├── __init__.py │ │ ├── image_processor.py │ │ ├── process.py │ │ ├── process_video.py │ │ └── qwen_vl_processor.py │ ├── text_processor.py │ ├── tokenzier_client.py │ └── utils.py ├── inter_communicator │ ├── __init__.py │ ├── engine_cache_queue.py │ ├── engine_worker_queue.py │ ├── ipc_signal.py │ ├── ipc_signal_const.py │ ├── zmq_client.py │ └── zmq_server.py ├── logger │ ├── __init__.py │ ├── formatters.py │ ├── handlers.py │ ├── logger.py │ └── setup_logging.py ├── metrics │ ├── __init__.py │ ├── metrics.py │ ├── metrics_middleware.py │ ├── prometheus_multiprocess_setup.py │ ├── stats.py │ └── trace_util.py ├── model_executor │ ├── __init__.py │ ├── forward_meta.py │ ├── graph_optimization │ │ ├── __init__.py │ │ ├── cudagraph_piecewise_backend.py │ │ ├── decorator.py │ │ ├── dynamic_dims_marker.py │ │ ├── graph_optimization_backend.py │ │ └── utils.py │ ├── guided_decoding │ │ ├── __init__.py │ │ ├── base_guided_decoding.py │ │ ├── ernie_tokenizer.py │ │ ├── kernels │ │ │ └── xgrammar_apply_token_bitmask.py │ │ └── xgrammar_backend.py │ ├── layers │ │ ├── __init__.py │ │ ├── activation.py │ │ ├── attention │ │ │ ├── __init__.py │ │ │ ├── append_attn_backend.py │ │ │ ├── attention.py │ │ │ ├── attention_selecter.py │ │ │ ├── base_attention_backend.py │ │ │ ├── block_multihead_attn_backend.py │ │ │ ├── flash_attn_backend.py │ │ │ ├── flash_mask_attn_backend.py │ │ │ ├── iluvatar_attn_backend.py │ │ │ ├── mla_attention_backend.py │ │ │ ├── moba_attention_backend.py │ │ │ ├── native_paddle_backend.py │ │ │ ├── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── append_attention.py │ │ │ │ ├── flash_mask_attention.py │ │ │ │ ├── get_block_shape_and_split_kv_block.py │ │ │ │ ├── gqa_rope_write_cache.py │ │ │ │ ├── init_kv_signal_per_query.py │ │ │ │ ├── init_signal_layerwise.py │ │ │ │ ├── open_shm_and_get_meta_signal.py │ │ │ │ └── pre_cache_len_concat.py │ │ │ ├── utils.py │ │ │ └── xpu_attn_backend.py │ │ ├── backends │ │ │ ├── __init__.py │ │ │ ├── dcu │ │ │ │ ├── __init__.py │ │ │ │ ├── fused_moe_triton_backends.py │ │ │ │ ├── top_p_sampling.py │ │ │ │ ├── triton_moe_kernels.py │ │ │ │ └── weight_only.py │ │ │ ├── gcu │ │ │ │ ├── __init__.py │ │ │ │ ├── attention │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── flash_attn_backend.py │ │ │ │ │ └── mem_efficient_attn_backend.py │ │ │ │ ├── moe │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── fused_moe_method_gcu_backend.py │ │ │ │ └── quantization │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── weight_only.py │ │ │ ├── intel_hpu │ │ │ │ ├── __init__.py │ │ │ │ ├── attention │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── hpu_attn_backend.py │ │ │ │ └── moe │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── fused_moe_hpu_backend.py │ │ │ ├── metax │ │ │ │ ├── __init__.py │ │ │ │ ├── attention │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── flash_attention_interface.py │ │ │ │ │ ├── flash_attn_backend.py │ │ │ │ │ └── mla_attn_metax_backend.py │ │ │ │ └── moe │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fused_moe_cutlass_metax_backend.py │ │ │ │ │ ├── fused_moe_triton_metax_backend.py │ │ │ │ │ └── triton_moe_kernels.py │ │ │ ├── npu │ │ │ │ └── __init__.py │ │ │ └── xpu │ │ │ │ ├── __init__.py │ │ │ │ ├── moe │ │ │ │ ├── __init__.py │ │ │ │ ├── ep.py │ │ │ │ └── fused_moe.py │ │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── kv_cache.py │ │ │ │ └── weight_only.py │ │ │ │ └── utils.py │ │ ├── batch_invariant_ops │ │ │ ├── __init__.py │ │ │ └── batch_invariant_ops.py │ │ ├── embeddings.py │ │ ├── linear.py │ │ ├── lm_head.py │ │ ├── moe │ │ │ ├── __init__.py │ │ │ ├── ep.py │ │ │ ├── fused_moe_backend_base.py │ │ │ ├── fused_moe_cutlass_backend.py │ │ │ ├── fused_moe_deepgemm_backend.py │ │ │ ├── fused_moe_marlin_backend.py │ │ │ ├── fused_moe_triton_backend.py │ │ │ ├── fused_moe_wint2_backend.py │ │ │ ├── moe.py │ │ │ └── triton_moe_kernels.py │ │ ├── mtp_linear.py │ │ ├── normalization.py │ │ ├── pool │ │ │ ├── __init__.py │ │ │ └── metadata.py │ │ ├── pooler.py │ │ ├── quantization │ │ │ ├── __init__.py │ │ │ ├── block_wise_fp8.py │ │ │ ├── kv_cache.py │ │ │ ├── mix_quant.py │ │ │ ├── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── cutlass_scaled_mm.py │ │ │ │ ├── machete_mm.py │ │ │ │ └── scaled_fp8_quant.py │ │ │ ├── quant_base.py │ │ │ ├── tensor_wise_fp8.py │ │ │ ├── w4a8.py │ │ │ ├── w4afp8.py │ │ │ ├── w8a8.py │ │ │ ├── weight_only.py │ │ │ ├── wfp8afp8.py │ │ │ └── wint2.py │ │ ├── rotary_embedding.py │ │ ├── sample │ │ │ ├── __init__.py │ │ │ ├── early_stopper.py │ │ │ ├── meta_data.py │ │ │ ├── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── apply_penalty_multi_scores.py │ │ │ │ ├── speculate_logprob_utils.py │ │ │ │ └── top_k_top_p_sampling.py │ │ │ └── sampler.py │ │ └── utils.py │ ├── load_weight_utils.py │ ├── logits_processor │ │ ├── __init__.py │ │ ├── base.py │ │ └── builtin.py │ ├── model_loader │ │ ├── __init__.py │ │ ├── base_loader.py │ │ ├── default_loader.py │ │ └── default_loader_v1.py │ ├── models │ │ ├── __init__.py │ │ ├── adapters.py │ │ ├── deepseek_v3.py │ │ ├── ernie4_5_moe.py │ │ ├── ernie4_5_mtp.py │ │ ├── ernie4_5_vl │ │ │ ├── __init__.py │ │ │ ├── dfnrope │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ ├── configuration.py │ │ │ │ └── modeling.py │ │ │ ├── dist_utils.py │ │ │ ├── ernie4_5_vl_moe.py │ │ │ ├── image_op.py │ │ │ └── modeling_resampler.py │ │ ├── ernie_vl_rm.py │ │ ├── glm4_moe.py │ │ ├── gpt_oss.py │ │ ├── interfaces_base.py │ │ ├── model_base.py │ │ ├── paddleocr_vl │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── paddleocr_vl.py │ │ │ ├── projector.py │ │ │ ├── siglip.py │ │ │ └── siglip_ops.py │ │ ├── qwen2.py │ │ ├── qwen2_5_vl │ │ │ ├── __init__.py │ │ │ ├── dfnrope │ │ │ │ ├── __init__.py │ │ │ │ ├── activation.py │ │ │ │ ├── configuration.py │ │ │ │ └── modeling.py │ │ │ └── qwen2_5_vl.py │ │ ├── qwen2_rm.py │ │ ├── qwen3.py │ │ ├── qwen3moe.py │ │ ├── tp_utils.py │ │ └── utils.py │ ├── ops │ │ ├── __init__.py │ │ ├── cpu │ │ │ └── __init__.py │ │ ├── gcu │ │ │ └── __init__.py │ │ ├── gpu │ │ │ └── __init__.py │ │ ├── iluvatar │ │ │ ├── __init__.py │ │ │ ├── moe_ops.py │ │ │ └── paged_attention.py │ │ ├── intel_hpu │ │ │ └── __init__.py │ │ ├── npu │ │ │ └── __init__.py │ │ ├── triton_ops │ │ │ ├── __init__.py │ │ │ ├── repetition_early_stop_kernel.py │ │ │ ├── triton_utils.py │ │ │ ├── triton_utils_v2.py │ │ │ └── wint2_fused_moe_kernel.py │ │ └── xpu │ │ │ └── __init__.py │ ├── pre_and_post_process.py │ ├── utils.py │ └── xpu_pre_and_post_process.py ├── multimodal │ ├── __init__.py │ ├── audio.py │ ├── base.py │ ├── hasher.py │ ├── image.py │ ├── registry.py │ ├── utils.py │ └── video.py ├── output │ ├── __init__.py │ ├── pooler.py │ ├── stream_transfer_data.py │ └── token_processor.py ├── platforms │ ├── __init__.py │ ├── base.py │ ├── cpu.py │ ├── cuda.py │ ├── dcu.py │ ├── gcu.py │ ├── iluvatar.py │ ├── intel_hpu.py │ ├── maca.py │ ├── npu.py │ ├── utils.py │ └── xpu.py ├── plugins │ ├── __init__.py │ ├── input_processor │ │ └── __init__.py │ ├── model_register │ │ └── __init__.py │ ├── model_runner │ │ └── __init__.py │ ├── reasoning_parser │ │ └── __init__.py │ ├── token_processor │ │ └── __init__.py │ └── utils.py ├── reasoning │ ├── __init__.py │ ├── abs_reasoning_parsers.py │ ├── ernie_45_vl_thinking_reasoning_parser.py │ ├── ernie_vl_reasoning_parsers.py │ ├── ernie_x1_reasoning_parsers.py │ └── qwen3_reasoning_parsers.py ├── rl │ ├── __init__.py │ ├── dynamic_weight_manager.py │ ├── rollout_config.py │ └── rollout_model.py ├── router │ ├── __init__.py │ ├── launch.py │ ├── router.py │ └── utils.py ├── scheduler │ ├── __init__.py │ ├── config.py │ ├── data.py │ ├── dp_scheduler.py │ ├── global_scheduler.py │ ├── local_scheduler.py │ ├── splitwise_scheduler.py │ ├── storage.py │ ├── utils.py │ └── workers.py ├── spec_decode │ ├── __init__.py │ ├── base.py │ ├── mtp.py │ └── ngram.py ├── splitwise │ ├── __init__.py │ ├── internal_adapter_utils.py │ └── splitwise_connector.py ├── stop.sh ├── test.yaml ├── trace │ ├── __init__.py │ ├── constants.py │ └── trace_logger.py ├── transformer_utils │ ├── __init__.py │ └── config.py ├── utils.py └── worker │ ├── __init__.py │ ├── dcu_model_runner.py │ ├── dcu_worker.py │ ├── eplb.py │ ├── experts_manager.py │ ├── gcu_model_runner.py │ ├── gcu_worker.py │ ├── gpu_model_runner.py │ ├── gpu_worker.py │ ├── hpu_model_runner.py │ ├── hpu_worker.py │ ├── iluvatar_model_runner.py │ ├── iluvatar_worker.py │ ├── metax_model_runner.py │ ├── metax_worker.py │ ├── model_runner_base.py │ ├── output.py │ ├── worker_base.py │ ├── worker_process.py │ ├── xpu_model_runner.py │ └── xpu_worker.py ├── mkdocs.yml ├── pyproject.toml ├── requirements.txt ├── requirements_dcu.txt ├── requirements_iluvatar.txt ├── requirements_metaxgpu.txt ├── scripts ├── .coveragerc ├── CheckPRTemplate.py ├── check_approval.sh ├── check_pr_approval.py ├── codecov.yml ├── coverage_run.sh ├── extract_mtp_weight_from_safetensor.py ├── generate_diff_coverage_xml.py ├── generate_full_coverage_csv.py ├── get_rdma_nics.sh ├── merge_cache_scale.py ├── offline_w4a8.py ├── run_ci_dcu.sh ├── run_ci_gcu.sh ├── run_ci_hpu.sh ├── run_ci_iluvatar.sh ├── run_ci_xpu.sh ├── run_offline_w4a8.sh ├── run_pre_ce.sh ├── run_unittest.sh ├── tune_cublaslt_int8_gemm.py ├── tune_cutlass_fp8_gemm.py ├── tune_scaled_gemm_f8_i4_f16.py ├── unittest_requirement.txt ├── vit_model_split.py └── vit_model_split.sh ├── setup.py ├── tests ├── batch_invariant │ ├── test_batch_invariance_op_addmm.py │ ├── test_batch_invariance_op_logsoftmax.py │ ├── test_batch_invariance_op_mean.py │ └── test_batch_invariance_op_mm.py ├── benchmarks │ ├── lib │ │ ├── test_endpoint_request_func_benchmarks.py │ │ └── test_utils_benchmarks.py │ ├── test_datasets_benchmarks.py │ ├── test_latency_benchmarks.py │ ├── test_serve_benchmarks.py │ └── test_throughput_benchmarks.py ├── cache_manager │ └── test_cache_transfer_manager.py ├── ce │ ├── accuracy_cases │ │ ├── gsm8k.parquet │ │ └── gsm8k.py │ ├── deploy │ │ └── deploy.py │ ├── performance │ │ └── stress_tools.py │ ├── server │ │ ├── core │ │ │ ├── __init__.py │ │ │ ├── logger.py │ │ │ ├── request_template.py │ │ │ └── utils.py │ │ ├── demo.py │ │ ├── requirements.txt │ │ ├── test_DDoS.py │ │ ├── test_base_chat.py │ │ ├── test_compare_top_logprobs.py │ │ ├── test_completions.py │ │ ├── test_evil_cases.py │ │ ├── test_logprobs.py │ │ ├── test_max_concurrency.py │ │ ├── test_max_waiting_time.py │ │ ├── test_params_boundary.py │ │ ├── test_prompt_ids.py │ │ ├── test_repetition_early_stop.py │ │ ├── test_return_token_ids.py │ │ ├── test_seed_usage.py │ │ └── test_stream.py │ └── stable_cases │ │ ├── launch_model.sh │ │ └── run.sh ├── ci_use │ ├── DCU │ │ └── run_ernie.py │ ├── EB_Lite │ │ └── test_EB_Lite_serving.py │ ├── EB_Lite_with_adapter │ │ ├── test_eblite_serving.py │ │ └── zmq_client.py │ ├── EB_VL_Lite │ │ └── test_EB_VL_Lite_serving.py │ ├── GCU │ │ └── run_ernie.py │ ├── GLM-45-AIR │ │ ├── baseline.txt │ │ └── test_rollout_model.py │ ├── HPU │ │ └── run_ernie.py │ ├── Qwen2-7B-Instruct_offline │ │ └── test_Qwen2-7B-Instruct_offline.py │ ├── Qwen2-7B-Instruct_serving │ │ └── test_Qwen2-7B-Instruct_serving.py │ ├── Qwen2_5_VL │ │ └── test_Qwen2_5_VL_serving.py │ ├── Qwen3-MoE │ │ └── test_Qwen3-MoE_serving.py │ ├── XPU_45T │ │ ├── run_45T.py │ │ ├── run_45vl.py │ │ ├── run_ep.py │ │ ├── run_ep_online.py │ │ ├── run_pd.py │ │ └── run_w4a8.py │ ├── iluvatar_UT │ │ ├── run_ernie300B_4layer.py │ │ └── run_ernie_vl_28B.py │ ├── metrics │ │ └── test_metrics.py │ └── utils │ │ └── rollout_model.py ├── conftest.py ├── cov_pytest.ini ├── distributed │ ├── chunked_moe.py │ ├── custom_all_reduce.py │ ├── test_chunked_moe.py │ ├── test_communication.py │ ├── test_cuda_wrapper.py │ └── test_custom_all_reduce.py ├── e2e │ ├── EB_VL_Lite │ │ ├── baseline.txt │ │ └── test_rollout_model.py │ ├── test_DeepSeek_V3_5layers_serving.py │ ├── test_EB_Lite_serving.py │ ├── test_EB_VL_Lite_serving.py │ ├── test_EB_VL_Lite_sot_serving.py │ ├── test_Qwen2-7B-Instruct_serving.py │ ├── test_Qwen2_5_VL_serving.py │ ├── test_Qwen2_5_VL_torch_serving.py │ ├── test_api_key.py │ ├── test_ernie_03b_pd_router_v0.py │ ├── test_ernie_03b_pd_router_v1_ipc.py │ ├── test_ernie_03b_pd_router_v1_rdma_tp1.py │ ├── test_ernie_03b_pd_router_v1_rdma_tp2.py │ ├── test_ernie_03b_pd_splitwise_scheduler.py │ ├── test_ernie_03b_router.py │ ├── test_ernie_21b_mtp.py │ ├── test_fake_Glm45_AIR_serving.py │ ├── test_paddleocr_vl_serving.py │ └── utils │ │ ├── __init__.py │ │ ├── get_rdma_nics.sh │ │ ├── rollout_model.py │ │ └── serving_utils.py ├── engine │ ├── test_async_llm.py │ ├── test_kv_cache_interface.py │ └── test_sampling_params.py ├── entrypoints │ ├── cli │ │ ├── benchmark │ │ │ ├── test_eval.py │ │ │ └── test_throughput.py │ │ ├── test_collect_env_conmmand.py │ │ ├── test_collect_env_script.py │ │ ├── test_main.py │ │ ├── test_openai.py │ │ ├── test_serve.py │ │ ├── test_tokenizer_cli.py │ │ └── test_types.py │ ├── openai │ │ ├── test_api_authentication.py │ │ ├── test_build_sample_logprobs.py │ │ ├── test_chatcompletion_request.py │ │ ├── test_completion_echo.py │ │ ├── test_dealer_connection_manager.py │ │ ├── test_error_response.py │ │ ├── test_finish_reason.py │ │ ├── test_max_streaming_tokens.py │ │ ├── test_metrics_routes.py │ │ ├── test_multi_api_server.py │ │ ├── test_response_processors.py │ │ ├── test_run_batch.py │ │ ├── test_run_batch_proto.py │ │ ├── test_run_batch_subcommand.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_completion.py │ │ ├── test_serving_embedding.py │ │ ├── test_serving_models.py │ │ ├── test_serving_reward.py │ │ ├── test_usage_calculator.py │ │ ├── test_wrap_streaming_generator.py │ │ └── tool_parsers │ │ │ ├── test_ernie_45_vl_thinking_tool_parser.py │ │ │ ├── test_ernie_x1_tool_parser.py │ │ │ └── test_tool_parsers_utils.py │ ├── test_chat.py │ ├── test_engine_client.py │ ├── test_generation.py │ └── test_vllm_run_engine.py ├── eplb │ ├── test_async_expert_loader.py │ ├── test_eplb.py │ ├── test_eplb_utils.py │ └── test_experts_manager.py ├── graph_optimization │ ├── test_cuda_graph_dynamic_subgraph.py │ ├── test_cuda_graph_recapture.py │ ├── test_cuda_graph_spec_decode.py │ ├── test_graph_opt_backend.py │ └── test_static_graph_cuda_graph_split.py ├── input │ ├── test_ernie4_5_processor.py │ ├── test_ernie_processor.py │ ├── test_ernie_vl_processor.py │ ├── test_paddleocr_vl_processor.py │ ├── test_process_video.py │ ├── test_qwen_vl_processor.py │ ├── test_text_processor.py │ └── test_tokenizer_client.py ├── inter_communicator │ └── test_e2w_queue.py ├── layers │ ├── test_activation.py │ ├── test_append_attention.py │ ├── test_append_attention_with_output.py │ ├── test_attention_layer.py │ ├── test_ffn.py │ ├── test_fusedmoe.py │ ├── test_guided_decoding.py │ ├── test_min_sampling.py │ ├── test_moba_attention_backend.py │ ├── test_native_paddle_backend.py │ ├── test_plas_attention.py │ ├── test_quantized_linear.py │ ├── test_repetition_early_stopper.py │ ├── test_sampler.py │ ├── test_speculative_sampler.py │ └── test_w4a8_moe.py ├── logger │ ├── test_formatters.py │ ├── test_handlers.py │ ├── test_logger.py │ └── test_setup_logging.py ├── metrics │ ├── test_metrics.py │ ├── test_metrics_middleware.py │ ├── test_new_metrics.py │ ├── test_prometheus_multiprocess_setup.py │ └── test_trace_util.py ├── model_executor │ ├── guided_decoding │ │ └── test_xgrammar_checker.py │ ├── ops │ │ └── triton_ops │ │ │ ├── test_triton_utils.py │ │ │ └── test_triton_utils_v2.py │ ├── test_ep.py │ ├── test_forward_meta_str.py │ ├── test_logits_processor.py │ ├── test_tensor_wise_fp8.py │ └── test_tp_utils.py ├── model_loader │ ├── test_load_attention.py │ ├── test_load_ernie_vl.py │ ├── test_load_mtp.py │ ├── test_model_cache.py │ ├── test_offline_model.py │ ├── test_torch_model.py │ ├── test_w4a8_model.py │ └── utils.py ├── multimodal │ ├── test_hasher.py │ └── test_multimodal_utils.py ├── operators │ ├── test_air_top_p_sampling.py │ ├── test_cutlass_fp8_fp8_fp8_dual_gemm_fused.py │ ├── test_cutlass_scaled_mm.py │ ├── test_deqant_int8_cpp_extension.py │ ├── test_dequant.py │ ├── test_draft_model_postprocess.py │ ├── test_draft_model_preprocess.py │ ├── test_draft_model_set_value_by_flags.py │ ├── test_draft_model_update.py │ ├── test_dynamic_per_token_scaled_fp8_quant.py │ ├── test_eagle_get_hidden_states.py │ ├── test_eagle_get_self_hidden_states.py │ ├── test_flash_mask_attn.py │ ├── test_fp8_fp8_half_cuda_core_gemm.py │ ├── test_fused_get_rotary_embedding.py │ ├── test_fused_hadamard_quant_fp8.py │ ├── test_fused_moe.py │ ├── test_fused_neox_rope_embedding.py │ ├── test_fused_rotary_position_encoding.py │ ├── test_gelu_tanh.py │ ├── test_get_padding_offset.py │ ├── test_get_position_ids_and_mask_encoder_batch.py │ ├── test_get_token_penalty_multi_scores.py │ ├── test_group_swiglu_with_masked.py │ ├── test_hybrid_mtp_ngram.py │ ├── test_limit_thinking_content_length.py │ ├── test_machete_mm.py │ ├── test_masked_per_token_quant.py │ ├── test_moe_redundant_topk_select.py │ ├── test_moe_top_k_select.py │ ├── test_ngram_match.py │ ├── test_noaux_tc.py │ ├── test_noaux_tc_redundant.py │ ├── test_per_token_quant.py │ ├── test_pre_cache_len_concat.py │ ├── test_rebuild_padding.py │ ├── test_rejection_top_p_sampling.py │ ├── test_scaled_gemm_f8_i4_f16.py │ ├── test_set_value_by_flags_and_idx.py │ ├── test_share_external_data.py │ ├── test_speculate_get_output_padding_offset.py │ ├── test_speculate_get_padding_offset.py │ ├── test_speculate_get_seq_lens_output.py │ ├── test_speculate_get_target_logits.py │ ├── test_speculate_get_token_penalty_multi_scores.py │ ├── test_speculate_insert_first_token.py │ ├── test_speculate_limit_thinking_content_length.py │ ├── test_speculate_set_stop_value_multi_seqs.py │ ├── test_speculate_update.py │ ├── test_speculate_verify.py │ ├── test_speculative_schedule_cache.py │ ├── test_split_fuse.py │ ├── test_stop_generation_multi_ends.py │ ├── test_token_penalty.py │ ├── test_top_k_renorm_probs.py │ ├── test_top_p_candidates.py │ ├── test_tree_mask.py │ ├── test_tritonmoe_preprocess.py │ ├── test_update_attn_mask.py │ ├── test_update_inputs_v1.py │ ├── test_w4afp8_gemm.py │ └── test_wfp8afp8_sparse_gemm.py ├── output │ ├── test_get_save_output_v1.py │ ├── test_pooler.py │ ├── test_process_batch_draft_tokens.py │ ├── test_process_batch_output.py │ ├── test_process_batch_output_use_zmq.py │ ├── test_stream_transfer_data.py │ └── test_token_processor_trace_print.py ├── platforms │ ├── test_platforms.py │ └── test_utils.py ├── plugins │ ├── fd_add_dummy_model │ │ └── __init__.py │ ├── fd_add_dummy_model_runner │ │ └── __init__.py │ ├── setup.py │ └── test_model_registry.py ├── pooling │ ├── test_Qwen3-Embedding_serving.py │ └── test_embedding.py ├── quantization │ ├── test_kv_cache.py │ ├── test_w4a8.py │ └── test_w4afp8.py ├── reasoning │ └── test_reasoning_parser.py ├── scheduler │ ├── test_dp_scheduler.py │ └── test_workers.py ├── splitwise │ ├── test_internal_adapter_utils.py │ └── test_splitwise_connector.py ├── trace │ ├── test_constants.py │ └── test_trace_logger.py ├── utils.py ├── utils │ ├── test_config.py │ ├── test_custom_chat_template.py │ ├── test_download.py │ ├── test_exception_handler.py │ ├── test_run_batch_tools.py │ └── test_version.py ├── v1 │ ├── cache_manager │ │ ├── test_encoder_cache.py │ │ ├── test_prefix_cache.py │ │ └── test_revert_blocks.py │ ├── test_resource_manager_v1.py │ └── test_schedule_output.py └── woker │ ├── test_gpu_prompt_logprobs.py │ └── test_logprobs_output.py └── tools ├── codestyle └── pre_commit.sh ├── deep_gemm_pre-compile ├── README.md ├── generate_config.py ├── pre_compile.py └── pre_compile.sh └── dockerfile ├── Dockerfile.ci ├── docker_build.sh └── requirements_paddle_nv.txt /.clang-format: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.clang-format -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.flake8 -------------------------------------------------------------------------------- /.github/actions/rerun-workflow/action.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/actions/rerun-workflow/action.yml -------------------------------------------------------------------------------- /.github/actions/rerun-workflow/rerun.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/actions/rerun-workflow/rerun.sh -------------------------------------------------------------------------------- /.github/copilot-instructions.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/copilot-instructions.md -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/pull_request_template.md -------------------------------------------------------------------------------- /.github/workflows/CheckPRTemplate.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/CheckPRTemplate.yml -------------------------------------------------------------------------------- /.github/workflows/Codestyle-Check.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/Codestyle-Check.yml -------------------------------------------------------------------------------- /.github/workflows/_accuracy_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_accuracy_test.yml -------------------------------------------------------------------------------- /.github/workflows/_base_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_base_test.yml -------------------------------------------------------------------------------- /.github/workflows/_build_linux.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_build_linux.yml -------------------------------------------------------------------------------- /.github/workflows/_ci_gcu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_ci_gcu.yml -------------------------------------------------------------------------------- /.github/workflows/_ci_image_build.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_ci_image_build.yml -------------------------------------------------------------------------------- /.github/workflows/_clone_linux.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_clone_linux.yml -------------------------------------------------------------------------------- /.github/workflows/_logprob_test_linux.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_logprob_test_linux.yml -------------------------------------------------------------------------------- /.github/workflows/_pre_ce_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_pre_ce_test.yml -------------------------------------------------------------------------------- /.github/workflows/_stable_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_stable_test.yml -------------------------------------------------------------------------------- /.github/workflows/_unit_test_coverage.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/_unit_test_coverage.yml -------------------------------------------------------------------------------- /.github/workflows/approve.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/approve.yml -------------------------------------------------------------------------------- /.github/workflows/ce_job.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/ce_job.yml -------------------------------------------------------------------------------- /.github/workflows/check-bypass.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/check-bypass.yml -------------------------------------------------------------------------------- /.github/workflows/ci_hpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/ci_hpu.yml -------------------------------------------------------------------------------- /.github/workflows/ci_iluvatar.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/ci_iluvatar.yml -------------------------------------------------------------------------------- /.github/workflows/ci_image_update.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/ci_image_update.yml -------------------------------------------------------------------------------- /.github/workflows/ci_xpu.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/ci_xpu.yml -------------------------------------------------------------------------------- /.github/workflows/gh-pages.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/gh-pages.yml -------------------------------------------------------------------------------- /.github/workflows/pr_build_and_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/pr_build_and_test.yml -------------------------------------------------------------------------------- /.github/workflows/publish_job.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/publish_job.yml -------------------------------------------------------------------------------- /.github/workflows/remove-skip-ci-labels.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/remove-skip-ci-labels.yml -------------------------------------------------------------------------------- /.github/workflows/rerun.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.github/workflows/rerun.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.gitmodules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | README_CN.md -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/README_CN.md -------------------------------------------------------------------------------- /README_EN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/README_EN.md -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/README.md -------------------------------------------------------------------------------- /benchmarks/backend_request_func.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/backend_request_func.py -------------------------------------------------------------------------------- /benchmarks/benchmark_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/benchmark_dataset.py -------------------------------------------------------------------------------- /benchmarks/benchmark_mtp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/benchmark_mtp.py -------------------------------------------------------------------------------- /benchmarks/benchmark_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/benchmark_serving.py -------------------------------------------------------------------------------- /benchmarks/benchmark_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/benchmark_utils.py -------------------------------------------------------------------------------- /benchmarks/paddleocr_vl/PaddleOCR-VL.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/paddleocr_vl/PaddleOCR-VL.yaml -------------------------------------------------------------------------------- /benchmarks/paddleocr_vl/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/paddleocr_vl/README.md -------------------------------------------------------------------------------- /benchmarks/paddleocr_vl/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/paddleocr_vl/benchmark.py -------------------------------------------------------------------------------- /benchmarks/quick_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/quick_benchmark.py -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | tqdm 3 | numpy 4 | Pillow 5 | pyyaml 6 | requests 7 | -------------------------------------------------------------------------------- /benchmarks/yaml/GLM45-air-32k-bf16.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/GLM45-air-32k-bf16.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/GLM45-air-32k-wfp8afp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/GLM45-air-32k-wfp8afp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/deepseek-32k-tp8-wint4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/deepseek-32k-tp8-wint4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-128k-wint4-tp1-plas.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-128k-wint4-tp1-plas.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint2-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint2-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-8k-fp8-tp1-dp8_ep.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-8k-fp8-tp1-dp8_ep.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/GLM-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/GLM-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/deepseek-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/deepseek-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/eb45-128k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/eb45-128k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/eb45-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/eb45-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/eb45-vl-128k.yaml: -------------------------------------------------------------------------------- 1 | max_tokens: 131071 2 | -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/eb45-vl-32k.yaml: -------------------------------------------------------------------------------- 1 | max_tokens: 12288 2 | -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/qwen2-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/qwen2-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/qwen3-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/qwen3-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/request.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/request.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/x1-128k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/x1-128k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/request_yaml/x1-32k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/request_yaml/x1-32k.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml -------------------------------------------------------------------------------- /benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml -------------------------------------------------------------------------------- /custom_ops/0001-DeepGEMM-95e81b3.patch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/0001-DeepGEMM-95e81b3.patch -------------------------------------------------------------------------------- /custom_ops/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/MANIFEST.in -------------------------------------------------------------------------------- /custom_ops/cpu_ops/avx_weight_only_fake.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/avx_weight_only_fake.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/get_padding_offset.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/get_padding_offset.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/rebuild_padding.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/rebuild_padding.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/set_value_by_flags.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/set_value_by_flags.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/simd_sort.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/simd_sort.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/simd_sort_fake.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/simd_sort_fake.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/update_inputs.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/update_inputs.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/xft_all_layer_fake.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/xft_all_layer_fake.cc -------------------------------------------------------------------------------- /custom_ops/cpu_ops/xft_greedy_search_fake.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/cpu_ops/xft_greedy_search_fake.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/append_attention.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/append_attention.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/append_attn/mem_util.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/append_attn/mem_util.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/append_attn/utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/append_attn/utils.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/beam_search_softmax.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/beam_search_softmax.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/common/configManager.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/common/configManager.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/common/cudaUtils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/common/cudaUtils.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/common/quantization.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/common/quantization.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/cpp_extensions.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/cpp_extensions.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/cuda_multiprocess.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/cuda_multiprocess.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/dequant_int8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/dequant_int8.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/enforce_generation.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/enforce_generation.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/env.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/env.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/flash_mask_attn/softmax.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/flash_mask_attn/softmax.hpp -------------------------------------------------------------------------------- /custom_ops/gpu_ops/flash_mask_attn/utils.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/flash_mask_attn/utils.hpp -------------------------------------------------------------------------------- /custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/fused_neox_rope_embedding.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/fused_neox_rope_embedding.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/gather_idx.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/gather_idx.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/gelu_tanh.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/gelu_tanh.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_data_ptr_ipc.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_data_ptr_ipc.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_img_boundaries.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_img_boundaries.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_mm_split_fuse.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_mm_split_fuse.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_output.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_output.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_output_ep.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_output_ep.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_output_msg_with_topk.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_output_msg_with_topk.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_padding_offset.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_padding_offset.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/get_padding_offset_system.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/get_padding_offset_system.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/glog/logging.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/glog/logging.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/helper.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/helper.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/helper.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/init_signal_layerwise.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/init_signal_layerwise.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/generate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/generate.py -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/machete_mainloop.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/machete_mainloop.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/machete_mm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/machete_mm.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/machete_prepack_B.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/machete_prepack_B.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/utils/cute_utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/utils/cute_utils.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/machete/utils/scalar_type.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/machete/utils/scalar_type.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/epilogue.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/epilogue.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/kernel_traits.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/kernel_traits.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/mainloop_load.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/mainloop_load.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/mainloop_mma.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/mainloop_mma.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/mla_hopper.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/mla_hopper.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/named_barrier.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/named_barrier.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/mla_attn/utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/mla_attn/utils.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moba_attn/moba_attn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moba_attn/moba_attn.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moba_attn/moba_attn.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moba_attn/moba_attn.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/deepgemm_preprocess.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/fused_moe.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/fused_moe.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/fused_moe_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/fused_moe_helper.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/fused_moe_imp_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/fused_moe_imp_op.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/fused_moe_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/fused_moe_op.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/gptq_marlin_repack.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/gptq_marlin_repack.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_dispatch.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_dispatch.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_expert_ffn_wint2.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_expert_ffn_wint2.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_ffn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_ffn.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_reduce.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_topk_select.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_topk_select.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/moe_wna16_marlin_gemm.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/swigluoai.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/swigluoai.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/swigluoai.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/swigluoai.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/template_config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/template_config.json -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/moe/winx_unzip.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/moe/winx_unzip.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/msg_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/msg_utils.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/ngram_mask.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/ngram_mask.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/noaux_tc.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/noaux_tc.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/noaux_tc_redundant.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/noaux_tc_redundant.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/noauxtc_kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/noauxtc_kernel.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/per_token_quant_fp8.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/per_token_quant_fp8.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/quantization/common.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/quantization/common.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/quantization/common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/quantization/common.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/read_data_ipc.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/read_data_ipc.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/read_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/read_ids.py -------------------------------------------------------------------------------- /custom_ops/gpu_ops/read_temp_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/read_temp_ids.py -------------------------------------------------------------------------------- /custom_ops/gpu_ops/rebuild_padding.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/rebuild_padding.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/recover_decode_task.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/recover_decode_task.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/remote_cache_kv_ipc.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/remote_cache_kv_ipc.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/remote_cache_kv_ipc.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/remote_cache_kv_ipc.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/sample_kernels/sampling.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/sample_kernels/sampling.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/sample_kernels/utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/sample_kernels/utils.cuh -------------------------------------------------------------------------------- /custom_ops/gpu_ops/save_output_msg_with_topk.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/save_output_msg_with_topk.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/save_with_output.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/save_with_output.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/save_with_output_msg.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/save_with_output_msg.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/save_with_output_msg.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/save_with_output_msg.h -------------------------------------------------------------------------------- /custom_ops/gpu_ops/seqs2seqs.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/seqs2seqs.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/set_data_ipc.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/set_data_ipc.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/set_flags.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/set_flags.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/set_mask_value.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/set_mask_value.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/share_external_data.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/share_external_data.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/step.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/step.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/step_reschedule.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/step_reschedule.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/step_system_cache.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/step_system_cache.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/stop_generation.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/stop_generation.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/swap_cache.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/swap_cache.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/swap_cache_batch.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/swap_cache_batch.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/system2group.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/system2group.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/text_image_gather_scatter.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/text_image_gather_scatter.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/text_image_index_out.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/text_image_index_out.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/token_penalty_only_once.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/token_penalty_only_once.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/token_transfer.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/token_transfer.hpp -------------------------------------------------------------------------------- /custom_ops/gpu_ops/transfer_output.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/transfer_output.cc -------------------------------------------------------------------------------- /custom_ops/gpu_ops/tune_cublaslt_gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/tune_cublaslt_gemm.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/unset_data_ipc.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/unset_data_ipc.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/update_attn_mask_offsets.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/update_attn_mask_offsets.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/update_inputs.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/update_inputs.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/update_inputs_beam.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/update_inputs_beam.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/update_inputs_v1.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/update_inputs_v1.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/update_split_fuse_input.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/update_split_fuse_input.cu -------------------------------------------------------------------------------- /custom_ops/gpu_ops/w4afp8_gemm/utils.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/w4afp8_gemm/utils.hpp -------------------------------------------------------------------------------- /custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm.h -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/fused_moe_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/fused_moe_helper.h -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/fused_moe_imp_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/fused_moe_imp_op.h -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/fused_moe_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/fused_moe_op.h -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/mixed_fused_attn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/mixed_fused_attn.cu -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/moe_dispatch.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/moe_dispatch.cu -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/moe_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/moe_reduce.cu -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/paged_attn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/paged_attn.cu -------------------------------------------------------------------------------- /custom_ops/iluvatar_ops/w8a16_group_gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/iluvatar_ops/w8a16_group_gemm.cu -------------------------------------------------------------------------------- /custom_ops/metax_ops/apply_rope.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/apply_rope.cu -------------------------------------------------------------------------------- /custom_ops/metax_ops/fused_moe.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/fused_moe.cu -------------------------------------------------------------------------------- /custom_ops/metax_ops/fused_moe_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/fused_moe_helper.h -------------------------------------------------------------------------------- /custom_ops/metax_ops/fused_moe_imp_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/fused_moe_imp_op.h -------------------------------------------------------------------------------- /custom_ops/metax_ops/fused_moe_op.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/fused_moe_op.h -------------------------------------------------------------------------------- /custom_ops/metax_ops/mc_fused_moe_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/mc_fused_moe_helper.h -------------------------------------------------------------------------------- /custom_ops/metax_ops/moe_dispatch.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/moe_dispatch.cu -------------------------------------------------------------------------------- /custom_ops/metax_ops/moe_ffn.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/moe_ffn.cu -------------------------------------------------------------------------------- /custom_ops/metax_ops/moe_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/metax_ops/moe_reduce.cu -------------------------------------------------------------------------------- /custom_ops/setup_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/setup_ops.py -------------------------------------------------------------------------------- /custom_ops/setup_ops_cpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/setup_ops_cpu.py -------------------------------------------------------------------------------- /custom_ops/xpu_ops/build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/build.sh -------------------------------------------------------------------------------- /custom_ops/xpu_ops/download_dependencies.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/download_dependencies.sh -------------------------------------------------------------------------------- /custom_ops/xpu_ops/setup_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/setup_ops.py -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/adjust_batch.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/adjust_batch.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/block_attn.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/block_attn.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/fused_rms_norm.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/fused_rms_norm.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/get_output.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/get_output.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/moe_ep_combine.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/moe_ep_combine.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/moe_layer.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/moe_layer.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/msg_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/msg_utils.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/pybind/pybind.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/pybind/pybind.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/pybind/pybind.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/pybind/pybind.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/read_data_ipc.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/read_data_ipc.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/set_data_ipc.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/set_data_ipc.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/step.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/step.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/update_inputs.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/update_inputs.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/debug.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/debug.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/debug.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/debug.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/env.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/env.cc -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/env.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/env.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/helper.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/ops/utility/logging.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/ops/utility/logging.h -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/plugin/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/plugin/CMakeLists.txt -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/plugin/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/plugin/README.md -------------------------------------------------------------------------------- /custom_ops/xpu_ops/src/plugin/build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/src/plugin/build.sh -------------------------------------------------------------------------------- /custom_ops/xpu_ops/test/test_set_data_ipc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/test/test_set_data_ipc.py -------------------------------------------------------------------------------- /custom_ops/xpu_ops/test/test_step.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/custom_ops/xpu_ops/test/test_step.py -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.gpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/dockerfiles/Dockerfile.gpu -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.xpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/dockerfiles/Dockerfile.xpu -------------------------------------------------------------------------------- /docs/assets/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/assets/images/favicon.ico -------------------------------------------------------------------------------- /docs/assets/images/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/assets/images/logo.jpg -------------------------------------------------------------------------------- /docs/benchmark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/benchmark.md -------------------------------------------------------------------------------- /docs/best_practices/ERNIE-4.5-0.3B-Paddle.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/best_practices/ERNIE-4.5-0.3B-Paddle.md -------------------------------------------------------------------------------- /docs/best_practices/FAQ.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/best_practices/FAQ.md -------------------------------------------------------------------------------- /docs/best_practices/GLM-4-MoE-Text.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/best_practices/GLM-4-MoE-Text.md -------------------------------------------------------------------------------- /docs/best_practices/PaddleOCR-VL-0.9B.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/best_practices/PaddleOCR-VL-0.9B.md -------------------------------------------------------------------------------- /docs/best_practices/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/best_practices/README.md -------------------------------------------------------------------------------- /docs/cli/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/README.md -------------------------------------------------------------------------------- /docs/cli/bench.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/bench.md -------------------------------------------------------------------------------- /docs/cli/chat.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/chat.md -------------------------------------------------------------------------------- /docs/cli/collect-env.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/collect-env.md -------------------------------------------------------------------------------- /docs/cli/complete.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/complete.md -------------------------------------------------------------------------------- /docs/cli/run-batch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/run-batch.md -------------------------------------------------------------------------------- /docs/cli/serve.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/serve.md -------------------------------------------------------------------------------- /docs/cli/tokenizer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/cli/tokenizer.md -------------------------------------------------------------------------------- /docs/features/chunked_prefill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/chunked_prefill.md -------------------------------------------------------------------------------- /docs/features/data_parallel_service.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/data_parallel_service.md -------------------------------------------------------------------------------- /docs/features/disaggregated.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/disaggregated.md -------------------------------------------------------------------------------- /docs/features/early_stop.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/early_stop.md -------------------------------------------------------------------------------- /docs/features/graph_optimization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/graph_optimization.md -------------------------------------------------------------------------------- /docs/features/images/GlobalScheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/GlobalScheduler.png -------------------------------------------------------------------------------- /docs/features/images/GraphOptBackendArch.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/GraphOptBackendArch.svg -------------------------------------------------------------------------------- /docs/features/images/LocalScheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/LocalScheduler.png -------------------------------------------------------------------------------- /docs/features/images/disaggregated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/disaggregated.png -------------------------------------------------------------------------------- /docs/features/images/no_scheduler_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/no_scheduler_img.png -------------------------------------------------------------------------------- /docs/features/images/scheduler_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/images/scheduler_img.png -------------------------------------------------------------------------------- /docs/features/load_balance.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/load_balance.md -------------------------------------------------------------------------------- /docs/features/logits_processor.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/logits_processor.md -------------------------------------------------------------------------------- /docs/features/multi-node_deployment.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/multi-node_deployment.md -------------------------------------------------------------------------------- /docs/features/plas_attention.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/plas_attention.md -------------------------------------------------------------------------------- /docs/features/plugins.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/plugins.md -------------------------------------------------------------------------------- /docs/features/prefix_caching.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/prefix_caching.md -------------------------------------------------------------------------------- /docs/features/reasoning_output.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/reasoning_output.md -------------------------------------------------------------------------------- /docs/features/sampling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/sampling.md -------------------------------------------------------------------------------- /docs/features/speculative_decoding.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/speculative_decoding.md -------------------------------------------------------------------------------- /docs/features/structured_outputs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/structured_outputs.md -------------------------------------------------------------------------------- /docs/features/tool_calling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/features/tool_calling.md -------------------------------------------------------------------------------- /docs/get_started/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/README.md -------------------------------------------------------------------------------- /docs/get_started/ernie-4.5-vl-thinking.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/ernie-4.5-vl-thinking.md -------------------------------------------------------------------------------- /docs/get_started/ernie-4.5-vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/ernie-4.5-vl.md -------------------------------------------------------------------------------- /docs/get_started/ernie-4.5.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/ernie-4.5.md -------------------------------------------------------------------------------- /docs/get_started/installation/Enflame_gcu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/Enflame_gcu.md -------------------------------------------------------------------------------- /docs/get_started/installation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/README.md -------------------------------------------------------------------------------- /docs/get_started/installation/hygon_dcu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/hygon_dcu.md -------------------------------------------------------------------------------- /docs/get_started/installation/intel_gaudi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/intel_gaudi.md -------------------------------------------------------------------------------- /docs/get_started/installation/metax_gpu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/metax_gpu.md -------------------------------------------------------------------------------- /docs/get_started/installation/nvidia_gpu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/installation/nvidia_gpu.md -------------------------------------------------------------------------------- /docs/get_started/quick_start.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/quick_start.md -------------------------------------------------------------------------------- /docs/get_started/quick_start_qwen.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/quick_start_qwen.md -------------------------------------------------------------------------------- /docs/get_started/quick_start_qwen25_vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/quick_start_qwen25_vl.md -------------------------------------------------------------------------------- /docs/get_started/quick_start_vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/get_started/quick_start_vl.md -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/index.md -------------------------------------------------------------------------------- /docs/offline_inference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/offline_inference.md -------------------------------------------------------------------------------- /docs/online_serving/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/online_serving/README.md -------------------------------------------------------------------------------- /docs/online_serving/metrics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/online_serving/metrics.md -------------------------------------------------------------------------------- /docs/online_serving/scheduler.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/online_serving/scheduler.md -------------------------------------------------------------------------------- /docs/parameters.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/parameters.md -------------------------------------------------------------------------------- /docs/quantization/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/quantization/README.md -------------------------------------------------------------------------------- /docs/quantization/images/wint2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/quantization/images/wint2.png -------------------------------------------------------------------------------- /docs/quantization/online_quantization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/quantization/online_quantization.md -------------------------------------------------------------------------------- /docs/quantization/wint2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/quantization/wint2.md -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/requirements.txt -------------------------------------------------------------------------------- /docs/supported_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/supported_models.md -------------------------------------------------------------------------------- /docs/usage/code_overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/usage/code_overview.md -------------------------------------------------------------------------------- /docs/usage/environment_variables.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/usage/environment_variables.md -------------------------------------------------------------------------------- /docs/usage/fastdeploy_unit_test_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/usage/fastdeploy_unit_test_guide.md -------------------------------------------------------------------------------- /docs/usage/kunlunxin_xpu_deployment.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/usage/kunlunxin_xpu_deployment.md -------------------------------------------------------------------------------- /docs/usage/log.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/usage/log.md -------------------------------------------------------------------------------- /docs/zh/benchmark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/benchmark.md -------------------------------------------------------------------------------- /docs/zh/best_practices/FAQ.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/best_practices/FAQ.md -------------------------------------------------------------------------------- /docs/zh/best_practices/GLM-4-MoE-Text.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/best_practices/GLM-4-MoE-Text.md -------------------------------------------------------------------------------- /docs/zh/best_practices/PaddleOCR-VL-0.9B.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/best_practices/PaddleOCR-VL-0.9B.md -------------------------------------------------------------------------------- /docs/zh/best_practices/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/best_practices/README.md -------------------------------------------------------------------------------- /docs/zh/cli/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/README.md -------------------------------------------------------------------------------- /docs/zh/cli/bench.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/bench.md -------------------------------------------------------------------------------- /docs/zh/cli/chat.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/chat.md -------------------------------------------------------------------------------- /docs/zh/cli/collect-env.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/collect-env.md -------------------------------------------------------------------------------- /docs/zh/cli/complete.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/complete.md -------------------------------------------------------------------------------- /docs/zh/cli/run-batch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/run-batch.md -------------------------------------------------------------------------------- /docs/zh/cli/serve.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/serve.md -------------------------------------------------------------------------------- /docs/zh/cli/tokenizer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/cli/tokenizer.md -------------------------------------------------------------------------------- /docs/zh/features/chunked_prefill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/chunked_prefill.md -------------------------------------------------------------------------------- /docs/zh/features/data_parallel_service.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/data_parallel_service.md -------------------------------------------------------------------------------- /docs/zh/features/disaggregated.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/disaggregated.md -------------------------------------------------------------------------------- /docs/zh/features/early_stop.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/early_stop.md -------------------------------------------------------------------------------- /docs/zh/features/graph_optimization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/graph_optimization.md -------------------------------------------------------------------------------- /docs/zh/features/images/GlobalScheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/images/GlobalScheduler.png -------------------------------------------------------------------------------- /docs/zh/features/images/LocalScheduler.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/images/LocalScheduler.png -------------------------------------------------------------------------------- /docs/zh/features/images/disaggregated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/images/disaggregated.png -------------------------------------------------------------------------------- /docs/zh/features/images/no_scheduler_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/images/no_scheduler_img.png -------------------------------------------------------------------------------- /docs/zh/features/images/scheduler_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/images/scheduler_img.png -------------------------------------------------------------------------------- /docs/zh/features/load_balance.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/load_balance.md -------------------------------------------------------------------------------- /docs/zh/features/logits_processor.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/logits_processor.md -------------------------------------------------------------------------------- /docs/zh/features/multi-node_deployment.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/multi-node_deployment.md -------------------------------------------------------------------------------- /docs/zh/features/plas_attention.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/plas_attention.md -------------------------------------------------------------------------------- /docs/zh/features/plugins.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/plugins.md -------------------------------------------------------------------------------- /docs/zh/features/prefix_caching.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/prefix_caching.md -------------------------------------------------------------------------------- /docs/zh/features/reasoning_output.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/reasoning_output.md -------------------------------------------------------------------------------- /docs/zh/features/sampling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/sampling.md -------------------------------------------------------------------------------- /docs/zh/features/speculative_decoding.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/speculative_decoding.md -------------------------------------------------------------------------------- /docs/zh/features/structured_outputs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/structured_outputs.md -------------------------------------------------------------------------------- /docs/zh/features/tool_calling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/features/tool_calling.md -------------------------------------------------------------------------------- /docs/zh/get_started/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/README.md -------------------------------------------------------------------------------- /docs/zh/get_started/ernie-4.5-vl-thinking.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/ernie-4.5-vl-thinking.md -------------------------------------------------------------------------------- /docs/zh/get_started/ernie-4.5-vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/ernie-4.5-vl.md -------------------------------------------------------------------------------- /docs/zh/get_started/ernie-4.5.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/ernie-4.5.md -------------------------------------------------------------------------------- /docs/zh/get_started/installation/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/installation/README.md -------------------------------------------------------------------------------- /docs/zh/get_started/quick_start.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/quick_start.md -------------------------------------------------------------------------------- /docs/zh/get_started/quick_start_qwen.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/quick_start_qwen.md -------------------------------------------------------------------------------- /docs/zh/get_started/quick_start_qwen25_vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/quick_start_qwen25_vl.md -------------------------------------------------------------------------------- /docs/zh/get_started/quick_start_vl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/get_started/quick_start_vl.md -------------------------------------------------------------------------------- /docs/zh/index.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/index.md -------------------------------------------------------------------------------- /docs/zh/offline_inference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/offline_inference.md -------------------------------------------------------------------------------- /docs/zh/online_serving/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/online_serving/README.md -------------------------------------------------------------------------------- /docs/zh/online_serving/metrics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/online_serving/metrics.md -------------------------------------------------------------------------------- /docs/zh/online_serving/scheduler.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/online_serving/scheduler.md -------------------------------------------------------------------------------- /docs/zh/parameters.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/parameters.md -------------------------------------------------------------------------------- /docs/zh/quantization/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/quantization/README.md -------------------------------------------------------------------------------- /docs/zh/quantization/images/wint2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/quantization/images/wint2.png -------------------------------------------------------------------------------- /docs/zh/quantization/online_quantization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/quantization/online_quantization.md -------------------------------------------------------------------------------- /docs/zh/quantization/wint2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/quantization/wint2.md -------------------------------------------------------------------------------- /docs/zh/supported_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/supported_models.md -------------------------------------------------------------------------------- /docs/zh/usage/code_overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/usage/code_overview.md -------------------------------------------------------------------------------- /docs/zh/usage/environment_variables.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/usage/environment_variables.md -------------------------------------------------------------------------------- /docs/zh/usage/fastdeploy_unit_test_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/usage/fastdeploy_unit_test_guide.md -------------------------------------------------------------------------------- /docs/zh/usage/kunlunxin_xpu_deployment.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/usage/kunlunxin_xpu_deployment.md -------------------------------------------------------------------------------- /docs/zh/usage/log.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/docs/zh/usage/log.md -------------------------------------------------------------------------------- /examples/splitwise/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/README.md -------------------------------------------------------------------------------- /examples/splitwise/start_mixed.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/start_mixed.sh -------------------------------------------------------------------------------- /examples/splitwise/start_v0_tp1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/start_v0_tp1.sh -------------------------------------------------------------------------------- /examples/splitwise/start_v1_tp1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/start_v1_tp1.sh -------------------------------------------------------------------------------- /examples/splitwise/stop.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/stop.sh -------------------------------------------------------------------------------- /examples/splitwise/test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/test.sh -------------------------------------------------------------------------------- /examples/splitwise/utils.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/examples/splitwise/utils.sh -------------------------------------------------------------------------------- /fastdeploy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/__init__.py -------------------------------------------------------------------------------- /fastdeploy/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastdeploy/benchmarks/datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/benchmarks/datasets.py -------------------------------------------------------------------------------- /fastdeploy/benchmarks/latency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/benchmarks/latency.py -------------------------------------------------------------------------------- /fastdeploy/benchmarks/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastdeploy/benchmarks/lib/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/benchmarks/lib/utils.py -------------------------------------------------------------------------------- /fastdeploy/benchmarks/serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/benchmarks/serve.py -------------------------------------------------------------------------------- /fastdeploy/benchmarks/throughput.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/benchmarks/throughput.py -------------------------------------------------------------------------------- /fastdeploy/cache_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/cache_manager/__init__.py -------------------------------------------------------------------------------- /fastdeploy/cache_manager/cache_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/cache_manager/cache_data.py -------------------------------------------------------------------------------- /fastdeploy/cache_manager/cache_messager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/cache_manager/cache_messager.py -------------------------------------------------------------------------------- /fastdeploy/cache_manager/cache_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/cache_manager/cache_metrics.py -------------------------------------------------------------------------------- /fastdeploy/cache_manager/ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/cache_manager/ops.py -------------------------------------------------------------------------------- /fastdeploy/collect_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/collect_env.py -------------------------------------------------------------------------------- /fastdeploy/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/config.py -------------------------------------------------------------------------------- /fastdeploy/demo/offline_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/demo/offline_demo.py -------------------------------------------------------------------------------- /fastdeploy/demo/openai_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/demo/openai_demo.py -------------------------------------------------------------------------------- /fastdeploy/demo/openai_vl_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/demo/openai_vl_demo.py -------------------------------------------------------------------------------- /fastdeploy/demo/tokenzier_client_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/demo/tokenzier_client_demo.py -------------------------------------------------------------------------------- /fastdeploy/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/distributed/__init__.py -------------------------------------------------------------------------------- /fastdeploy/distributed/communication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/distributed/communication.py -------------------------------------------------------------------------------- /fastdeploy/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/__init__.py -------------------------------------------------------------------------------- /fastdeploy/engine/args_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/args_utils.py -------------------------------------------------------------------------------- /fastdeploy/engine/async_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/async_llm.py -------------------------------------------------------------------------------- /fastdeploy/engine/common_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/common_engine.py -------------------------------------------------------------------------------- /fastdeploy/engine/engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/engine.py -------------------------------------------------------------------------------- /fastdeploy/engine/expert_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/expert_service.py -------------------------------------------------------------------------------- /fastdeploy/engine/kv_cache_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/kv_cache_interface.py -------------------------------------------------------------------------------- /fastdeploy/engine/pooling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/pooling_params.py -------------------------------------------------------------------------------- /fastdeploy/engine/request.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/request.py -------------------------------------------------------------------------------- /fastdeploy/engine/resource_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/resource_manager.py -------------------------------------------------------------------------------- /fastdeploy/engine/sampling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/sampling_params.py -------------------------------------------------------------------------------- /fastdeploy/engine/sched/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/sched/__init__.py -------------------------------------------------------------------------------- /fastdeploy/engine/tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/engine/tasks.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/__init__.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/api_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/api_server.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/chat_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/chat_utils.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/__init__.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/benchmark/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/benchmark/base.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/benchmark/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/benchmark/eval.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/benchmark/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/benchmark/main.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/collect_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/collect_env.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/main.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/openai.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/run_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/run_batch.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/serve.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/tokenizer.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/cli/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/cli/types.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/engine_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/engine_client.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/llm.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/api_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/api_server.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/middleware.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/middleware.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/protocol.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/run_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/run_batch.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/test_openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/test_openai.py -------------------------------------------------------------------------------- /fastdeploy/entrypoints/openai/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/entrypoints/openai/utils.py -------------------------------------------------------------------------------- /fastdeploy/envs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/envs.py -------------------------------------------------------------------------------- /fastdeploy/eplb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/eplb/__init__.py -------------------------------------------------------------------------------- /fastdeploy/eplb/async_expert_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/eplb/async_expert_loader.py -------------------------------------------------------------------------------- /fastdeploy/eplb/eplb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/eplb/eplb.py -------------------------------------------------------------------------------- /fastdeploy/eplb/experts_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/eplb/experts_manager.py -------------------------------------------------------------------------------- /fastdeploy/eplb/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/eplb/utils.py -------------------------------------------------------------------------------- /fastdeploy/import_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/import_ops.py -------------------------------------------------------------------------------- /fastdeploy/input/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/__init__.py -------------------------------------------------------------------------------- /fastdeploy/input/ernie4_5_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/ernie4_5_processor.py -------------------------------------------------------------------------------- /fastdeploy/input/ernie4_5_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/ernie4_5_tokenizer.py -------------------------------------------------------------------------------- /fastdeploy/input/preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/preprocess.py -------------------------------------------------------------------------------- /fastdeploy/input/text_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/text_processor.py -------------------------------------------------------------------------------- /fastdeploy/input/tokenzier_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/tokenzier_client.py -------------------------------------------------------------------------------- /fastdeploy/input/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/input/utils.py -------------------------------------------------------------------------------- /fastdeploy/inter_communicator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/inter_communicator/__init__.py -------------------------------------------------------------------------------- /fastdeploy/inter_communicator/ipc_signal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/inter_communicator/ipc_signal.py -------------------------------------------------------------------------------- /fastdeploy/inter_communicator/zmq_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/inter_communicator/zmq_client.py -------------------------------------------------------------------------------- /fastdeploy/inter_communicator/zmq_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/inter_communicator/zmq_server.py -------------------------------------------------------------------------------- /fastdeploy/logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastdeploy/logger/formatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/logger/formatters.py -------------------------------------------------------------------------------- /fastdeploy/logger/handlers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/logger/handlers.py -------------------------------------------------------------------------------- /fastdeploy/logger/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/logger/logger.py -------------------------------------------------------------------------------- /fastdeploy/logger/setup_logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/logger/setup_logging.py -------------------------------------------------------------------------------- /fastdeploy/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/metrics/__init__.py -------------------------------------------------------------------------------- /fastdeploy/metrics/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/metrics/metrics.py -------------------------------------------------------------------------------- /fastdeploy/metrics/metrics_middleware.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/metrics/metrics_middleware.py -------------------------------------------------------------------------------- /fastdeploy/metrics/stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/metrics/stats.py -------------------------------------------------------------------------------- /fastdeploy/metrics/trace_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/metrics/trace_util.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/__init__.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/forward_meta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/forward_meta.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/linear.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/lm_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/lm_head.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/moe/ep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/moe/ep.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/moe/moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/moe/moe.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/pooler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/pooler.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/layers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/layers/utils.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/__init__.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/adapters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/adapters.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/glm4_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/glm4_moe.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/gpt_oss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/gpt_oss.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/qwen2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/qwen2.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/qwen2_rm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/qwen2_rm.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/qwen3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/qwen3.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/qwen3moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/qwen3moe.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/tp_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/tp_utils.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/models/utils.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/ops/__init__.py -------------------------------------------------------------------------------- /fastdeploy/model_executor/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/model_executor/utils.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/__init__.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/audio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/audio.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/base.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/hasher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/hasher.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/image.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/registry.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/utils.py -------------------------------------------------------------------------------- /fastdeploy/multimodal/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/multimodal/video.py -------------------------------------------------------------------------------- /fastdeploy/output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/output/__init__.py -------------------------------------------------------------------------------- /fastdeploy/output/pooler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/output/pooler.py -------------------------------------------------------------------------------- /fastdeploy/output/stream_transfer_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/output/stream_transfer_data.py -------------------------------------------------------------------------------- /fastdeploy/output/token_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/output/token_processor.py -------------------------------------------------------------------------------- /fastdeploy/platforms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/__init__.py -------------------------------------------------------------------------------- /fastdeploy/platforms/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/base.py -------------------------------------------------------------------------------- /fastdeploy/platforms/cpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/cpu.py -------------------------------------------------------------------------------- /fastdeploy/platforms/cuda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/cuda.py -------------------------------------------------------------------------------- /fastdeploy/platforms/dcu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/dcu.py -------------------------------------------------------------------------------- /fastdeploy/platforms/gcu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/gcu.py -------------------------------------------------------------------------------- /fastdeploy/platforms/iluvatar.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/iluvatar.py -------------------------------------------------------------------------------- /fastdeploy/platforms/intel_hpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/intel_hpu.py -------------------------------------------------------------------------------- /fastdeploy/platforms/maca.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/maca.py -------------------------------------------------------------------------------- /fastdeploy/platforms/npu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/npu.py -------------------------------------------------------------------------------- /fastdeploy/platforms/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/utils.py -------------------------------------------------------------------------------- /fastdeploy/platforms/xpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/platforms/xpu.py -------------------------------------------------------------------------------- /fastdeploy/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/plugins/__init__.py -------------------------------------------------------------------------------- /fastdeploy/plugins/model_runner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/plugins/model_runner/__init__.py -------------------------------------------------------------------------------- /fastdeploy/plugins/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/plugins/utils.py -------------------------------------------------------------------------------- /fastdeploy/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/reasoning/__init__.py -------------------------------------------------------------------------------- /fastdeploy/rl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/rl/__init__.py -------------------------------------------------------------------------------- /fastdeploy/rl/dynamic_weight_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/rl/dynamic_weight_manager.py -------------------------------------------------------------------------------- /fastdeploy/rl/rollout_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/rl/rollout_config.py -------------------------------------------------------------------------------- /fastdeploy/rl/rollout_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/rl/rollout_model.py -------------------------------------------------------------------------------- /fastdeploy/router/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/router/__init__.py -------------------------------------------------------------------------------- /fastdeploy/router/launch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/router/launch.py -------------------------------------------------------------------------------- /fastdeploy/router/router.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/router/router.py -------------------------------------------------------------------------------- /fastdeploy/router/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/router/utils.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/__init__.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/config.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/data.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/dp_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/dp_scheduler.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/global_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/global_scheduler.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/local_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/local_scheduler.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/splitwise_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/splitwise_scheduler.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/storage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/storage.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/utils.py -------------------------------------------------------------------------------- /fastdeploy/scheduler/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/scheduler/workers.py -------------------------------------------------------------------------------- /fastdeploy/spec_decode/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/spec_decode/__init__.py -------------------------------------------------------------------------------- /fastdeploy/spec_decode/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/spec_decode/base.py -------------------------------------------------------------------------------- /fastdeploy/spec_decode/mtp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/spec_decode/mtp.py -------------------------------------------------------------------------------- /fastdeploy/spec_decode/ngram.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/spec_decode/ngram.py -------------------------------------------------------------------------------- /fastdeploy/splitwise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/splitwise/__init__.py -------------------------------------------------------------------------------- /fastdeploy/splitwise/splitwise_connector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/splitwise/splitwise_connector.py -------------------------------------------------------------------------------- /fastdeploy/stop.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/stop.sh -------------------------------------------------------------------------------- /fastdeploy/test.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/test.yaml -------------------------------------------------------------------------------- /fastdeploy/trace/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/trace/__init__.py -------------------------------------------------------------------------------- /fastdeploy/trace/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/trace/constants.py -------------------------------------------------------------------------------- /fastdeploy/trace/trace_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/trace/trace_logger.py -------------------------------------------------------------------------------- /fastdeploy/transformer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/transformer_utils/__init__.py -------------------------------------------------------------------------------- /fastdeploy/transformer_utils/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/transformer_utils/config.py -------------------------------------------------------------------------------- /fastdeploy/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/utils.py -------------------------------------------------------------------------------- /fastdeploy/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/__init__.py -------------------------------------------------------------------------------- /fastdeploy/worker/dcu_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/dcu_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/dcu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/dcu_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/eplb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/eplb.py -------------------------------------------------------------------------------- /fastdeploy/worker/experts_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/experts_manager.py -------------------------------------------------------------------------------- /fastdeploy/worker/gcu_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/gcu_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/gcu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/gcu_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/gpu_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/gpu_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/gpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/gpu_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/hpu_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/hpu_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/hpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/hpu_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/iluvatar_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/iluvatar_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/iluvatar_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/iluvatar_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/metax_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/metax_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/metax_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/metax_worker.py -------------------------------------------------------------------------------- /fastdeploy/worker/model_runner_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/model_runner_base.py -------------------------------------------------------------------------------- /fastdeploy/worker/output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/output.py -------------------------------------------------------------------------------- /fastdeploy/worker/worker_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/worker_base.py -------------------------------------------------------------------------------- /fastdeploy/worker/worker_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/worker_process.py -------------------------------------------------------------------------------- /fastdeploy/worker/xpu_model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/xpu_model_runner.py -------------------------------------------------------------------------------- /fastdeploy/worker/xpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/fastdeploy/worker/xpu_worker.py -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/mkdocs.yml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirements_dcu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/requirements_dcu.txt -------------------------------------------------------------------------------- /requirements_iluvatar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/requirements_iluvatar.txt -------------------------------------------------------------------------------- /requirements_metaxgpu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/requirements_metaxgpu.txt -------------------------------------------------------------------------------- /scripts/.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/.coveragerc -------------------------------------------------------------------------------- /scripts/CheckPRTemplate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/CheckPRTemplate.py -------------------------------------------------------------------------------- /scripts/check_approval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/check_approval.sh -------------------------------------------------------------------------------- /scripts/check_pr_approval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/check_pr_approval.py -------------------------------------------------------------------------------- /scripts/codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/codecov.yml -------------------------------------------------------------------------------- /scripts/coverage_run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/coverage_run.sh -------------------------------------------------------------------------------- /scripts/generate_diff_coverage_xml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/generate_diff_coverage_xml.py -------------------------------------------------------------------------------- /scripts/generate_full_coverage_csv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/generate_full_coverage_csv.py -------------------------------------------------------------------------------- /scripts/get_rdma_nics.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/get_rdma_nics.sh -------------------------------------------------------------------------------- /scripts/merge_cache_scale.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/merge_cache_scale.py -------------------------------------------------------------------------------- /scripts/offline_w4a8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/offline_w4a8.py -------------------------------------------------------------------------------- /scripts/run_ci_dcu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_ci_dcu.sh -------------------------------------------------------------------------------- /scripts/run_ci_gcu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_ci_gcu.sh -------------------------------------------------------------------------------- /scripts/run_ci_hpu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_ci_hpu.sh -------------------------------------------------------------------------------- /scripts/run_ci_iluvatar.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_ci_iluvatar.sh -------------------------------------------------------------------------------- /scripts/run_ci_xpu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_ci_xpu.sh -------------------------------------------------------------------------------- /scripts/run_offline_w4a8.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_offline_w4a8.sh -------------------------------------------------------------------------------- /scripts/run_pre_ce.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_pre_ce.sh -------------------------------------------------------------------------------- /scripts/run_unittest.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/run_unittest.sh -------------------------------------------------------------------------------- /scripts/tune_cublaslt_int8_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/tune_cublaslt_int8_gemm.py -------------------------------------------------------------------------------- /scripts/tune_cutlass_fp8_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/tune_cutlass_fp8_gemm.py -------------------------------------------------------------------------------- /scripts/tune_scaled_gemm_f8_i4_f16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/tune_scaled_gemm_f8_i4_f16.py -------------------------------------------------------------------------------- /scripts/unittest_requirement.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/unittest_requirement.txt -------------------------------------------------------------------------------- /scripts/vit_model_split.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/vit_model_split.py -------------------------------------------------------------------------------- /scripts/vit_model_split.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/scripts/vit_model_split.sh -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/setup.py -------------------------------------------------------------------------------- /tests/benchmarks/test_datasets_benchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/benchmarks/test_datasets_benchmarks.py -------------------------------------------------------------------------------- /tests/benchmarks/test_latency_benchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/benchmarks/test_latency_benchmarks.py -------------------------------------------------------------------------------- /tests/benchmarks/test_serve_benchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/benchmarks/test_serve_benchmarks.py -------------------------------------------------------------------------------- /tests/ce/accuracy_cases/gsm8k.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/accuracy_cases/gsm8k.parquet -------------------------------------------------------------------------------- /tests/ce/accuracy_cases/gsm8k.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/accuracy_cases/gsm8k.py -------------------------------------------------------------------------------- /tests/ce/deploy/deploy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/deploy/deploy.py -------------------------------------------------------------------------------- /tests/ce/performance/stress_tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/performance/stress_tools.py -------------------------------------------------------------------------------- /tests/ce/server/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/core/__init__.py -------------------------------------------------------------------------------- /tests/ce/server/core/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/core/logger.py -------------------------------------------------------------------------------- /tests/ce/server/core/request_template.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/core/request_template.py -------------------------------------------------------------------------------- /tests/ce/server/core/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/core/utils.py -------------------------------------------------------------------------------- /tests/ce/server/demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/demo.py -------------------------------------------------------------------------------- /tests/ce/server/requirements.txt: -------------------------------------------------------------------------------- 1 | sympy 2 | tqdm 3 | openai 4 | datasets 5 | -------------------------------------------------------------------------------- /tests/ce/server/test_DDoS.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_DDoS.py -------------------------------------------------------------------------------- /tests/ce/server/test_base_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_base_chat.py -------------------------------------------------------------------------------- /tests/ce/server/test_compare_top_logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_compare_top_logprobs.py -------------------------------------------------------------------------------- /tests/ce/server/test_completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_completions.py -------------------------------------------------------------------------------- /tests/ce/server/test_evil_cases.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_evil_cases.py -------------------------------------------------------------------------------- /tests/ce/server/test_logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_logprobs.py -------------------------------------------------------------------------------- /tests/ce/server/test_max_concurrency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_max_concurrency.py -------------------------------------------------------------------------------- /tests/ce/server/test_max_waiting_time.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_max_waiting_time.py -------------------------------------------------------------------------------- /tests/ce/server/test_params_boundary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_params_boundary.py -------------------------------------------------------------------------------- /tests/ce/server/test_prompt_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_prompt_ids.py -------------------------------------------------------------------------------- /tests/ce/server/test_return_token_ids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_return_token_ids.py -------------------------------------------------------------------------------- /tests/ce/server/test_seed_usage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_seed_usage.py -------------------------------------------------------------------------------- /tests/ce/server/test_stream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/server/test_stream.py -------------------------------------------------------------------------------- /tests/ce/stable_cases/launch_model.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/stable_cases/launch_model.sh -------------------------------------------------------------------------------- /tests/ce/stable_cases/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ce/stable_cases/run.sh -------------------------------------------------------------------------------- /tests/ci_use/DCU/run_ernie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/DCU/run_ernie.py -------------------------------------------------------------------------------- /tests/ci_use/EB_Lite/test_EB_Lite_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/EB_Lite/test_EB_Lite_serving.py -------------------------------------------------------------------------------- /tests/ci_use/GCU/run_ernie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/GCU/run_ernie.py -------------------------------------------------------------------------------- /tests/ci_use/GLM-45-AIR/baseline.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/GLM-45-AIR/baseline.txt -------------------------------------------------------------------------------- /tests/ci_use/HPU/run_ernie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/HPU/run_ernie.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_45T.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_45T.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_45vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_45vl.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_ep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_ep.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_ep_online.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_ep_online.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_pd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_pd.py -------------------------------------------------------------------------------- /tests/ci_use/XPU_45T/run_w4a8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/XPU_45T/run_w4a8.py -------------------------------------------------------------------------------- /tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py -------------------------------------------------------------------------------- /tests/ci_use/metrics/test_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/metrics/test_metrics.py -------------------------------------------------------------------------------- /tests/ci_use/utils/rollout_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/ci_use/utils/rollout_model.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/cov_pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/cov_pytest.ini -------------------------------------------------------------------------------- /tests/distributed/chunked_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/chunked_moe.py -------------------------------------------------------------------------------- /tests/distributed/custom_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/custom_all_reduce.py -------------------------------------------------------------------------------- /tests/distributed/test_chunked_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/test_chunked_moe.py -------------------------------------------------------------------------------- /tests/distributed/test_communication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/test_communication.py -------------------------------------------------------------------------------- /tests/distributed/test_cuda_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/test_cuda_wrapper.py -------------------------------------------------------------------------------- /tests/distributed/test_custom_all_reduce.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/distributed/test_custom_all_reduce.py -------------------------------------------------------------------------------- /tests/e2e/EB_VL_Lite/baseline.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/EB_VL_Lite/baseline.txt -------------------------------------------------------------------------------- /tests/e2e/EB_VL_Lite/test_rollout_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/EB_VL_Lite/test_rollout_model.py -------------------------------------------------------------------------------- /tests/e2e/test_EB_Lite_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_EB_Lite_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_EB_VL_Lite_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_EB_VL_Lite_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_EB_VL_Lite_sot_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_EB_VL_Lite_sot_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_Qwen2-7B-Instruct_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_Qwen2-7B-Instruct_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_Qwen2_5_VL_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_Qwen2_5_VL_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_Qwen2_5_VL_torch_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_Qwen2_5_VL_torch_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_api_key.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_api_key.py -------------------------------------------------------------------------------- /tests/e2e/test_ernie_03b_pd_router_v0.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_ernie_03b_pd_router_v0.py -------------------------------------------------------------------------------- /tests/e2e/test_ernie_03b_pd_router_v1_ipc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_ernie_03b_pd_router_v1_ipc.py -------------------------------------------------------------------------------- /tests/e2e/test_ernie_03b_router.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_ernie_03b_router.py -------------------------------------------------------------------------------- /tests/e2e/test_ernie_21b_mtp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_ernie_21b_mtp.py -------------------------------------------------------------------------------- /tests/e2e/test_fake_Glm45_AIR_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_fake_Glm45_AIR_serving.py -------------------------------------------------------------------------------- /tests/e2e/test_paddleocr_vl_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/test_paddleocr_vl_serving.py -------------------------------------------------------------------------------- /tests/e2e/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/e2e/utils/get_rdma_nics.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/utils/get_rdma_nics.sh -------------------------------------------------------------------------------- /tests/e2e/utils/rollout_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/utils/rollout_model.py -------------------------------------------------------------------------------- /tests/e2e/utils/serving_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/e2e/utils/serving_utils.py -------------------------------------------------------------------------------- /tests/engine/test_async_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/engine/test_async_llm.py -------------------------------------------------------------------------------- /tests/engine/test_kv_cache_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/engine/test_kv_cache_interface.py -------------------------------------------------------------------------------- /tests/engine/test_sampling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/engine/test_sampling_params.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/benchmark/test_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/benchmark/test_eval.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/test_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/test_main.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/test_openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/test_openai.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/test_serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/test_serve.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/test_tokenizer_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/test_tokenizer_cli.py -------------------------------------------------------------------------------- /tests/entrypoints/cli/test_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/cli/test_types.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_run_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/openai/test_run_batch.py -------------------------------------------------------------------------------- /tests/entrypoints/test_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/test_chat.py -------------------------------------------------------------------------------- /tests/entrypoints/test_engine_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/test_engine_client.py -------------------------------------------------------------------------------- /tests/entrypoints/test_generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/test_generation.py -------------------------------------------------------------------------------- /tests/entrypoints/test_vllm_run_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/entrypoints/test_vllm_run_engine.py -------------------------------------------------------------------------------- /tests/eplb/test_async_expert_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/eplb/test_async_expert_loader.py -------------------------------------------------------------------------------- /tests/eplb/test_eplb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/eplb/test_eplb.py -------------------------------------------------------------------------------- /tests/eplb/test_eplb_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/eplb/test_eplb_utils.py -------------------------------------------------------------------------------- /tests/eplb/test_experts_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/eplb/test_experts_manager.py -------------------------------------------------------------------------------- /tests/input/test_ernie4_5_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_ernie4_5_processor.py -------------------------------------------------------------------------------- /tests/input/test_ernie_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_ernie_processor.py -------------------------------------------------------------------------------- /tests/input/test_ernie_vl_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_ernie_vl_processor.py -------------------------------------------------------------------------------- /tests/input/test_paddleocr_vl_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_paddleocr_vl_processor.py -------------------------------------------------------------------------------- /tests/input/test_process_video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_process_video.py -------------------------------------------------------------------------------- /tests/input/test_qwen_vl_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_qwen_vl_processor.py -------------------------------------------------------------------------------- /tests/input/test_text_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_text_processor.py -------------------------------------------------------------------------------- /tests/input/test_tokenizer_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/input/test_tokenizer_client.py -------------------------------------------------------------------------------- /tests/inter_communicator/test_e2w_queue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/inter_communicator/test_e2w_queue.py -------------------------------------------------------------------------------- /tests/layers/test_activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_activation.py -------------------------------------------------------------------------------- /tests/layers/test_append_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_append_attention.py -------------------------------------------------------------------------------- /tests/layers/test_attention_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_attention_layer.py -------------------------------------------------------------------------------- /tests/layers/test_ffn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_ffn.py -------------------------------------------------------------------------------- /tests/layers/test_fusedmoe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_fusedmoe.py -------------------------------------------------------------------------------- /tests/layers/test_guided_decoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_guided_decoding.py -------------------------------------------------------------------------------- /tests/layers/test_min_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_min_sampling.py -------------------------------------------------------------------------------- /tests/layers/test_moba_attention_backend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_moba_attention_backend.py -------------------------------------------------------------------------------- /tests/layers/test_native_paddle_backend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_native_paddle_backend.py -------------------------------------------------------------------------------- /tests/layers/test_plas_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_plas_attention.py -------------------------------------------------------------------------------- /tests/layers/test_quantized_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_quantized_linear.py -------------------------------------------------------------------------------- /tests/layers/test_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_sampler.py -------------------------------------------------------------------------------- /tests/layers/test_speculative_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_speculative_sampler.py -------------------------------------------------------------------------------- /tests/layers/test_w4a8_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/layers/test_w4a8_moe.py -------------------------------------------------------------------------------- /tests/logger/test_formatters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/logger/test_formatters.py -------------------------------------------------------------------------------- /tests/logger/test_handlers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/logger/test_handlers.py -------------------------------------------------------------------------------- /tests/logger/test_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/logger/test_logger.py -------------------------------------------------------------------------------- /tests/logger/test_setup_logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/logger/test_setup_logging.py -------------------------------------------------------------------------------- /tests/metrics/test_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/metrics/test_metrics.py -------------------------------------------------------------------------------- /tests/metrics/test_metrics_middleware.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/metrics/test_metrics_middleware.py -------------------------------------------------------------------------------- /tests/metrics/test_new_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/metrics/test_new_metrics.py -------------------------------------------------------------------------------- /tests/metrics/test_trace_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/metrics/test_trace_util.py -------------------------------------------------------------------------------- /tests/model_executor/test_ep.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_executor/test_ep.py -------------------------------------------------------------------------------- /tests/model_executor/test_tensor_wise_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_executor/test_tensor_wise_fp8.py -------------------------------------------------------------------------------- /tests/model_executor/test_tp_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_executor/test_tp_utils.py -------------------------------------------------------------------------------- /tests/model_loader/test_load_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_load_attention.py -------------------------------------------------------------------------------- /tests/model_loader/test_load_ernie_vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_load_ernie_vl.py -------------------------------------------------------------------------------- /tests/model_loader/test_load_mtp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_load_mtp.py -------------------------------------------------------------------------------- /tests/model_loader/test_model_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_model_cache.py -------------------------------------------------------------------------------- /tests/model_loader/test_offline_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_offline_model.py -------------------------------------------------------------------------------- /tests/model_loader/test_torch_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_torch_model.py -------------------------------------------------------------------------------- /tests/model_loader/test_w4a8_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/test_w4a8_model.py -------------------------------------------------------------------------------- /tests/model_loader/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/model_loader/utils.py -------------------------------------------------------------------------------- /tests/multimodal/test_hasher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/multimodal/test_hasher.py -------------------------------------------------------------------------------- /tests/multimodal/test_multimodal_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/multimodal/test_multimodal_utils.py -------------------------------------------------------------------------------- /tests/operators/test_air_top_p_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_air_top_p_sampling.py -------------------------------------------------------------------------------- /tests/operators/test_cutlass_scaled_mm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_cutlass_scaled_mm.py -------------------------------------------------------------------------------- /tests/operators/test_dequant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_dequant.py -------------------------------------------------------------------------------- /tests/operators/test_draft_model_update.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_draft_model_update.py -------------------------------------------------------------------------------- /tests/operators/test_flash_mask_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_flash_mask_attn.py -------------------------------------------------------------------------------- /tests/operators/test_fused_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_fused_moe.py -------------------------------------------------------------------------------- /tests/operators/test_gelu_tanh.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_gelu_tanh.py -------------------------------------------------------------------------------- /tests/operators/test_get_padding_offset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_get_padding_offset.py -------------------------------------------------------------------------------- /tests/operators/test_hybrid_mtp_ngram.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_hybrid_mtp_ngram.py -------------------------------------------------------------------------------- /tests/operators/test_machete_mm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_machete_mm.py -------------------------------------------------------------------------------- /tests/operators/test_moe_top_k_select.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_moe_top_k_select.py -------------------------------------------------------------------------------- /tests/operators/test_ngram_match.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_ngram_match.py -------------------------------------------------------------------------------- /tests/operators/test_noaux_tc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_noaux_tc.py -------------------------------------------------------------------------------- /tests/operators/test_noaux_tc_redundant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_noaux_tc_redundant.py -------------------------------------------------------------------------------- /tests/operators/test_per_token_quant.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_per_token_quant.py -------------------------------------------------------------------------------- /tests/operators/test_pre_cache_len_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_pre_cache_len_concat.py -------------------------------------------------------------------------------- /tests/operators/test_rebuild_padding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_rebuild_padding.py -------------------------------------------------------------------------------- /tests/operators/test_share_external_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_share_external_data.py -------------------------------------------------------------------------------- /tests/operators/test_speculate_update.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_speculate_update.py -------------------------------------------------------------------------------- /tests/operators/test_speculate_verify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_speculate_verify.py -------------------------------------------------------------------------------- /tests/operators/test_split_fuse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_split_fuse.py -------------------------------------------------------------------------------- /tests/operators/test_token_penalty.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_token_penalty.py -------------------------------------------------------------------------------- /tests/operators/test_top_k_renorm_probs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_top_k_renorm_probs.py -------------------------------------------------------------------------------- /tests/operators/test_top_p_candidates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_top_p_candidates.py -------------------------------------------------------------------------------- /tests/operators/test_tree_mask.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_tree_mask.py -------------------------------------------------------------------------------- /tests/operators/test_tritonmoe_preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_tritonmoe_preprocess.py -------------------------------------------------------------------------------- /tests/operators/test_update_attn_mask.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_update_attn_mask.py -------------------------------------------------------------------------------- /tests/operators/test_update_inputs_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_update_inputs_v1.py -------------------------------------------------------------------------------- /tests/operators/test_w4afp8_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_w4afp8_gemm.py -------------------------------------------------------------------------------- /tests/operators/test_wfp8afp8_sparse_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/operators/test_wfp8afp8_sparse_gemm.py -------------------------------------------------------------------------------- /tests/output/test_get_save_output_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/output/test_get_save_output_v1.py -------------------------------------------------------------------------------- /tests/output/test_pooler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/output/test_pooler.py -------------------------------------------------------------------------------- /tests/output/test_process_batch_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/output/test_process_batch_output.py -------------------------------------------------------------------------------- /tests/output/test_stream_transfer_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/output/test_stream_transfer_data.py -------------------------------------------------------------------------------- /tests/platforms/test_platforms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/platforms/test_platforms.py -------------------------------------------------------------------------------- /tests/platforms/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/platforms/test_utils.py -------------------------------------------------------------------------------- /tests/plugins/fd_add_dummy_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/plugins/fd_add_dummy_model/__init__.py -------------------------------------------------------------------------------- /tests/plugins/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/plugins/setup.py -------------------------------------------------------------------------------- /tests/plugins/test_model_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/plugins/test_model_registry.py -------------------------------------------------------------------------------- /tests/pooling/test_embedding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/pooling/test_embedding.py -------------------------------------------------------------------------------- /tests/quantization/test_kv_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/quantization/test_kv_cache.py -------------------------------------------------------------------------------- /tests/quantization/test_w4a8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/quantization/test_w4a8.py -------------------------------------------------------------------------------- /tests/quantization/test_w4afp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/quantization/test_w4afp8.py -------------------------------------------------------------------------------- /tests/reasoning/test_reasoning_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/reasoning/test_reasoning_parser.py -------------------------------------------------------------------------------- /tests/scheduler/test_dp_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/scheduler/test_dp_scheduler.py -------------------------------------------------------------------------------- /tests/scheduler/test_workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/scheduler/test_workers.py -------------------------------------------------------------------------------- /tests/splitwise/test_splitwise_connector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/splitwise/test_splitwise_connector.py -------------------------------------------------------------------------------- /tests/trace/test_constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/trace/test_constants.py -------------------------------------------------------------------------------- /tests/trace/test_trace_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/trace/test_trace_logger.py -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils.py -------------------------------------------------------------------------------- /tests/utils/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_config.py -------------------------------------------------------------------------------- /tests/utils/test_custom_chat_template.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_custom_chat_template.py -------------------------------------------------------------------------------- /tests/utils/test_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_download.py -------------------------------------------------------------------------------- /tests/utils/test_exception_handler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_exception_handler.py -------------------------------------------------------------------------------- /tests/utils/test_run_batch_tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_run_batch_tools.py -------------------------------------------------------------------------------- /tests/utils/test_version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/utils/test_version.py -------------------------------------------------------------------------------- /tests/v1/cache_manager/test_encoder_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/v1/cache_manager/test_encoder_cache.py -------------------------------------------------------------------------------- /tests/v1/cache_manager/test_prefix_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/v1/cache_manager/test_prefix_cache.py -------------------------------------------------------------------------------- /tests/v1/cache_manager/test_revert_blocks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/v1/cache_manager/test_revert_blocks.py -------------------------------------------------------------------------------- /tests/v1/test_resource_manager_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/v1/test_resource_manager_v1.py -------------------------------------------------------------------------------- /tests/v1/test_schedule_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/v1/test_schedule_output.py -------------------------------------------------------------------------------- /tests/woker/test_gpu_prompt_logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/woker/test_gpu_prompt_logprobs.py -------------------------------------------------------------------------------- /tests/woker/test_logprobs_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tests/woker/test_logprobs_output.py -------------------------------------------------------------------------------- /tools/codestyle/pre_commit.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/codestyle/pre_commit.sh -------------------------------------------------------------------------------- /tools/deep_gemm_pre-compile/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/deep_gemm_pre-compile/README.md -------------------------------------------------------------------------------- /tools/deep_gemm_pre-compile/pre_compile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/deep_gemm_pre-compile/pre_compile.py -------------------------------------------------------------------------------- /tools/deep_gemm_pre-compile/pre_compile.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/deep_gemm_pre-compile/pre_compile.sh -------------------------------------------------------------------------------- /tools/dockerfile/Dockerfile.ci: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/dockerfile/Dockerfile.ci -------------------------------------------------------------------------------- /tools/dockerfile/docker_build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/dockerfile/docker_build.sh -------------------------------------------------------------------------------- /tools/dockerfile/requirements_paddle_nv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/HEAD/tools/dockerfile/requirements_paddle_nv.txt --------------------------------------------------------------------------------