├── .buildkite ├── check-wheel-size.py ├── generate_index.py ├── lm-eval-harness │ ├── configs │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-70B-Instruct.yaml │ │ ├── Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-FP8.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml │ │ ├── Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml │ │ ├── Meta-Llama-3-8B-Instruct.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-FP8-compressed-tensors.yaml │ │ ├── Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Minitron-4B-Base-FP8.yaml │ │ ├── Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1-FP8.yaml │ │ ├── Mixtral-8x7B-Instruct-v0.1.yaml │ │ ├── Qwen1.5-MoE-W4A16-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-FP8W8.yaml │ │ ├── Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml │ │ ├── Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml │ │ ├── Qwen2-57B-A14-Instruct.yaml │ │ ├── Qwen2.5-1.5B-Instruct.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── SparseLlama3.1_2of4_fp8_compressed.yaml │ │ ├── models-large.txt │ │ └── models-small.txt │ ├── conftest.py │ ├── run-lm-eval-gsm-hf-baseline.sh │ ├── run-lm-eval-gsm-vllm-baseline.sh │ └── test_lm_eval_correctness.py ├── nightly-benchmarks │ ├── README.md │ ├── benchmark-pipeline.yaml │ ├── nightly-annotation.md │ ├── nightly-descriptions.md │ ├── nightly-pipeline.yaml │ ├── performance-benchmarks-descriptions.md │ ├── scripts │ │ ├── compare-json-results.py │ │ ├── convert-results-json-to-markdown.py │ │ ├── download-tokenizer.py │ │ ├── generate-nightly-markdown.py │ │ ├── get-lmdeploy-modelname.py │ │ ├── launch-server.sh │ │ ├── nightly-annotate.sh │ │ ├── run-nightly-benchmarks.sh │ │ ├── run-performance-benchmarks.sh │ │ ├── summary-nightly-results.py │ │ └── wait-for-image.sh │ └── tests │ │ ├── genai-perf-tests.json │ │ ├── latency-tests-cpu.json │ │ ├── latency-tests.json │ │ ├── nightly-tests.json │ │ ├── serving-tests-cpu-snc2.json │ │ ├── serving-tests-cpu-snc3.json │ │ ├── serving-tests-cpu.json │ │ ├── serving-tests.json │ │ ├── throughput-tests-cpu.json │ │ └── throughput-tests.json ├── pyproject.toml ├── release-pipeline.yaml ├── scripts │ ├── annotate-release.sh │ ├── ci-clean-log.sh │ ├── cleanup-nightly-builds.sh │ ├── hardware_ci │ │ ├── run-amd-test.sh │ │ ├── run-cpu-test-ppc64le.sh │ │ ├── run-cpu-test-s390x.sh │ │ ├── run-cpu-test.sh │ │ ├── run-gh200-test.sh │ │ ├── run-hpu-test.sh │ │ ├── run-tpu-v1-test-part2.sh │ │ ├── run-tpu-v1-test.sh │ │ └── run-xpu-test.sh │ ├── rerun-test.sh │ ├── run-benchmarks.sh │ ├── run-multi-node-test.sh │ ├── tpu │ │ ├── cleanup_docker.sh │ │ ├── config_v6e_1.env │ │ ├── docker_run_bm.sh │ │ ├── quantized_v6e_1.env │ │ └── run_bm.sh │ └── upload-wheels.sh └── test-pipeline.yaml ├── .clang-format ├── .coveragerc ├── .dockerignore ├── .gemini └── config.yaml ├── .github ├── .bc-linter.yml ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 200-installation.yml │ ├── 300-usage.yml │ ├── 400-bug-report.yml │ ├── 450-ci-failure.yml │ ├── 500-feature-request.yml │ ├── 600-new-model.yml │ ├── 700-performance-discussion.yml │ ├── 750-RFC.yml │ └── config.yml ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml ├── mergify.yml ├── scale-config.yml ├── scripts │ └── cleanup_pr_body.sh └── workflows │ ├── add_label_automerge.yml │ ├── bc-lint.yml │ ├── cleanup_pr_body.yml │ ├── issue_autolabel.yml │ ├── matchers │ ├── actionlint.json │ ├── markdownlint.json │ └── mypy.json │ ├── pre-commit.yml │ ├── reminder_comment.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── stale.yml ├── .gitignore ├── .markdownlint.yaml ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .shellcheckrc ├── .yapfignore ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DCO ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── SECURITY.md ├── benchmarks ├── README.md ├── auto_tune │ ├── README.md │ ├── auto_tune.sh │ └── batch_auto_tune.sh ├── backend_request_func.py ├── benchmark_block_pool.py ├── benchmark_latency.py ├── benchmark_long_document_qa_throughput.py ├── benchmark_ngram_proposer.py ├── benchmark_prefix_caching.py ├── benchmark_prioritization.py ├── benchmark_serving.py ├── benchmark_serving_structured_output.py ├── benchmark_throughput.py ├── benchmark_utils.py ├── cutlass_benchmarks │ ├── sparse_benchmarks.py │ ├── utils.py │ ├── w8a8_benchmarks.py │ └── weight_shapes.py ├── disagg_benchmarks │ ├── disagg_overhead_benchmark.sh │ ├── disagg_performance_benchmark.sh │ ├── disagg_prefill_proxy_server.py │ ├── rate_limiter.py │ ├── request_queue.py │ ├── round_robin_proxy.py │ └── visualize_benchmark_results.py ├── fused_kernels │ └── layernorm_rms_benchmarks.py ├── kernels │ ├── bench_block_fp8_gemm.py │ ├── bench_fp8_gemm.py │ ├── bench_int8_gemm.py │ ├── bench_nvfp4_gemm.py │ ├── bench_per_token_quant_fp8.py │ ├── benchmark_activation.py │ ├── benchmark_bitblas.py │ ├── benchmark_cutlass_fp4_moe.py │ ├── benchmark_device_communicators.py │ ├── benchmark_grouped_gemm_cutlass.py │ ├── benchmark_layernorm.py │ ├── benchmark_lora.py │ ├── benchmark_machete.py │ ├── benchmark_marlin.py │ ├── benchmark_moe.py │ ├── benchmark_moe_align_block_size.py │ ├── benchmark_moe_permute_unpermute.py │ ├── benchmark_mrope.py │ ├── benchmark_paged_attention.py │ ├── benchmark_per_token_group_quant.py │ ├── benchmark_polynorm.py │ ├── benchmark_quant.py │ ├── benchmark_reshape_and_cache_flash.py │ ├── benchmark_rmsnorm.py │ ├── benchmark_rope.py │ ├── benchmark_shapes.py │ ├── benchmark_silu_mul_fp8_quant.py │ ├── benchmark_trtllm_decode_attention.py │ ├── benchmark_trtllm_prefill_attention.py │ ├── benchmark_w8a8_block_fp8.py │ ├── deepgemm │ │ ├── README.md │ │ └── benchmark_fp8_block_dense_gemm.py │ ├── graph_machete_bench.py │ ├── requirements.txt │ ├── utils.py │ └── weight_shapes.py ├── multi_turn │ ├── README.md │ ├── bench_dataset.py │ ├── bench_utils.py │ ├── benchmark_serving_multi_turn.py │ ├── convert_sharegpt_to_openai.py │ ├── generate_multi_turn.json │ └── requirements.txt ├── overheads │ └── benchmark_hashing.py ├── pyproject.toml ├── run_structured_output_benchmark.sh ├── sonnet.txt └── structured_schemas │ └── structured_schema_1.json ├── cmake ├── cpu_extension.cmake ├── external_projects │ ├── flashmla.cmake │ └── vllm_flash_attn.cmake ├── hipify.py └── utils.cmake ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cuh │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ ├── dtype_float32.cuh │ ├── dtype_fp8.cuh │ ├── merge_attn_states.cu │ ├── mla │ │ ├── cutlass_sm100_mla │ │ │ ├── device │ │ │ │ └── sm100_mla.hpp │ │ │ └── kernel │ │ │ │ ├── sm100_fmha_mla_reduction.hpp │ │ │ │ ├── sm100_fmha_mla_tma_warpspecialized.hpp │ │ │ │ └── sm100_mla_tile_scheduler.hpp │ │ └── sm100_cutlass_mla_kernel.cu │ ├── paged_attention_v1.cu │ ├── paged_attention_v2.cu │ └── vertical_slash_index.cu ├── cache.h ├── cache_kernels.cu ├── core │ ├── exception.hpp │ ├── math.hpp │ ├── registration.h │ └── scalar_type.hpp ├── cpu │ ├── activation.cpp │ ├── attention.cpp │ ├── cache.cpp │ ├── cpu_types.hpp │ ├── cpu_types_arm.hpp │ ├── cpu_types_vsx.hpp │ ├── cpu_types_vxe.hpp │ ├── cpu_types_x86.hpp │ ├── dnnl_helper.cpp │ ├── dnnl_helper.h │ ├── dnnl_kernels.cpp │ ├── layernorm.cpp │ ├── mla_decode.cpp │ ├── pos_encoding.cpp │ ├── sgl-kernels │ │ ├── common.h │ │ ├── gemm.cpp │ │ ├── gemm.h │ │ ├── gemm_fp8.cpp │ │ ├── gemm_int8.cpp │ │ ├── moe.cpp │ │ ├── moe_fp8.cpp │ │ ├── moe_int8.cpp │ │ └── vec.h │ ├── shm.cpp │ ├── torch_bindings.cpp │ └── utils.cpp ├── cub_helpers.h ├── cuda_compat.h ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── cuda_view.cu ├── cumem_allocator.cpp ├── custom_all_reduce.cu ├── custom_all_reduce.cuh ├── custom_all_reduce_test.cu ├── custom_quickreduce.cu ├── cutlass_extensions │ ├── common.cpp │ ├── common.hpp │ ├── cute_utils.cuh │ ├── epilogue │ │ ├── broadcast_load_epilogue_array_c3x.hpp │ │ ├── broadcast_load_epilogue_c2x.hpp │ │ ├── broadcast_load_epilogue_c3x.hpp │ │ ├── scaled_mm_epilogues_c2x.hpp │ │ └── scaled_mm_epilogues_c3x.hpp │ ├── torch_utils.hpp │ ├── vllm_collective_builder.cuh │ ├── vllm_custom_types.cuh │ ├── vllm_cutlass_library_extension.py │ ├── vllm_numeric_conversion.cuh │ └── vllm_type_utils.cuh ├── dispatch_utils.h ├── layernorm_kernels.cu ├── layernorm_quant_kernels.cu ├── mamba │ └── mamba_ssm │ │ ├── selective_scan.h │ │ ├── selective_scan_fwd.cu │ │ └── static_switch.h ├── moe │ ├── grouped_topk_kernels.cu │ ├── marlin_moe_wna16 │ │ ├── .gitignore │ │ ├── generate_kernels.py │ │ ├── kernel.h │ │ ├── marlin_template.h │ │ └── ops.cu │ ├── moe_align_sum_kernels.cu │ ├── moe_ops.h │ ├── moe_permute_unpermute_op.cu │ ├── moe_wna16.cu │ ├── moe_wna16_utils.h │ ├── permute_unpermute_kernels │ │ ├── dispatch.h │ │ ├── moe_permute_unpermute_kernel.cu │ │ ├── moe_permute_unpermute_kernel.h │ │ └── moe_permute_unpermute_kernel.inl │ ├── topk_softmax_kernels.cu │ └── torch_bindings.cpp ├── ops.h ├── permute_cols.cu ├── pos_encoding_kernels.cu ├── quantization │ ├── activation_kernels.cu │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ ├── compressed_tensors │ │ └── int8_quant_kernels.cu │ ├── cutlass_w4a8 │ │ └── w4a8_mm_entry.cu │ ├── cutlass_w8a8 │ │ ├── Epilogues.md │ │ ├── c3x │ │ │ ├── cutlass_gemm_caller.cuh │ │ │ ├── scaled_mm.cuh │ │ │ ├── scaled_mm_azp_sm90_int8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_blockwise_sm120_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm120_fp8_dispatch.cuh │ │ │ ├── scaled_mm_blockwise_sm90_fp8.cu │ │ │ ├── scaled_mm_blockwise_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_helper.hpp │ │ │ ├── scaled_mm_kernels.hpp │ │ │ ├── scaled_mm_sm100_fp8.cu │ │ │ ├── scaled_mm_sm100_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm120_fp8.cu │ │ │ ├── scaled_mm_sm120_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_fp8.cu │ │ │ ├── scaled_mm_sm90_fp8_dispatch.cuh │ │ │ ├── scaled_mm_sm90_int8.cu │ │ │ └── scaled_mm_sm90_int8_dispatch.cuh │ │ ├── moe │ │ │ ├── blockwise_scaled_group_mm_sm100.cu │ │ │ ├── get_group_starts.cuh │ │ │ ├── grouped_mm_c3x.cuh │ │ │ ├── grouped_mm_c3x_sm100.cu │ │ │ ├── grouped_mm_c3x_sm90.cu │ │ │ └── moe_data.cu │ │ ├── scaled_mm_c2x.cu │ │ ├── scaled_mm_c2x.cuh │ │ ├── scaled_mm_c2x_sm75_dispatch.cuh │ │ ├── scaled_mm_c2x_sm80_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_fp8_dispatch.cuh │ │ ├── scaled_mm_c2x_sm89_int8_dispatch.cuh │ │ ├── scaled_mm_c3x_sm100.cu │ │ ├── scaled_mm_c3x_sm120.cu │ │ ├── scaled_mm_c3x_sm90.cu │ │ └── scaled_mm_entry.cu │ ├── fp4 │ │ ├── activation_nvfp4_quant_fusion_kernels.cu │ │ ├── nvfp4_blockwise_moe_kernel.cu │ │ ├── nvfp4_experts_quant.cu │ │ ├── nvfp4_quant_entry.cu │ │ ├── nvfp4_quant_kernels.cu │ │ ├── nvfp4_scaled_mm_entry.cu │ │ ├── nvfp4_scaled_mm_kernels.cu │ │ ├── nvfp4_scaled_mm_sm120_kernels.cu │ │ └── nvfp4_utils.cuh │ ├── fp8 │ │ ├── amd │ │ │ └── quant_utils.cuh │ │ ├── common.cu │ │ ├── common.cuh │ │ ├── nvidia │ │ │ └── quant_utils.cuh │ │ └── per_token_group_quant.cu │ ├── fused_kernels │ │ ├── fused_layernorm_dynamic_per_token_quant.cu │ │ ├── layernorm_utils.cuh │ │ └── quant_conversions.cuh │ ├── gguf │ │ ├── dequantize.cuh │ │ ├── ggml-common.h │ │ ├── gguf_kernel.cu │ │ ├── mmq.cuh │ │ ├── mmvq.cuh │ │ ├── moe.cuh │ │ ├── moe_vec.cuh │ │ └── vecdotq.cuh │ ├── gptq │ │ ├── compat.cuh │ │ ├── matrix_view.cuh │ │ ├── q_gemm.cu │ │ ├── qdq_2.cuh │ │ ├── qdq_3.cuh │ │ ├── qdq_4.cuh │ │ ├── qdq_8.cuh │ │ └── qdq_util.cuh │ ├── gptq_allspark │ │ ├── allspark_qgemm_w8a16.cu │ │ ├── allspark_repack.cu │ │ └── allspark_utils.cuh │ ├── gptq_marlin │ │ ├── .gitignore │ │ ├── awq_marlin_repack.cu │ │ ├── dequant.h │ │ ├── generate_kernels.py │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin_repack.cu │ │ ├── kernel.h │ │ ├── marlin.cuh │ │ ├── marlin_dtypes.cuh │ │ └── marlin_template.h │ ├── hadamard │ │ └── hadacore │ │ │ └── hadamard_transform_cuda.cu │ ├── machete │ │ ├── Readme.md │ │ ├── generate.py │ │ ├── machete_collective_builder.cuh │ │ ├── machete_interleaving_utils.cuh │ │ ├── machete_mainloop.cuh │ │ ├── machete_mm_kernel.cuh │ │ ├── machete_mm_launcher.cuh │ │ ├── machete_prepack_kernel.cuh │ │ ├── machete_prepack_launcher.cuh │ │ ├── machete_prepacked_layout.cuh │ │ └── machete_pytorch.cu │ ├── marlin │ │ └── sparse │ │ │ ├── LICENSE │ │ │ ├── common │ │ │ ├── base.h │ │ │ ├── mem.h │ │ │ └── mma.h │ │ │ └── marlin_24_cuda_kernel.cu │ ├── per_token_group_quant_8bit.h │ ├── utils.cuh │ ├── vectorization.cuh │ └── vectorization_utils.cuh ├── quickreduce │ ├── base.h │ ├── quick_reduce.h │ └── quick_reduce_impl.cuh ├── rocm │ ├── attention.cu │ ├── ops.h │ ├── skinny_gemms.cu │ └── torch_bindings.cpp ├── sampler.cu ├── sparse │ └── cutlass │ │ ├── sparse_compressor_c3x.cuh │ │ ├── sparse_scaled_mm_c3x.cu │ │ ├── sparse_scaled_mm_c3x.cuh │ │ └── sparse_scaled_mm_entry.cu ├── torch_bindings.cpp └── type_convert.cuh ├── docker ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.nightly_torch ├── Dockerfile.ppc64le ├── Dockerfile.rocm ├── Dockerfile.rocm_base ├── Dockerfile.s390x ├── Dockerfile.tpu └── Dockerfile.xpu ├── docs ├── .nav.yml ├── README.md ├── api │ ├── README.md │ └── vllm │ │ └── .meta.yml ├── assets │ ├── contributing │ │ └── dockerfile-stages-dependency.png │ ├── deployment │ │ ├── anything-llm-chat-with-doc.png │ │ ├── anything-llm-chat-without-doc.png │ │ ├── anything-llm-provider.png │ │ ├── anything-llm-upload-doc.png │ │ ├── architecture_helm_deployment.png │ │ ├── chatbox-chat.png │ │ ├── chatbox-settings.png │ │ ├── dify-chat.png │ │ ├── dify-create-chatbot.png │ │ ├── dify-settings.png │ │ ├── dp_external_lb.png │ │ ├── dp_internal_lb.png │ │ ├── open_webui.png │ │ └── streamlit-chat.png │ ├── design │ │ ├── arch_overview │ │ │ ├── entrypoints.excalidraw.png │ │ │ └── llm_engine.excalidraw.png │ │ ├── fused_moe_modular_kernel │ │ │ ├── fused_experts_blocks.png │ │ │ ├── fused_moe_batched.png │ │ │ ├── fused_moe_non_batched.png │ │ │ └── prepare_and_finalize_blocks.png │ │ ├── hierarchy.png │ │ ├── hybrid_kv_cache_manager │ │ │ ├── basic_grouping_example.png │ │ │ ├── full_attn.png │ │ │ ├── memory_layout.png │ │ │ ├── overview.png │ │ │ └── sw_attn.png │ │ ├── metrics │ │ │ ├── intervals-1.png │ │ │ ├── intervals-2.png │ │ │ └── intervals-3.png │ │ ├── paged_attention │ │ │ ├── k_vecs.png │ │ │ ├── key.png │ │ │ ├── logits_vec.png │ │ │ ├── q_vecs.png │ │ │ ├── query.png │ │ │ ├── v_vec.png │ │ │ └── value.png │ │ ├── prefix_caching │ │ │ ├── example-time-1.png │ │ │ ├── example-time-3.png │ │ │ ├── example-time-4.png │ │ │ ├── example-time-5.png │ │ │ ├── example-time-6.png │ │ │ ├── example-time-7.png │ │ │ ├── free.png │ │ │ └── overview.png │ │ └── tpu │ │ │ └── most_model_len.png │ ├── features │ │ └── disagg_prefill │ │ │ ├── abstraction.jpg │ │ │ ├── high_level_design.png │ │ │ ├── overview.jpg │ │ │ └── workflow.png │ └── logos │ │ ├── vllm-logo-only-light.ico │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png ├── cli │ ├── .meta.yml │ ├── .nav.yml │ ├── README.md │ ├── bench │ │ ├── latency.md │ │ ├── serve.md │ │ └── throughput.md │ ├── chat.md │ ├── complete.md │ ├── json_tip.inc.md │ ├── run-batch.md │ └── serve.md ├── community │ ├── contact_us.md │ ├── meetups.md │ └── sponsors.md ├── configuration │ ├── README.md │ ├── conserving_memory.md │ ├── engine_args.md │ ├── env_vars.md │ ├── model_resolution.md │ ├── optimization.md │ ├── serve_args.md │ └── tpu.md ├── contributing │ ├── README.md │ ├── benchmarks.md │ ├── ci │ │ ├── failures.md │ │ └── update_pytorch_version.md │ ├── deprecation_policy.md │ ├── dockerfile │ │ └── dockerfile.md │ ├── incremental_build.md │ ├── model │ │ ├── README.md │ │ ├── basic.md │ │ ├── multimodal.md │ │ ├── registration.md │ │ ├── tests.md │ │ └── transcription.md │ ├── profiling.md │ └── vulnerability_management.md ├── deployment │ ├── docker.md │ ├── frameworks │ │ ├── anyscale.md │ │ ├── anything-llm.md │ │ ├── autogen.md │ │ ├── bentoml.md │ │ ├── cerebrium.md │ │ ├── chatbox.md │ │ ├── dify.md │ │ ├── dstack.md │ │ ├── haystack.md │ │ ├── helm.md │ │ ├── litellm.md │ │ ├── lobe-chat.md │ │ ├── lws.md │ │ ├── modal.md │ │ ├── open-webui.md │ │ ├── retrieval_augmented_generation.md │ │ ├── skypilot.md │ │ ├── streamlit.md │ │ └── triton.md │ ├── integrations │ │ ├── kserve.md │ │ ├── kubeai.md │ │ ├── kuberay.md │ │ ├── llamastack.md │ │ ├── llmaz.md │ │ └── production-stack.md │ ├── k8s.md │ └── nginx.md ├── design │ ├── arch_overview.md │ ├── fused_moe_modular_kernel.md │ ├── huggingface_integration.md │ ├── hybrid_kv_cache_manager.md │ ├── io_processor_plugins.md │ ├── logits_processors.md │ ├── metrics.md │ ├── mm_processing.md │ ├── multiprocessing.md │ ├── p2p_nccl_connector.md │ ├── paged_attention.md │ ├── plugin_system.md │ ├── prefix_caching.md │ └── torch_compile.md ├── examples │ └── README.md ├── features │ ├── README.md │ ├── automatic_prefix_caching.md │ ├── custom_arguments.md │ ├── custom_logitsprocs.md │ ├── disagg_prefill.md │ ├── lora.md │ ├── multimodal_inputs.md │ ├── prompt_embeds.md │ ├── quantization │ │ ├── README.md │ │ ├── auto_awq.md │ │ ├── auto_round.md │ │ ├── bitblas.md │ │ ├── bnb.md │ │ ├── fp8.md │ │ ├── gguf.md │ │ ├── gptqmodel.md │ │ ├── inc.md │ │ ├── int4.md │ │ ├── int8.md │ │ ├── modelopt.md │ │ ├── quantized_kvcache.md │ │ ├── quark.md │ │ └── torchao.md │ ├── reasoning_outputs.md │ ├── sleep_mode.md │ ├── spec_decode.md │ ├── structured_outputs.md │ └── tool_calling.md ├── getting_started │ ├── installation │ │ ├── .nav.yml │ │ ├── README.md │ │ ├── cpu.md │ │ ├── cpu │ │ │ ├── apple.inc.md │ │ │ ├── arm.inc.md │ │ │ ├── build.inc.md │ │ │ ├── s390x.inc.md │ │ │ └── x86.inc.md │ │ ├── device.template.md │ │ ├── google_tpu.md │ │ ├── gpu.md │ │ ├── gpu │ │ │ ├── cuda.inc.md │ │ │ ├── rocm.inc.md │ │ │ └── xpu.inc.md │ │ └── python_env_setup.inc.md │ └── quickstart.md ├── mkdocs │ ├── hooks │ │ ├── generate_argparse.py │ │ ├── generate_examples.py │ │ ├── remove_announcement.py │ │ └── url_schemes.py │ ├── javascript │ │ ├── edit_and_feedback.js │ │ ├── mathjax.js │ │ ├── run_llm_widget.js │ │ └── slack_and_forum.js │ ├── overrides │ │ ├── main.html │ │ └── partials │ │ │ └── toc-item.html │ └── stylesheets │ │ └── extra.css ├── models │ ├── extensions │ │ ├── fastsafetensor.md │ │ ├── runai_model_streamer.md │ │ └── tensorizer.md │ ├── generative_models.md │ ├── hardware_supported_models │ │ └── tpu.md │ ├── pooling_models.md │ └── supported_models.md ├── serving │ ├── data_parallel_deployment.md │ ├── distributed_troubleshooting.md │ ├── expert_parallel_deployment.md │ ├── integrations │ │ ├── langchain.md │ │ └── llamaindex.md │ ├── offline_inference.md │ ├── openai_compatible_server.md │ └── parallelism_scaling.md ├── training │ ├── rlhf.md │ └── trl.md └── usage │ ├── README.md │ ├── faq.md │ ├── metrics.md │ ├── reproducibility.md │ ├── security.md │ ├── troubleshooting.md │ ├── usage_stats.md │ └── v1_guide.md ├── examples ├── offline_inference │ ├── async_llm_streaming.py │ ├── audio_language.py │ ├── automatic_prefix_caching.py │ ├── basic │ │ ├── README.md │ │ ├── basic.py │ │ ├── chat.py │ │ ├── classify.py │ │ ├── embed.py │ │ ├── generate.py │ │ ├── reward.py │ │ └── score.py │ ├── batch_llm_inference.py │ ├── chat_with_tools.py │ ├── context_extension.py │ ├── data_parallel.py │ ├── disaggregated-prefill-v1 │ │ ├── README.md │ │ ├── decode_example.py │ │ ├── prefill_example.py │ │ └── run.sh │ ├── disaggregated_prefill.py │ ├── encoder_decoder_multimodal.py │ ├── llm_engine_example.py │ ├── load_sharded_state.py │ ├── logits_processor │ │ ├── custom.py │ │ ├── custom_req.py │ │ └── custom_req_init.py │ ├── lora_with_quantization_inference.py │ ├── metrics.py │ ├── mistral-small.py │ ├── mlpspeculator.py │ ├── multilora_inference.py │ ├── openai_batch │ │ ├── README.md │ │ └── openai_example_batch.jsonl │ ├── pooling │ │ ├── README.md │ │ ├── convert_model_to_seq_cls.py │ │ ├── embed_jina_embeddings_v3.py │ │ ├── embed_matryoshka_fy.py │ │ └── qwen3_reranker.py │ ├── prefix_caching.py │ ├── prithvi_geospatial_mae.py │ ├── prithvi_geospatial_mae_io_processor.py │ ├── profiling.py │ ├── profiling_tpu │ │ ├── README.md │ │ └── profiling.py │ ├── prompt_embed_inference.py │ ├── qwen2_5_omni │ │ ├── README.md │ │ └── only_thinker.py │ ├── qwen_1m.py │ ├── reproducibility.py │ ├── rlhf.py │ ├── rlhf_colocate.py │ ├── rlhf_utils.py │ ├── save_sharded_state.py │ ├── simple_profiling.py │ ├── skip_loading_weights_in_engine_init.py │ ├── spec_decode.py │ ├── structured_outputs.py │ ├── torchrun_example.py │ ├── tpu.py │ ├── vision_language.py │ ├── vision_language_multi_image.py │ └── vision_language_pooling.py ├── online_serving │ ├── api_client.py │ ├── chart-helm │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── README.md │ │ ├── ct.yaml │ │ ├── lintconf.yaml │ │ ├── templates │ │ │ ├── _helpers.tpl │ │ │ ├── configmap.yaml │ │ │ ├── custom-objects.yaml │ │ │ ├── deployment.yaml │ │ │ ├── hpa.yaml │ │ │ ├── job.yaml │ │ │ ├── poddisruptionbudget.yaml │ │ │ ├── pvc.yaml │ │ │ ├── secrets.yaml │ │ │ └── service.yaml │ │ ├── values.schema.json │ │ └── values.yaml │ ├── dashboards │ │ ├── README.md │ │ ├── grafana │ │ │ ├── README.md │ │ │ ├── performance_statistics.json │ │ │ └── query_statistics.json │ │ └── perses │ │ │ ├── README.md │ │ │ ├── performance_statistics.yaml │ │ │ └── query_statistics.yaml │ ├── disaggregated_prefill.sh │ ├── disaggregated_serving │ │ ├── README.md │ │ ├── disagg_proxy_demo.py │ │ └── kv_events.sh │ ├── disaggregated_serving_p2p_nccl_xpyd │ │ ├── disagg_example_p2p_nccl_xpyd.sh │ │ └── disagg_proxy_p2p_nccl_xpyd.py │ ├── elastic_ep │ │ ├── bench.sh │ │ ├── scale.py │ │ └── serve_deepseek_v2.sh │ ├── gradio_openai_chatbot_webserver.py │ ├── gradio_webserver.py │ ├── kv_events_subscriber.py │ ├── multi-node-serving.sh │ ├── multi_instance_data_parallel.py │ ├── openai_chat_completion_client.py │ ├── openai_chat_completion_client_for_multimodal.py │ ├── openai_chat_completion_client_with_tools.py │ ├── openai_chat_completion_client_with_tools_required.py │ ├── openai_chat_completion_client_with_tools_xlam.py │ ├── openai_chat_completion_client_with_tools_xlam_streaming.py │ ├── openai_chat_completion_tool_calls_with_reasoning.py │ ├── openai_chat_completion_with_reasoning.py │ ├── openai_chat_completion_with_reasoning_streaming.py │ ├── openai_completion_client.py │ ├── openai_cross_encoder_score.py │ ├── openai_cross_encoder_score_for_multimodal.py │ ├── openai_embedding_long_text │ │ ├── README.md │ │ ├── client.py │ │ └── service.sh │ ├── openai_transcription_client.py │ ├── openai_translation_client.py │ ├── opentelemetry │ │ ├── README.md │ │ └── dummy_client.py │ ├── pooling │ │ ├── README.md │ │ ├── cohere_rerank_client.py │ │ ├── jinaai_rerank_client.py │ │ ├── openai_chat_embedding_client_for_multimodal.py │ │ ├── openai_classification_client.py │ │ ├── openai_embedding_client.py │ │ ├── openai_embedding_matryoshka_fy.py │ │ └── openai_pooling_client.py │ ├── prithvi_geospatial_mae.py │ ├── prometheus_grafana │ │ ├── README.md │ │ ├── docker-compose.yaml │ │ ├── grafana.json │ │ └── prometheus.yaml │ ├── prompt_embed_inference_with_openai_client.py │ ├── ray_serve_deepseek.py │ ├── retrieval_augmented_generation_with_langchain.py │ ├── retrieval_augmented_generation_with_llamaindex.py │ ├── run_cluster.sh │ ├── sagemaker-entrypoint.sh │ ├── streamlit_openai_chatbot_webserver.py │ ├── structured_outputs │ │ ├── README.md │ │ ├── pyproject.toml │ │ └── structured_outputs.py │ └── utils.py ├── others │ ├── lmcache │ │ ├── README.md │ │ ├── cpu_offload_lmcache.py │ │ ├── disagg_prefill_lmcache_v0.py │ │ ├── disagg_prefill_lmcache_v1 │ │ │ ├── configs │ │ │ │ ├── lmcache-decoder-config.yaml │ │ │ │ └── lmcache-prefiller-config.yaml │ │ │ ├── disagg_example_nixl.sh │ │ │ ├── disagg_proxy_server.py │ │ │ └── disagg_vllm_launcher.sh │ │ └── kv_cache_sharing_lmcache_v1.py │ ├── logging_configuration.md │ └── tensorize_vllm_model.py ├── pyproject.toml ├── template_alpaca.jinja ├── template_baichuan.jinja ├── template_chatglm.jinja ├── template_chatglm2.jinja ├── template_chatml.jinja ├── template_dse_qwen2_vl.jinja ├── template_falcon.jinja ├── template_falcon_180b.jinja ├── template_inkbot.jinja ├── template_teleflm.jinja ├── template_vlm2vec.jinja ├── tool_chat_template_deepseekr1.jinja ├── tool_chat_template_deepseekv3.jinja ├── tool_chat_template_deepseekv31.jinja ├── tool_chat_template_gemma3_pythonic.jinja ├── tool_chat_template_granite.jinja ├── tool_chat_template_granite_20b_fc.jinja ├── tool_chat_template_hermes.jinja ├── tool_chat_template_hunyuan_a13b.jinja ├── tool_chat_template_internlm2_tool.jinja ├── tool_chat_template_llama3.1_json.jinja ├── tool_chat_template_llama3.2_json.jinja ├── tool_chat_template_llama3.2_pythonic.jinja ├── tool_chat_template_llama4_json.jinja ├── tool_chat_template_llama4_pythonic.jinja ├── tool_chat_template_minimax_m1.jinja ├── tool_chat_template_mistral.jinja ├── tool_chat_template_mistral3.jinja ├── tool_chat_template_mistral_parallel.jinja ├── tool_chat_template_phi4_mini.jinja ├── tool_chat_template_qwen3coder.jinja ├── tool_chat_template_toolace.jinja ├── tool_chat_template_xlam_llama.jinja └── tool_chat_template_xlam_qwen.jinja ├── format.sh ├── mkdocs.yaml ├── pyproject.toml ├── requirements ├── build.txt ├── common.txt ├── cpu-build.txt ├── cpu.txt ├── cuda.txt ├── dev.txt ├── docs.txt ├── kv_connectors.txt ├── lint.txt ├── nightly_torch_test.txt ├── rocm-build.txt ├── rocm-test.txt ├── rocm.txt ├── test.in ├── test.txt ├── tpu.txt └── xpu.txt ├── setup.py ├── tests ├── __init__.py ├── basic_correctness │ ├── __init__.py │ ├── test_basic_correctness.py │ ├── test_cpu_offload.py │ └── test_cumem.py ├── benchmarks │ ├── __init__.py │ ├── test_latency_cli.py │ ├── test_random_dataset.py │ ├── test_serve_cli.py │ └── test_throughput_cli.py ├── build_cython.py ├── ci_envs.py ├── compile │ ├── __init__.py │ ├── backend.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── test_full_cudagraph.py │ │ ├── test_multiple_graphs.py │ │ ├── test_simple.py │ │ └── test_toy_llama.py │ ├── silly_attention.py │ ├── test_async_tp.py │ ├── test_basic_correctness.py │ ├── test_config.py │ ├── test_decorator.py │ ├── test_full_graph.py │ ├── test_functionalization.py │ ├── test_fusion.py │ ├── test_fusion_all_reduce.py │ ├── test_fusion_attn.py │ ├── test_noop_elimination.py │ ├── test_pass_manager.py │ ├── test_sequence_parallelism.py │ ├── test_silu_mul_quant_fusion.py │ └── test_wrapper.py ├── config │ ├── test_config.yaml │ ├── test_config_generation.py │ ├── test_config_with_model.yaml │ └── test_mp_reducer.py ├── conftest.py ├── cuda │ └── test_cuda_context.py ├── detokenizer │ ├── __init__.py │ ├── test_disable_detokenization.py │ ├── test_min_tokens.py │ ├── test_stop_reason.py │ ├── test_stop_string_while_stop_model_terminates.py │ └── test_stop_strings.py ├── distributed │ ├── __init__.py │ ├── conftest.py │ ├── test_ca_buffer_sharing.py │ ├── test_comm_ops.py │ ├── test_context_parallel.py │ ├── test_custom_all_reduce.py │ ├── test_distributed_oot.py │ ├── test_eplb_algo.py │ ├── test_eplb_execute.py │ ├── test_events.py │ ├── test_expert_parallel.py │ ├── test_expert_placement.py │ ├── test_kvlayout.py │ ├── test_multi_node_assignment.py │ ├── test_node_count.py │ ├── test_pipeline_parallel.py │ ├── test_pipeline_partition.py │ ├── test_pp_cudagraph.py │ ├── test_pynccl.py │ ├── test_quick_all_reduce.py │ ├── test_same_node.py │ ├── test_sequence_parallel.py │ ├── test_shm_broadcast.py │ ├── test_shm_buffer.py │ ├── test_shm_storage.py │ ├── test_symm_mem_allreduce.py │ ├── test_torchrun_example.py │ └── test_utils.py ├── engine │ ├── __init__.py │ ├── test_arg_utils.py │ └── test_short_mm_context.py ├── entrypoints │ ├── __init__.py │ ├── conftest.py │ ├── llm │ │ ├── __init__.py │ │ ├── test_accuracy.py │ │ ├── test_chat.py │ │ ├── test_collective_rpc.py │ │ ├── test_generate.py │ │ ├── test_gpu_utilization.py │ │ ├── test_lazy_outlines.py │ │ └── test_prompt_validation.py │ ├── offline_mode │ │ ├── __init__.py │ │ └── test_offline_mode.py │ ├── openai │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── correctness │ │ │ ├── __init__.py │ │ │ ├── test_lmeval.py │ │ │ └── test_transcription_api_correctness.py │ │ ├── test_async_tokenization.py │ │ ├── test_audio.py │ │ ├── test_basic.py │ │ ├── test_chat.py │ │ ├── test_chat_echo.py │ │ ├── test_chat_logit_bias_validation.py │ │ ├── test_chat_template.py │ │ ├── test_chat_with_tool_reasoning.py │ │ ├── test_chunked_prompt.py │ │ ├── test_cli_args.py │ │ ├── test_collective_rpc.py │ │ ├── test_completion.py │ │ ├── test_completion_with_function_calling.py │ │ ├── test_completion_with_prompt_embeds.py │ │ ├── test_default_mm_loras.py │ │ ├── test_lora_adapters.py │ │ ├── test_lora_resolvers.py │ │ ├── test_metrics.py │ │ ├── test_models.py │ │ ├── test_oot_registration.py │ │ ├── test_openai_schema.py │ │ ├── test_optional_middleware.py │ │ ├── test_prompt_validation.py │ │ ├── test_response_api_with_harmony.py │ │ ├── test_return_token_ids.py │ │ ├── test_return_tokens_as_ids.py │ │ ├── test_root_path.py │ │ ├── test_run_batch.py │ │ ├── test_serving_chat.py │ │ ├── test_serving_models.py │ │ ├── test_shutdown.py │ │ ├── test_skip_tokenizer.py │ │ ├── test_sleep.py │ │ ├── test_tensorizer_entrypoint.py │ │ ├── test_token_in_token_out.py │ │ ├── test_tokenization.py │ │ ├── test_transcription_validation.py │ │ ├── test_translation_validation.py │ │ ├── test_uds.py │ │ ├── test_video.py │ │ ├── test_vision.py │ │ └── tool_parsers │ │ │ ├── __init__.py │ │ │ ├── test_hermes_tool_parser.py │ │ │ ├── test_hunyuan_a13b_tool_parser.py │ │ │ ├── test_llama3_json_tool_parser.py │ │ │ ├── test_llama4_pythonic_tool_parser.py │ │ │ ├── test_pythonic_tool_parser.py │ │ │ └── utils.py │ ├── pooling │ │ ├── __init__.py │ │ ├── correctness │ │ │ ├── __init__.py │ │ │ ├── test_mteb_embed.py │ │ │ └── test_mteb_score.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── test_classify.py │ │ │ ├── test_embedding.py │ │ │ ├── test_encode.py │ │ │ ├── test_reward.py │ │ │ └── test_score.py │ │ └── openai │ │ │ ├── __init__.py │ │ │ ├── test_classification.py │ │ │ ├── test_embedding.py │ │ │ ├── test_embedding_dimensions.py │ │ │ ├── test_embedding_long_text.py │ │ │ ├── test_pooling.py │ │ │ ├── test_rerank.py │ │ │ ├── test_score.py │ │ │ ├── test_truncation.py │ │ │ └── test_vision_embedding.py │ ├── test_api_server_process_manager.py │ ├── test_chat_utils.py │ ├── test_context.py │ ├── test_renderer.py │ └── test_ssl_cert_refresher.py ├── evals │ ├── gpt_oss │ │ ├── __init__.py │ │ ├── conftest.py │ │ └── test_gpqa_correctness.py │ └── gsm8k │ │ ├── README.md │ │ ├── __init__.py │ │ ├── configs │ │ ├── DeepSeek-V2-Lite-Instruct-FP8.yaml │ │ ├── Llama-3-8B-Instruct-nonuniform-CT.yaml │ │ ├── Llama-3.2-1B-Instruct-INT8-CT.yaml │ │ ├── Qwen1.5-MoE-W4A16-CT.yaml │ │ ├── Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml │ │ ├── Qwen3-0.6B-FP8.yaml │ │ └── models-small.txt │ │ ├── conftest.py │ │ ├── gsm8k_eval.py │ │ └── test_gsm8k_correctness.py ├── fastsafetensors_loader │ ├── __init__.py │ ├── test_fastsafetensors_loader.py │ └── test_weight_utils.py ├── kernels │ ├── __init__.py │ ├── allclose_default.py │ ├── attention │ │ ├── conftest.py │ │ ├── test_aiter_flash_attn.py │ │ ├── test_attention.py │ │ ├── test_attention_selector.py │ │ ├── test_cache.py │ │ ├── test_cascade_flash_attn.py │ │ ├── test_cutlass_mla_decode.py │ │ ├── test_flash_attn.py │ │ ├── test_flashinfer.py │ │ ├── test_flashinfer_mla_decode.py │ │ ├── test_flashinfer_trtllm_attention.py │ │ ├── test_flashmla.py │ │ ├── test_lightning_attn.py │ │ ├── test_merge_attn_states.py │ │ ├── test_mha_attn.py │ │ ├── test_mla_decode_cpu.py │ │ ├── test_prefix_prefill.py │ │ ├── test_rocm_attention_selector.py │ │ ├── test_triton_decode_attention.py │ │ └── test_triton_unified_attention.py │ ├── core │ │ ├── test_activation.py │ │ ├── test_fused_quant_layernorm.py │ │ ├── test_layernorm.py │ │ ├── test_mrope.py │ │ ├── test_opcheck.py │ │ ├── test_permute_cols.py │ │ ├── test_pos_encoding.py │ │ ├── test_rotary_embedding.py │ │ └── test_uva.py │ ├── mamba │ │ ├── test_causal_conv1d.py │ │ ├── test_mamba_mixer2.py │ │ ├── test_mamba_ssm.py │ │ └── test_mamba_ssm_ssd.py │ ├── moe │ │ ├── __init__.py │ │ ├── modular_kernel_tools │ │ │ ├── __init__.py │ │ │ ├── cli_args.py │ │ │ ├── common.py │ │ │ ├── make_feature_matrix.py │ │ │ ├── mk_objects.py │ │ │ ├── parallel_utils.py │ │ │ └── profile_modular_kernel.py │ │ ├── parallel_utils.py │ │ ├── test_batched_deepgemm.py │ │ ├── test_batched_moe.py │ │ ├── test_block_fp8.py │ │ ├── test_block_int8.py │ │ ├── test_count_expert_num_tokens.py │ │ ├── test_cutlass_grouped_gemm.py │ │ ├── test_cutlass_moe.py │ │ ├── test_deepep_deepgemm_moe.py │ │ ├── test_deepep_moe.py │ │ ├── test_deepgemm.py │ │ ├── test_flashinfer.py │ │ ├── test_flashinfer_moe.py │ │ ├── test_gpt_oss_triton_kernels.py │ │ ├── test_grouped_topk.py │ │ ├── test_modular_kernel_combinations.py │ │ ├── test_moe.py │ │ ├── test_moe_align_block_size.py │ │ ├── test_moe_permute_unpermute.py │ │ ├── test_mxfp4_moe.py │ │ ├── test_nvfp4_moe.py │ │ ├── test_pplx_cutlass_moe.py │ │ ├── test_pplx_moe.py │ │ ├── test_rocm_aiter_topk.py │ │ ├── test_silu_mul_fp8_quant_deep_gemm.py │ │ ├── test_triton_moe_ptpc_fp8.py │ │ └── utils.py │ ├── quant_utils.py │ ├── quantization │ │ ├── nvfp4_utils.py │ │ ├── test_allspark_gemm.py │ │ ├── test_awq.py │ │ ├── test_awq_triton.py │ │ ├── test_block_fp8.py │ │ ├── test_block_int8.py │ │ ├── test_cutlass_2of4_sparse.py │ │ ├── test_cutlass_scaled_mm.py │ │ ├── test_cutlass_w4a8.py │ │ ├── test_flashinfer_nvfp4_scaled_mm.py │ │ ├── test_flashinfer_scaled_mm.py │ │ ├── test_fp8_quant.py │ │ ├── test_fp8_quant_group.py │ │ ├── test_ggml.py │ │ ├── test_gguf.py │ │ ├── test_gptq.py │ │ ├── test_hadacore.py │ │ ├── test_int8_kernel.py │ │ ├── test_int8_quant.py │ │ ├── test_machete_mm.py │ │ ├── test_marlin_gemm.py │ │ ├── test_nvfp4_quant.py │ │ ├── test_nvfp4_scaled_mm.py │ │ ├── test_per_token_group_quant.py │ │ ├── test_rocm_skinny_gemms.py │ │ ├── test_silu_mul_nvfp4_quant.py │ │ └── test_triton_scaled_mm.py │ ├── test_apply_repetition_penalties.py │ ├── test_flex_attention.py │ ├── test_fused_quant_activation.py │ ├── test_onednn.py │ ├── test_shuffle_rows.py │ ├── test_triton_flash_attention.py │ └── utils.py ├── kv_transfer │ ├── test_lookup_buffer.py │ ├── test_lookup_buffer.sh │ ├── test_module.py │ ├── test_send_recv.py │ └── test_send_recv.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_add_lora.py │ ├── test_chatglm3_tp.py │ ├── test_default_mm_loras.py │ ├── test_layers.py │ ├── test_llama_tp.py │ ├── test_llm_with_multi_loras.py │ ├── test_lora_checkpoints.py │ ├── test_lora_functions.py │ ├── test_lora_huggingface.py │ ├── test_lora_manager.py │ ├── test_minicpmv_tp.py │ ├── test_mixtral.py │ ├── test_peft_helper.py │ ├── test_punica_ops.py │ ├── test_quant_model.py │ ├── test_qwen2vl.py │ ├── test_resolver.py │ ├── test_transformers_model.py │ ├── test_utils.py │ ├── test_worker.py │ └── utils.py ├── mistral_tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_mistral_tool_calls.py │ └── utils.py ├── model_executor │ ├── __init__.py │ ├── conftest.py │ ├── model_loader │ │ ├── __init__.py │ │ └── test_registry.py │ ├── test_enabled_custom_ops.py │ ├── test_model_load_with_params.py │ └── test_weight_utils.py ├── models │ ├── __init__.py │ ├── fixtures │ │ ├── mistral_small_3_chat.json │ │ └── pixtral_chat.json │ ├── language │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_gemma.py │ │ │ ├── test_granite.py │ │ │ ├── test_hybrid.py │ │ │ ├── test_mistral.py │ │ │ └── test_phimoe.py │ │ ├── generation_ppl_test │ │ │ ├── __init__.py │ │ │ ├── ppl_utils.py │ │ │ ├── test_gemma.py │ │ │ ├── test_gpt.py │ │ │ └── test_qwen.py │ │ ├── pooling │ │ │ ├── __init__.py │ │ │ ├── embed_utils.py │ │ │ ├── test_auto_prefix_cache_support.py │ │ │ ├── test_classification.py │ │ │ ├── test_embedding.py │ │ │ ├── test_gritlm.py │ │ │ ├── test_mm_classifier_conversion.py │ │ │ ├── test_multilabel_classification_support.py │ │ │ ├── test_nomic_max_model_len.py │ │ │ ├── test_override_pooler_config.py │ │ │ ├── test_reward.py │ │ │ ├── test_scoring.py │ │ │ └── test_truncation_control.py │ │ └── pooling_mteb_test │ │ │ ├── __init__.py │ │ │ ├── mteb_utils.py │ │ │ ├── test_baai.py │ │ │ ├── test_bge_reranker_v2_gemma.py │ │ │ ├── test_cross_encoder.py │ │ │ ├── test_gte.py │ │ │ ├── test_intfloat.py │ │ │ ├── test_jina.py │ │ │ ├── test_mxbai_rerank.py │ │ │ ├── test_nomic.py │ │ │ ├── test_qwen3_reranker.py │ │ │ ├── test_snowflake_arctic_embed.py │ │ │ └── test_st_projector.py │ ├── multimodal │ │ ├── __init__.py │ │ ├── generation │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_granite_speech.py │ │ │ ├── test_interleaved.py │ │ │ ├── test_maverick.py │ │ │ ├── test_phi4_multimodal.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_pixtral.py │ │ │ ├── test_qwen2_vl.py │ │ │ ├── test_ultravox.py │ │ │ ├── test_voxtral.py │ │ │ ├── test_whisper.py │ │ │ └── vlm_utils │ │ │ │ ├── __init__.py │ │ │ │ ├── builders.py │ │ │ │ ├── case_filtering.py │ │ │ │ ├── core.py │ │ │ │ ├── custom_inputs.py │ │ │ │ ├── model_utils.py │ │ │ │ ├── runners.py │ │ │ │ └── types.py │ │ ├── pooling │ │ │ ├── __init__.py │ │ │ ├── test_dse_qwen2_vl.py │ │ │ ├── test_intern_vit.py │ │ │ ├── test_jinavl_reranker.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_prithvi_mae.py │ │ │ └── test_radio.py │ │ ├── processing │ │ │ ├── __init__.py │ │ │ ├── test_common.py │ │ │ ├── test_glm4_1v.py │ │ │ ├── test_h2ovl.py │ │ │ ├── test_idefics3.py │ │ │ ├── test_internvl.py │ │ │ ├── test_llama4.py │ │ │ ├── test_llava_next.py │ │ │ ├── test_llava_onevision.py │ │ │ ├── test_minimax_vl_01.py │ │ │ ├── test_mllama4.py │ │ │ ├── test_nemotron_vl.py │ │ │ ├── test_phi3v.py │ │ │ ├── test_phi4mm.py │ │ │ ├── test_qwen2_vl.py │ │ │ ├── test_smolvlm.py │ │ │ ├── test_tensor_schema.py │ │ │ └── test_transformers.py │ │ └── test_mapping.py │ ├── quantization │ │ ├── __init__.py │ │ ├── test_awq.py │ │ ├── test_bitblas.py │ │ ├── test_bitsandbytes.py │ │ ├── test_fp8.py │ │ ├── test_gguf.py │ │ ├── test_gptq_bitblas.py │ │ ├── test_gptq_marlin.py │ │ ├── test_gptq_marlin_24.py │ │ ├── test_modelopt.py │ │ ├── test_mxfp4.py │ │ └── test_nvfp4.py │ ├── registry.py │ ├── test_initialization.py │ ├── test_oot_registration.py │ ├── test_registry.py │ ├── test_terratorch.py │ ├── test_transformers.py │ ├── test_utils.py │ ├── test_vision.py │ └── utils.py ├── multimodal │ ├── __init__.py │ ├── assets │ │ ├── image1.png │ │ ├── image2.png │ │ └── rgba.png │ ├── test_cache.py │ ├── test_hasher.py │ ├── test_image.py │ ├── test_inputs.py │ ├── test_processing.py │ ├── test_registry.py │ ├── test_utils.py │ ├── test_video.py │ └── utils.py ├── plugins │ ├── lora_resolvers │ │ ├── __init__.py │ │ └── test_filesystem_resolver.py │ ├── prithvi_io_processor_plugin │ │ ├── prithvi_io_processor │ │ │ ├── __init__.py │ │ │ ├── prithvi_processor.py │ │ │ └── types.py │ │ └── setup.py │ ├── vllm_add_dummy_model │ │ ├── setup.py │ │ └── vllm_add_dummy_model │ │ │ ├── __init__.py │ │ │ ├── my_gemma_embedding.py │ │ │ ├── my_llava.py │ │ │ └── my_opt.py │ └── vllm_add_dummy_platform │ │ ├── setup.py │ │ └── vllm_add_dummy_platform │ │ ├── __init__.py │ │ ├── dummy_attention_backend.py │ │ ├── dummy_custom_ops.py │ │ └── dummy_platform.py ├── plugins_tests │ ├── test_io_processor_plugins.py │ ├── test_platform_plugins.py │ └── test_scheduler_plugins.py ├── prompts │ ├── example.txt │ └── summary.txt ├── quantization │ ├── __init__.py │ ├── reference_mxfp4.py │ ├── test_auto_round.py │ ├── test_compressed_tensors.py │ ├── test_configs.py │ ├── test_cpu_offload.py │ ├── test_experts_int8.py │ ├── test_fp8.py │ ├── test_gptq_dynamic.py │ ├── test_ipex_quant.py │ ├── test_lm_head.py │ ├── test_modelopt.py │ ├── test_ptpc_fp8.py │ ├── test_quark.py │ ├── test_register_quantization_config.py │ ├── test_rtn.py │ ├── test_torchao.py │ └── utils.py ├── reasoning │ ├── __init__.py │ ├── test_deepseekr1_reasoning_parser.py │ ├── test_granite_reasoning_parser.py │ ├── test_hunyuan_reasoning_parser.py │ ├── test_mistral_reasoning_parser.py │ ├── test_qwen3_reasoning_parser.py │ └── utils.py ├── runai_model_streamer_test │ ├── __init__.py │ ├── test_runai_model_streamer_loader.py │ ├── test_runai_utils.py │ └── test_weight_utils.py ├── samplers │ ├── __init__.py │ ├── test_beam_search.py │ ├── test_ignore_eos.py │ ├── test_no_bad_words.py │ └── test_ranks.py ├── speculative_decoding │ └── speculators │ │ └── test_eagle3.py ├── standalone_tests │ ├── lazy_imports.py │ ├── python_only_compile.sh │ └── pytorch_nightly_dependency.sh ├── system_messages │ └── sonnet3.5_nov2024.txt ├── tensorizer_loader │ ├── __init__.py │ ├── conftest.py │ └── test_tensorizer.py ├── test_config.py ├── test_embedded_commit.py ├── test_inputs.py ├── test_logger.py ├── test_outputs.py ├── test_pooling_params.py ├── test_regression.py ├── test_routing_simulator.py ├── test_sampling_params.py ├── test_scalartype.py ├── test_seed_behavior.py ├── test_sequence.py ├── test_sharded_state_loader.py ├── test_test.py ├── test_triton_utils.py ├── test_version.py ├── test_vllm_port.py ├── tokenization │ ├── __init__.py │ ├── test_cached_tokenizer.py │ ├── test_detokenize.py │ ├── test_do_lower_case.py │ ├── test_get_eos.py │ ├── test_mistral_tokenizer.py │ ├── test_tokenizer.py │ └── test_tokenizer_registry.py ├── tool_use │ ├── __init__.py │ ├── conftest.py │ ├── test_chat_completion_request_validations.py │ ├── test_chat_completions.py │ ├── test_glm4_moe_tool_parser.py │ ├── test_jamba_tool_parser.py │ ├── test_kimi_k2_tool_parser.py │ ├── test_minimax_tool_parser.py │ ├── test_openai_tool_parser.py │ ├── test_parallel_tool_calls.py │ ├── test_qwen3coder_tool_parser.py │ ├── test_seed_oss_tool_parser.py │ ├── test_tool_calls.py │ ├── test_tool_choice_required.py │ ├── test_xlam_tool_parser.py │ └── utils.py ├── tools │ ├── __init__.py │ └── test_config_validator.py ├── tpu │ ├── __init__.py │ ├── lora │ │ ├── __init__.py │ │ └── test_lora.py │ ├── test_compilation.py │ ├── test_custom_dispatcher.py │ ├── test_moe_pallas.py │ └── test_quantization_accuracy.py ├── transformers_utils │ ├── __init__.py │ └── test_config_parser_registry.py ├── utils.py ├── utils_ │ ├── __init__.py │ ├── test_tensor_schema.py │ └── test_utils.py ├── v1 │ ├── __init__.py │ ├── attention │ │ ├── test_attention_backends.py │ │ ├── test_attention_backends_selection.py │ │ ├── test_attention_splitting.py │ │ ├── test_chunked_local_attention.py │ │ ├── test_mla_backends.py │ │ └── utils.py │ ├── core │ │ ├── __init__.py │ │ ├── test_async_scheduler.py │ │ ├── test_encoder_cache_manager.py │ │ ├── test_kv_cache_utils.py │ │ ├── test_prefix_caching.py │ │ ├── test_scheduler.py │ │ ├── test_scheduler_e2e.py │ │ ├── test_single_type_kv_cache_manager.py │ │ └── utils.py │ ├── cudagraph │ │ ├── __init__.py │ │ ├── test_cudagraph_dispatch.py │ │ └── test_cudagraph_mode.py │ ├── e2e │ │ ├── __init__.py │ │ ├── test_cascade_attention.py │ │ ├── test_correctness_sliding_window.py │ │ ├── test_kv_sharing_fast_prefill.py │ │ ├── test_min_tokens.py │ │ └── test_spec_decode.py │ ├── engine │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_async_llm.py │ │ ├── test_engine_args.py │ │ ├── test_engine_core.py │ │ ├── test_engine_core_client.py │ │ ├── test_fast_incdec_prefix_err.py │ │ ├── test_llm_engine.py │ │ ├── test_output_processor.py │ │ ├── test_processor_multi_modal_uuids.py │ │ └── utils.py │ ├── entrypoints │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ └── test_struct_output_generate.py │ │ └── openai │ │ │ ├── responses │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_basic.py │ │ │ ├── test_image.py │ │ │ ├── test_stateful.py │ │ │ └── test_structured_output.py │ │ │ ├── test_chat_completion.py │ │ │ ├── test_completion.py │ │ │ ├── test_completion_with_image_embeds.py │ │ │ └── test_multi_api_servers.py │ ├── executor │ │ ├── __init__.py │ │ └── test_executor.py │ ├── kv_connector │ │ ├── __init__.py │ │ ├── nixl_integration │ │ │ ├── run_accuracy_test.sh │ │ │ ├── run_edge_case_test.sh │ │ │ ├── run_tpu_disagg_accuracy_test.sh │ │ │ ├── run_tpu_edge_case_test.sh │ │ │ ├── test_accuracy.py │ │ │ ├── test_disagg_accuracy.py │ │ │ ├── test_edge_cases.py │ │ │ └── toy_proxy_server.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ ├── test_multi_connector.py │ │ │ ├── test_nixl_connector.py │ │ │ ├── test_output_aggreagator.py │ │ │ ├── test_remote_decode_lifecycle.py │ │ │ ├── test_remote_prefill_lifecycle.py │ │ │ ├── test_shared_storage_connector.py │ │ │ └── utils.py │ ├── logits_processors │ │ ├── __init__.py │ │ ├── test_correctness.py │ │ ├── test_custom_offline.py │ │ ├── test_custom_online.py │ │ └── utils.py │ ├── metrics │ │ ├── test_engine_logger_apis.py │ │ └── test_ray_metrics.py │ ├── sample │ │ ├── __init__.py │ │ ├── test_logprobs.py │ │ ├── test_logprobs_e2e.py │ │ ├── test_rejection_sampler.py │ │ ├── test_sampler.py │ │ ├── test_sampling_params_e2e.py │ │ ├── test_topk_topp_sampler.py │ │ └── utils.py │ ├── shutdown │ │ ├── test_delete.py │ │ ├── test_forward_error.py │ │ ├── test_processor_error.py │ │ ├── test_startup_error.py │ │ └── utils.py │ ├── spec_decode │ │ ├── test_eagle.py │ │ ├── test_max_len.py │ │ ├── test_ngram.py │ │ └── test_tree_attention.py │ ├── structured_output │ │ ├── __init__.py │ │ └── test_utils.py │ ├── test_async_llm_dp.py │ ├── test_external_lb_dp.py │ ├── test_hybrid_lb_dp.py │ ├── test_internal_lb_dp.py │ ├── test_kv_sharing.py │ ├── test_metrics_reader.py │ ├── test_oracle.py │ ├── test_request.py │ ├── test_serial_utils.py │ ├── test_utils.py │ ├── tpu │ │ ├── __init__.py │ │ ├── test_basic.py │ │ ├── test_kv_cache_update_kernel.py │ │ ├── test_mha_attn.py │ │ ├── test_multimodal.py │ │ ├── test_pallas.py │ │ ├── test_perf.py │ │ ├── test_sampler.py │ │ ├── test_spmd_model_weight_loading.py │ │ ├── test_topk_topp_sampler.py │ │ ├── test_tpu_int8.py │ │ ├── test_tpu_qkv_linear.py │ │ └── worker │ │ │ ├── __init__.py │ │ │ └── test_tpu_model_runner.py │ ├── tracing │ │ ├── __init__.py │ │ └── test_tracing.py │ └── worker │ │ ├── __init__.py │ │ ├── test_gpu_input_batch.py │ │ └── test_gpu_model_runner.py ├── vllm_test_utils │ ├── setup.py │ └── vllm_test_utils │ │ ├── __init__.py │ │ ├── blame.py │ │ └── monitor.py └── weight_loading │ ├── models-large.txt │ ├── models.txt │ ├── run_model_weight_loading_test.sh │ └── test_weight_loading.py ├── tools ├── check_init_lazy_imports.py ├── check_pickle_imports.py ├── check_repo.sh ├── check_spdx_header.py ├── check_triton_import.py ├── enforce_regex_import.py ├── ep_kernels │ ├── README.md │ ├── configure_system_drivers.sh │ ├── elastic_ep │ │ ├── eep_nvshmem.patch │ │ └── install_eep_libraries.sh │ └── install_python_libraries.sh ├── generate_cmake_presets.py ├── generate_nightly_torch_test.py ├── install_deepgemm.sh ├── install_gdrcopy.sh ├── mypy.sh ├── png-lint.sh ├── profiler │ ├── nsys_profile_tools │ │ ├── README.md │ │ ├── gputrc2graph.py │ │ ├── images │ │ │ ├── csv1.png │ │ │ ├── html.png │ │ │ └── html_tbl.png │ │ └── vllm_engine_model.json │ ├── print_layerwise_table.py │ └── visualize_layerwise_profile.py ├── report_build_time_ninja.py ├── shellcheck.sh ├── update-dockerfile-graph.sh └── validate_config.py ├── use_existing_torch.py └── vllm ├── __init__.py ├── _bc_linter.py ├── _custom_ops.py ├── _ipex_ops.py ├── assets ├── __init__.py ├── audio.py ├── base.py ├── image.py └── video.py ├── attention ├── __init__.py ├── backends │ ├── __init__.py │ ├── abstract.py │ ├── differential_flash_attn.py │ ├── dual_chunk_flash_attn.py │ ├── flash_attn.py │ ├── flashmla.py │ ├── mla │ │ ├── __init__.py │ │ └── common.py │ ├── placeholder_attn.py │ ├── rocm_aiter_mla.py │ ├── rocm_flash_attn.py │ ├── triton_mla.py │ ├── utils.py │ └── xformers.py ├── layer.py ├── layers │ ├── __init__.py │ ├── chunked_local_attention.py │ ├── cross_attention.py │ └── encoder_only_attention.py ├── ops │ ├── __init__.py │ ├── chunked_prefill_paged_decode.py │ ├── common.py │ ├── flashmla.py │ ├── merge_attn_states.py │ ├── paged_attn.py │ ├── pallas_kv_cache_update.py │ ├── prefix_prefill.py │ ├── rocm_aiter_mla.py │ ├── rocm_aiter_paged_attn.py │ ├── triton_decode_attention.py │ ├── triton_flash_attention.py │ ├── triton_merge_attn_states.py │ └── triton_unified_attention.py ├── selector.py └── utils │ ├── __init__.py │ ├── fa_utils.py │ └── kv_sharing_utils.py ├── beam_search.py ├── benchmarks ├── __init__.py ├── datasets.py ├── latency.py ├── lib │ ├── __init__.py │ ├── endpoint_request_func.py │ ├── ready_checker.py │ └── utils.py ├── serve.py └── throughput.py ├── collect_env.py ├── compilation ├── __init__.py ├── activation_quant_fusion.py ├── backends.py ├── base_static_graph.py ├── collective_fusion.py ├── compiler_interface.py ├── counter.py ├── cuda_graph.py ├── cuda_piecewise_backend.py ├── decorators.py ├── fix_functionalization.py ├── fusion.py ├── fusion_attn.py ├── fx_utils.py ├── inductor_pass.py ├── monitor.py ├── multi_output_match.py ├── noop_elimination.py ├── pass_manager.py ├── sequence_parallelism.py ├── torch25_custom_graph_pass.py ├── vllm_inductor_pass.py └── wrapper.py ├── config ├── __init__.py ├── cache.py ├── compilation.py ├── kv_events.py ├── kv_transfer.py ├── load.py ├── lora.py ├── multimodal.py ├── parallel.py ├── scheduler.py ├── speculative.py └── utils.py ├── connections.py ├── core ├── __init__.py ├── block │ ├── __init__.py │ ├── block_table.py │ ├── common.py │ ├── cpu_gpu_block_allocator.py │ ├── interfaces.py │ ├── naive_block.py │ ├── prefix_caching_block.py │ └── utils.py ├── block_manager.py ├── evictor.py ├── interfaces.py ├── placeholder_block_space_manager.py └── scheduler.py ├── device_allocator ├── __init__.py └── cumem.py ├── distributed ├── __init__.py ├── communication_op.py ├── device_communicators │ ├── __init__.py │ ├── all2all.py │ ├── all_reduce_utils.py │ ├── base_device_communicator.py │ ├── cpu_communicator.py │ ├── cuda_communicator.py │ ├── cuda_wrapper.py │ ├── custom_all_reduce.py │ ├── pynccl.py │ ├── pynccl_wrapper.py │ ├── quick_all_reduce.py │ ├── ray_communicator.py │ ├── shm_broadcast.py │ ├── shm_object_storage.py │ ├── symm_mem.py │ ├── tpu_communicator.py │ └── xpu_communicator.py ├── eplb │ ├── __init__.py │ ├── eplb_state.py │ ├── rebalance_algo.py │ └── rebalance_execute.py ├── kv_events.py ├── kv_transfer │ ├── README.md │ ├── __init__.py │ ├── disagg_prefill_workflow.jpg │ ├── kv_connector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── factory.py │ │ ├── utils.py │ │ └── v1 │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── lmcache_connector.py │ │ │ ├── multi_connector.py │ │ │ ├── nixl_connector.py │ │ │ ├── p2p │ │ │ ├── __init__.py │ │ │ ├── p2p_nccl_connector.py │ │ │ ├── p2p_nccl_engine.py │ │ │ └── tensor_memory_pool.py │ │ │ └── shared_storage_connector.py │ ├── kv_lookup_buffer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_store.py │ │ └── simple_buffer.py │ ├── kv_pipe │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mooncake_pipe.py │ │ └── pynccl_pipe.py │ └── kv_transfer_state.py ├── parallel_state.py ├── tpu_distributed_utils.py └── utils.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── async_timeout.py ├── llm_engine.py ├── metrics.py ├── metrics_types.py ├── output_processor │ ├── __init__.py │ ├── interfaces.py │ ├── single_step.py │ └── stop_checker.py └── protocol.py ├── entrypoints ├── __init__.py ├── api_server.py ├── chat_utils.py ├── cli │ ├── __init__.py │ ├── benchmark │ │ ├── __init__.py │ │ ├── base.py │ │ ├── latency.py │ │ ├── main.py │ │ ├── serve.py │ │ └── throughput.py │ ├── collect_env.py │ ├── main.py │ ├── openai.py │ ├── run_batch.py │ ├── serve.py │ └── types.py ├── constants.py ├── context.py ├── harmony_utils.py ├── launcher.py ├── llm.py ├── logger.py ├── openai │ ├── __init__.py │ ├── api_server.py │ ├── cli_args.py │ ├── logits_processors.py │ ├── protocol.py │ ├── run_batch.py │ ├── serving_chat.py │ ├── serving_classification.py │ ├── serving_completion.py │ ├── serving_embedding.py │ ├── serving_engine.py │ ├── serving_models.py │ ├── serving_pooling.py │ ├── serving_responses.py │ ├── serving_score.py │ ├── serving_tokenization.py │ ├── serving_transcription.py │ ├── speech_to_text.py │ └── tool_parsers │ │ ├── __init__.py │ │ ├── abstract_tool_parser.py │ │ ├── deepseekv31_tool_parser.py │ │ ├── deepseekv3_tool_parser.py │ │ ├── glm4_moe_tool_parser.py │ │ ├── granite_20b_fc_tool_parser.py │ │ ├── granite_tool_parser.py │ │ ├── hermes_tool_parser.py │ │ ├── hunyuan_a13b_tool_parser.py │ │ ├── internlm2_tool_parser.py │ │ ├── jamba_tool_parser.py │ │ ├── kimi_k2_tool_parser.py │ │ ├── llama4_pythonic_tool_parser.py │ │ ├── llama_tool_parser.py │ │ ├── minimax_tool_parser.py │ │ ├── mistral_tool_parser.py │ │ ├── openai_tool_parser.py │ │ ├── phi4mini_tool_parser.py │ │ ├── pythonic_tool_parser.py │ │ ├── qwen3coder_tool_parser.py │ │ ├── seed_oss_tool_parser.py │ │ ├── step3_tool_parser.py │ │ ├── utils.py │ │ └── xlam_tool_parser.py ├── renderer.py ├── score_utils.py ├── ssl.py ├── tool.py ├── tool_server.py └── utils.py ├── env_override.py ├── envs.py ├── executor ├── __init__.py ├── executor_base.py ├── mp_distributed_executor.py ├── msgspec_utils.py ├── multiproc_worker_utils.py ├── ray_distributed_executor.py ├── ray_utils.py └── uniproc_executor.py ├── forward_context.py ├── inputs ├── __init__.py ├── data.py ├── parse.py ├── preprocess.py └── registry.py ├── logger.py ├── logging_utils ├── __init__.py ├── dump_input.py └── formatter.py ├── logits_process.py ├── logprobs.py ├── lora ├── __init__.py ├── layers │ ├── __init__.py │ ├── base.py │ ├── base_linear.py │ ├── column_parallel_linear.py │ ├── logits_processor.py │ ├── qkv_x_parallel_linear.py │ ├── replicated_linear.py │ ├── row_parallel_linear.py │ ├── utils.py │ └── vocal_parallel_embedding.py ├── lora.py ├── models.py ├── ops │ ├── __init__.py │ ├── ipex_ops │ │ ├── __init__.py │ │ └── lora_ops.py │ ├── torch_ops │ │ ├── __init__.py │ │ └── lora_ops.py │ ├── triton_ops │ │ ├── __init__.py │ │ ├── kernel_utils.py │ │ ├── lora_expand_op.py │ │ ├── lora_kernel_metadata.py │ │ ├── lora_shrink_op.py │ │ └── utils.py │ └── xla_ops │ │ ├── __init__.py │ │ └── lora_ops.py ├── peft_helper.py ├── punica_wrapper │ ├── __init__.py │ ├── punica_base.py │ ├── punica_cpu.py │ ├── punica_gpu.py │ ├── punica_selector.py │ ├── punica_tpu.py │ ├── punica_xpu.py │ └── utils.py ├── request.py ├── resolver.py ├── utils.py └── worker_manager.py ├── model_executor ├── __init__.py ├── custom_op.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention_layer_base.py │ ├── fla │ │ ├── __init__.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── chunk.py │ │ │ ├── chunk_delta_h.py │ │ │ ├── chunk_o.py │ │ │ ├── chunk_scaled_dot_kkt.py │ │ │ ├── cumsum.py │ │ │ ├── fused_recurrent.py │ │ │ ├── index.py │ │ │ ├── l2norm.py │ │ │ ├── layernorm_guard.py │ │ │ ├── op.py │ │ │ ├── solve_tril.py │ │ │ ├── utils.py │ │ │ └── wy_fast.py │ ├── fused_moe │ │ ├── __init__.py │ │ ├── batched_deep_gemm_moe.py │ │ ├── batched_triton_or_deep_gemm_moe.py │ │ ├── config.py │ │ ├── configs │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json │ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json │ │ │ ├── E=16,N=1024,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=1024,device_name=NVIDIA_B200.json │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=160,N=192,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=160,N=320,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json │ │ │ ├── E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=512,N=128,device_name=NVIDIA_B200.json │ │ │ ├── E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json │ │ │ ├── E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=512,N=128,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=512,N=128,device_name=NVIDIA_H200.json │ │ │ ├── E=512,N=256,device_name=NVIDIA_B200.json │ │ │ ├── E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json │ │ │ ├── E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=512,N=256,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=512,N=256,device_name=NVIDIA_H200.json │ │ │ ├── E=512,N=512,device_name=NVIDIA_B200.json │ │ │ ├── E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json │ │ │ ├── E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=512,N=512,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=512,N=512,device_name=NVIDIA_H200.json │ │ │ ├── E=512,N=64,device_name=NVIDIA_B200.json │ │ │ ├── E=512,N=64,device_name=NVIDIA_H20-3e.json │ │ │ ├── E=512,N=64,device_name=NVIDIA_H200.json │ │ │ ├── E=60,N=1408,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=176,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=352,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=60,N=704,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=3072,device_name=NVIDIA_H20.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=384,device_name=NVIDIA_H20.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json │ │ │ ├── E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json │ │ │ ├── E=64,N=768,device_name=NVIDIA_H20.json │ │ │ ├── E=64,N=896,device_name=NVIDIA_H20.json │ │ │ ├── E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=16384,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json │ │ │ └── README │ │ ├── cpu_fused_moe.py │ │ ├── cutlass_moe.py │ │ ├── deep_gemm_moe.py │ │ ├── deep_gemm_utils.py │ │ ├── deepep_ht_prepare_finalize.py │ │ ├── deepep_ll_prepare_finalize.py │ │ ├── flashinfer_cutlass_moe.py │ │ ├── flashinfer_cutlass_prepare_finalize.py │ │ ├── flashinfer_trtllm_moe.py │ │ ├── fused_batched_moe.py │ │ ├── fused_marlin_moe.py │ │ ├── fused_moe.py │ │ ├── gpt_oss_triton_kernels_moe.py │ │ ├── layer.py │ │ ├── modular_kernel.py │ │ ├── moe_align_block_size.py │ │ ├── moe_pallas.py │ │ ├── moe_permute_unpermute.py │ │ ├── moe_torch_iterative.py │ │ ├── pplx_prepare_finalize.py │ │ ├── prepare_finalize.py │ │ ├── rocm_aiter_fused_moe.py │ │ ├── routing_simulator.py │ │ ├── topk_weight_and_reduce.py │ │ ├── triton_deep_gemm_moe.py │ │ ├── trtllm_moe.py │ │ └── utils.py │ ├── layernorm.py │ ├── lightning_attn.py │ ├── linear.py │ ├── logits_processor.py │ ├── mamba │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── linear_attn.py │ │ ├── mamba2_metadata.py │ │ ├── mamba_mixer.py │ │ ├── mamba_mixer2.py │ │ ├── mamba_utils.py │ │ ├── ops │ │ │ ├── __init__.py │ │ │ ├── causal_conv1d.py │ │ │ ├── layernorm_gated.py │ │ │ ├── mamba_ssm.py │ │ │ ├── ssd_bmm.py │ │ │ ├── ssd_chunk_scan.py │ │ │ ├── ssd_chunk_state.py │ │ │ ├── ssd_combined.py │ │ │ └── ssd_state_passing.py │ │ └── short_conv.py │ ├── mla.py │ ├── pooler.py │ ├── quantization │ │ ├── __init__.py │ │ ├── auto_round.py │ │ ├── awq.py │ │ ├── awq_marlin.py │ │ ├── awq_triton.py │ │ ├── base_config.py │ │ ├── bitblas.py │ │ ├── bitsandbytes.py │ │ ├── compressed_tensors │ │ │ ├── __init__.py │ │ │ ├── compressed_tensors.py │ │ │ ├── compressed_tensors_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── compressed_tensors_24.py │ │ │ │ ├── compressed_tensors_scheme.py │ │ │ │ ├── compressed_tensors_w4a16_24.py │ │ │ │ ├── compressed_tensors_w4a16_nvfp4.py │ │ │ │ ├── compressed_tensors_w4a4_nvfp4.py │ │ │ │ ├── compressed_tensors_w4a8_fp8.py │ │ │ │ ├── compressed_tensors_w4a8_int.py │ │ │ │ ├── compressed_tensors_w8a16_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_fp8.py │ │ │ │ ├── compressed_tensors_w8a8_int8.py │ │ │ │ └── compressed_tensors_wNa16.py │ │ │ ├── transform │ │ │ │ ├── linear.py │ │ │ │ ├── module.py │ │ │ │ ├── schemes │ │ │ │ │ └── linear_qutlass_nvfp4.py │ │ │ │ └── utils.py │ │ │ ├── triton_scaled_mm.py │ │ │ └── utils.py │ │ ├── deepgemm.py │ │ ├── deepspeedfp.py │ │ ├── experts_int8.py │ │ ├── fbgemm_fp8.py │ │ ├── fp8.py │ │ ├── gguf.py │ │ ├── gptq.py │ │ ├── gptq_bitblas.py │ │ ├── gptq_marlin.py │ │ ├── gptq_marlin_24.py │ │ ├── hqq_marlin.py │ │ ├── inc.py │ │ ├── input_quant_fp8.py │ │ ├── ipex_quant.py │ │ ├── kernels │ │ │ ├── __init__.py │ │ │ ├── mixed_precision │ │ │ │ ├── MPLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── allspark.py │ │ │ │ ├── bitblas.py │ │ │ │ ├── conch.py │ │ │ │ ├── cutlass.py │ │ │ │ ├── dynamic_4bit.py │ │ │ │ ├── exllama.py │ │ │ │ ├── machete.py │ │ │ │ └── marlin.py │ │ │ └── scaled_mm │ │ │ │ ├── ScaledMMLinearKernel.py │ │ │ │ ├── __init__.py │ │ │ │ ├── aiter.py │ │ │ │ ├── cpu.py │ │ │ │ ├── cutlass.py │ │ │ │ ├── triton.py │ │ │ │ └── xla.py │ │ ├── kv_cache.py │ │ ├── modelopt.py │ │ ├── moe_wna16.py │ │ ├── mxfp4.py │ │ ├── petit.py │ │ ├── ptpc_fp8.py │ │ ├── quark │ │ │ ├── __init__.py │ │ │ ├── quark.py │ │ │ ├── quark_moe.py │ │ │ ├── schemes │ │ │ │ ├── __init__.py │ │ │ │ ├── quark_scheme.py │ │ │ │ ├── quark_w4a4_mxfp4.py │ │ │ │ ├── quark_w8a8_fp8.py │ │ │ │ └── quark_w8a8_int8.py │ │ │ └── utils.py │ │ ├── rtn.py │ │ ├── schema.py │ │ ├── torchao.py │ │ ├── tpu_int8.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── allspark_utils.py │ │ │ ├── bitblas_utils.py │ │ │ ├── configs │ │ │ ├── N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ ├── N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json │ │ │ └── README.md │ │ │ ├── flashinfer_fp4_moe.py │ │ │ ├── flashinfer_utils.py │ │ │ ├── fp8_utils.py │ │ │ ├── gptq_utils.py │ │ │ ├── int8_utils.py │ │ │ ├── layer_utils.py │ │ │ ├── machete_utils.py │ │ │ ├── marlin_utils.py │ │ │ ├── marlin_utils_fp4.py │ │ │ ├── marlin_utils_fp8.py │ │ │ ├── marlin_utils_test.py │ │ │ ├── marlin_utils_test_24.py │ │ │ ├── mxfp4_utils.py │ │ │ ├── mxfp8_utils.py │ │ │ ├── nvfp4_emulation_utils.py │ │ │ ├── nvfp4_moe_support.py │ │ │ ├── petit_utils.py │ │ │ ├── quant_utils.py │ │ │ └── w8a8_utils.py │ ├── resampler.py │ ├── rotary_embedding │ │ ├── __init__.py │ │ ├── base.py │ │ ├── common.py │ │ ├── deepseek_scaling_rope.py │ │ ├── dual_chunk_rope.py │ │ ├── dynamic_ntk_alpha_rope.py │ │ ├── dynamic_ntk_scaling_rope.py │ │ ├── ernie45_vl_rope.py │ │ ├── linear_scaling_rope.py │ │ ├── llama3_rope.py │ │ ├── llama4_vision_rope.py │ │ ├── mrope.py │ │ ├── ntk_scaling_rope.py │ │ ├── phi3_long_rope_scaled_rope.py │ │ └── yarn_scaling_rope.py │ ├── sampler.py │ ├── shared_fused_moe │ │ ├── __init__.py │ │ └── shared_fused_moe.py │ ├── utils.py │ └── vocab_parallel_embedding.py ├── model_loader │ ├── __init__.py │ ├── base_loader.py │ ├── bitsandbytes_loader.py │ ├── default_loader.py │ ├── dummy_loader.py │ ├── gguf_loader.py │ ├── runai_streamer_loader.py │ ├── sharded_state_loader.py │ ├── tensorizer.py │ ├── tensorizer_loader.py │ ├── tpu.py │ ├── utils.py │ └── weight_utils.py ├── models │ ├── __init__.py │ ├── adapters.py │ ├── aimv2.py │ ├── apertus.py │ ├── arcee.py │ ├── arctic.py │ ├── aria.py │ ├── aya_vision.py │ ├── baichuan.py │ ├── bailing_moe.py │ ├── bamba.py │ ├── bert.py │ ├── bert_with_rope.py │ ├── blip.py │ ├── blip2.py │ ├── bloom.py │ ├── chameleon.py │ ├── chatglm.py │ ├── clip.py │ ├── cohere2_vision.py │ ├── commandr.py │ ├── config.py │ ├── constant_size_cache.py │ ├── dbrx.py │ ├── deepseek.py │ ├── deepseek_eagle.py │ ├── deepseek_mtp.py │ ├── deepseek_v2.py │ ├── deepseek_vl2.py │ ├── dots1.py │ ├── ernie45.py │ ├── ernie45_moe.py │ ├── ernie45_vl.py │ ├── ernie45_vl_moe.py │ ├── ernie_mtp.py │ ├── exaone.py │ ├── exaone4.py │ ├── fairseq2_llama.py │ ├── falcon.py │ ├── falcon_h1.py │ ├── fuyu.py │ ├── gemma.py │ ├── gemma2.py │ ├── gemma3.py │ ├── gemma3_mm.py │ ├── gemma3n.py │ ├── gemma3n_mm.py │ ├── glm.py │ ├── glm4.py │ ├── glm4_1v.py │ ├── glm4_moe.py │ ├── glm4_moe_mtp.py │ ├── glm4v.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── gpt_oss.py │ ├── granite.py │ ├── granite_speech.py │ ├── granitemoe.py │ ├── granitemoehybrid.py │ ├── granitemoeshared.py │ ├── gritlm.py │ ├── grok1.py │ ├── h2ovl.py │ ├── hunyuan_v1.py │ ├── hyperclovax_vision.py │ ├── idefics2_vision_model.py │ ├── idefics3.py │ ├── interfaces.py │ ├── interfaces_base.py │ ├── intern_vit.py │ ├── internlm2.py │ ├── internlm2_ve.py │ ├── interns1.py │ ├── interns1_vit.py │ ├── internvl.py │ ├── jais.py │ ├── jamba.py │ ├── jina_vl.py │ ├── keye.py │ ├── keye_vl1_5.py │ ├── kimi_vl.py │ ├── lfm2.py │ ├── llama.py │ ├── llama4.py │ ├── llama4_eagle.py │ ├── llama_eagle.py │ ├── llama_eagle3.py │ ├── llava.py │ ├── llava_next.py │ ├── llava_next_video.py │ ├── llava_onevision.py │ ├── mamba.py │ ├── mamba2.py │ ├── mamba_cache.py │ ├── medusa.py │ ├── midashenglm.py │ ├── mimo.py │ ├── mimo_mtp.py │ ├── minicpm.py │ ├── minicpm3.py │ ├── minicpm_eagle.py │ ├── minicpmo.py │ ├── minicpmv.py │ ├── minimax_cache.py │ ├── minimax_text_01.py │ ├── minimax_vl_01.py │ ├── mistral3.py │ ├── mixtral.py │ ├── mllama4.py │ ├── mlp_speculator.py │ ├── modernbert.py │ ├── module_mapping.py │ ├── molmo.py │ ├── moonvit.py │ ├── motif.py │ ├── mpt.py │ ├── nano_nemotron_vl.py │ ├── nemotron.py │ ├── nemotron_h.py │ ├── nemotron_nas.py │ ├── nemotron_vl.py │ ├── nvlm_d.py │ ├── olmo.py │ ├── olmo2.py │ ├── olmoe.py │ ├── opt.py │ ├── orion.py │ ├── ovis.py │ ├── ovis2_5.py │ ├── paligemma.py │ ├── persimmon.py │ ├── phi.py │ ├── phi3.py │ ├── phi3v.py │ ├── phi4_multimodal.py │ ├── phi4flash.py │ ├── phi4mm.py │ ├── phi4mm_audio.py │ ├── phi4mm_utils.py │ ├── phimoe.py │ ├── pixtral.py │ ├── plamo2.py │ ├── qwen.py │ ├── qwen2.py │ ├── qwen2_5_omni_thinker.py │ ├── qwen2_5_vl.py │ ├── qwen2_audio.py │ ├── qwen2_moe.py │ ├── qwen2_rm.py │ ├── qwen2_vl.py │ ├── qwen3.py │ ├── qwen3_moe.py │ ├── qwen3_next.py │ ├── qwen3_next_mtp.py │ ├── qwen3_vl.py │ ├── qwen3_vl_moe.py │ ├── qwen_vl.py │ ├── radio.py │ ├── registry.py │ ├── roberta.py │ ├── rvl.py │ ├── seed_oss.py │ ├── siglip.py │ ├── siglip2navit.py │ ├── skyworkr1v.py │ ├── smolvlm.py │ ├── solar.py │ ├── stablelm.py │ ├── starcoder2.py │ ├── step3_text.py │ ├── step3_vl.py │ ├── swin.py │ ├── tarsier.py │ ├── telechat2.py │ ├── teleflm.py │ ├── terratorch.py │ ├── transformers.py │ ├── ultravox.py │ ├── utils.py │ ├── vision.py │ ├── voxtral.py │ ├── whisper.py │ └── zamba2.py ├── parameter.py ├── sampling_metadata.py ├── utils.py └── warmup │ ├── __init__.py │ ├── deep_gemm_warmup.py │ └── kernel_warmup.py ├── multimodal ├── __init__.py ├── audio.py ├── base.py ├── cache.py ├── hasher.py ├── image.py ├── inputs.py ├── parse.py ├── processing.py ├── profiling.py ├── registry.py ├── utils.py └── video.py ├── outputs.py ├── platforms ├── __init__.py ├── cpu.py ├── cuda.py ├── interface.py ├── rocm.py ├── tpu.py └── xpu.py ├── plugins ├── __init__.py ├── io_processors │ ├── __init__.py │ └── interface.py └── lora_resolvers │ ├── README.md │ ├── __init__.py │ └── filesystem_resolver.py ├── pooling_params.py ├── profiler ├── __init__.py ├── layerwise_profile.py └── utils.py ├── py.typed ├── ray ├── __init__.py ├── lazy_utils.py └── ray_env.py ├── reasoning ├── __init__.py ├── abs_reasoning_parsers.py ├── deepseek_r1_reasoning_parser.py ├── glm4_moe_reasoning_parser.py ├── gptoss_reasoning_parser.py ├── granite_reasoning_parser.py ├── hunyuan_a13b_reasoning_parser.py ├── mistral_reasoning_parser.py ├── qwen3_reasoning_parser.py └── step3_reasoning_parser.py ├── sampling_params.py ├── scalar_type.py ├── scripts.py ├── sequence.py ├── tasks.py ├── test_utils.py ├── third_party ├── __init__.py └── pynvml.py ├── tracing.py ├── transformers_utils ├── __init__.py ├── chat_templates │ ├── __init__.py │ ├── registry.py │ ├── template_basic.jinja │ ├── template_blip2.jinja │ ├── template_chatml.jinja │ ├── template_deepseek_vl2.jinja │ ├── template_fuyu.jinja │ └── template_minicpmv45.jinja ├── config.py ├── config_parser_base.py ├── configs │ ├── __init__.py │ ├── arctic.py │ ├── chatglm.py │ ├── deepseek_vl2.py │ ├── eagle.py │ ├── falcon.py │ ├── jais.py │ ├── kimi_vl.py │ ├── medusa.py │ ├── midashenglm.py │ ├── mistral.py │ ├── mlp_speculator.py │ ├── moonvit.py │ ├── nemotron.py │ ├── nemotron_h.py │ ├── nemotron_vl.py │ ├── olmo3.py │ ├── ovis.py │ ├── qwen3_next.py │ ├── radio.py │ ├── speculators │ │ ├── __init__.py │ │ ├── algos.py │ │ └── base.py │ ├── step3_vl.py │ └── ultravox.py ├── detokenizer.py ├── detokenizer_utils.py ├── dynamic_module.py ├── processor.py ├── processors │ ├── __init__.py │ ├── deepseek_vl2.py │ ├── ovis.py │ └── ovis2_5.py ├── runai_utils.py ├── s3_utils.py ├── tokenizer.py ├── tokenizer_base.py ├── tokenizers │ ├── __init__.py │ └── mistral.py └── utils.py ├── triton_utils ├── __init__.py └── importing.py ├── usage ├── __init__.py └── usage_lib.py ├── utils ├── __init__.py ├── deep_gemm.py ├── flashinfer.py ├── jsontree.py └── tensor_schema.py ├── v1 ├── __init__.py ├── attention │ ├── __init__.py │ └── backends │ │ ├── __init__.py │ │ ├── cpu_attn.py │ │ ├── flash_attn.py │ │ ├── flashinfer.py │ │ ├── flex_attention.py │ │ ├── gdn_attn.py │ │ ├── linear_attn.py │ │ ├── mamba1_attn.py │ │ ├── mamba2_attn.py │ │ ├── mamba_attn.py │ │ ├── mla │ │ ├── __init__.py │ │ ├── common.py │ │ ├── cutlass_mla.py │ │ ├── flashattn_mla.py │ │ ├── flashinfer_mla.py │ │ ├── flashmla.py │ │ ├── rocm_aiter_mla.py │ │ └── triton_mla.py │ │ ├── pallas.py │ │ ├── rocm_aiter_fa.py │ │ ├── short_conv_attn.py │ │ ├── tree_attn.py │ │ ├── triton_attn.py │ │ ├── utils.py │ │ └── xformers.py ├── core │ ├── __init__.py │ ├── block_pool.py │ ├── encoder_cache_manager.py │ ├── kv_cache_coordinator.py │ ├── kv_cache_manager.py │ ├── kv_cache_utils.py │ ├── sched │ │ ├── __init__.py │ │ ├── async_scheduler.py │ │ ├── interface.py │ │ ├── output.py │ │ ├── request_queue.py │ │ ├── scheduler.py │ │ └── utils.py │ └── single_type_kv_cache_manager.py ├── cudagraph_dispatcher.py ├── engine │ ├── __init__.py │ ├── async_llm.py │ ├── coordinator.py │ ├── core.py │ ├── core_client.py │ ├── detokenizer.py │ ├── exceptions.py │ ├── llm_engine.py │ ├── logprobs.py │ ├── output_processor.py │ ├── parallel_sampling.py │ ├── processor.py │ └── utils.py ├── executor │ ├── __init__.py │ ├── abstract.py │ ├── multiproc_executor.py │ ├── ray_distributed_executor.py │ └── utils.py ├── kv_cache_interface.py ├── metrics │ ├── __init__.py │ ├── loggers.py │ ├── prometheus.py │ ├── ray_wrappers.py │ ├── reader.py │ └── stats.py ├── outputs.py ├── pool │ ├── __init__.py │ └── metadata.py ├── request.py ├── sample │ ├── __init__.py │ ├── logits_processor │ │ ├── __init__.py │ │ ├── builtin.py │ │ ├── interface.py │ │ └── state.py │ ├── metadata.py │ ├── ops │ │ ├── __init__.py │ │ ├── bad_words.py │ │ ├── logprobs.py │ │ ├── penalties.py │ │ └── topk_topp_sampler.py │ ├── rejection_sampler.py │ ├── sampler.py │ └── tpu │ │ ├── __init__.py │ │ ├── metadata.py │ │ └── sampler.py ├── serial_utils.py ├── spec_decode │ ├── __init__.py │ ├── eagle.py │ ├── medusa.py │ ├── metadata.py │ ├── metrics.py │ ├── ngram_proposer.py │ └── utils.py ├── structured_output │ ├── __init__.py │ ├── backend_guidance.py │ ├── backend_lm_format_enforcer.py │ ├── backend_outlines.py │ ├── backend_types.py │ ├── backend_xgrammar.py │ ├── request.py │ └── utils.py ├── utils.py └── worker │ ├── __init__.py │ ├── block_table.py │ ├── cpu_model_runner.py │ ├── cpu_worker.py │ ├── gpu_input_batch.py │ ├── gpu_model_runner.py │ ├── gpu_ubatch_wrapper.py │ ├── gpu_worker.py │ ├── kv_connector_model_runner_mixin.py │ ├── lora_model_runner_mixin.py │ ├── tpu_input_batch.py │ ├── tpu_model_runner.py │ ├── tpu_worker.py │ ├── ubatch_splitting.py │ ├── ubatch_utils.py │ ├── ubatching.py │ ├── utils.py │ ├── worker_base.py │ ├── xpu_model_runner.py │ └── xpu_worker.py ├── version.py ├── vllm_flash_attn └── .gitkeep └── worker ├── __init__.py ├── cache_engine.py ├── model_runner.py ├── model_runner_base.py ├── worker.py └── worker_base.py /.buildkite/check-wheel-size.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/check-wheel-size.py -------------------------------------------------------------------------------- /.buildkite/generate_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/generate_index.py -------------------------------------------------------------------------------- /.buildkite/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/pyproject.toml -------------------------------------------------------------------------------- /.buildkite/release-pipeline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/release-pipeline.yaml -------------------------------------------------------------------------------- /.buildkite/scripts/ci-clean-log.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/scripts/ci-clean-log.sh -------------------------------------------------------------------------------- /.buildkite/scripts/rerun-test.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/scripts/rerun-test.sh -------------------------------------------------------------------------------- /.buildkite/scripts/run-benchmarks.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/scripts/run-benchmarks.sh -------------------------------------------------------------------------------- /.buildkite/scripts/tpu/run_bm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/scripts/tpu/run_bm.sh -------------------------------------------------------------------------------- /.buildkite/scripts/upload-wheels.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/scripts/upload-wheels.sh -------------------------------------------------------------------------------- /.buildkite/test-pipeline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.buildkite/test-pipeline.yaml -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.clang-format -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.coveragerc -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.dockerignore -------------------------------------------------------------------------------- /.gemini/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.gemini/config.yaml -------------------------------------------------------------------------------- /.github/.bc-linter.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/.bc-linter.yml -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/CODEOWNERS -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/FUNDING.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/300-usage.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/ISSUE_TEMPLATE/300-usage.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/750-RFC.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/ISSUE_TEMPLATE/750-RFC.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/ISSUE_TEMPLATE/config.yml -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/dependabot.yml -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/mergify.yml -------------------------------------------------------------------------------- /.github/scale-config.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/scale-config.yml -------------------------------------------------------------------------------- /.github/scripts/cleanup_pr_body.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/scripts/cleanup_pr_body.sh -------------------------------------------------------------------------------- /.github/workflows/bc-lint.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/bc-lint.yml -------------------------------------------------------------------------------- /.github/workflows/matchers/mypy.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/matchers/mypy.json -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/pre-commit.yml -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/scripts/build.sh -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/scripts/env.sh -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.github/workflows/stale.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.gitignore -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.markdownlint.yaml -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.readthedocs.yaml -------------------------------------------------------------------------------- /.shellcheckrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/.shellcheckrc -------------------------------------------------------------------------------- /.yapfignore: -------------------------------------------------------------------------------- 1 | collect_env.py 2 | vllm/model_executor/layers/fla/ops/*.py 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/CMakeLists.txt -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/DCO -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/README.md -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/RELEASE.md -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/SECURITY.md -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/README.md -------------------------------------------------------------------------------- /benchmarks/auto_tune/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/auto_tune/README.md -------------------------------------------------------------------------------- /benchmarks/auto_tune/auto_tune.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/auto_tune/auto_tune.sh -------------------------------------------------------------------------------- /benchmarks/backend_request_func.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/backend_request_func.py -------------------------------------------------------------------------------- /benchmarks/benchmark_block_pool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/benchmark_block_pool.py -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/benchmark_latency.py -------------------------------------------------------------------------------- /benchmarks/benchmark_serving.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/benchmark_serving.py -------------------------------------------------------------------------------- /benchmarks/benchmark_throughput.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/benchmark_throughput.py -------------------------------------------------------------------------------- /benchmarks/benchmark_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/benchmark_utils.py -------------------------------------------------------------------------------- /benchmarks/kernels/bench_fp8_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/bench_fp8_gemm.py -------------------------------------------------------------------------------- /benchmarks/kernels/benchmark_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/benchmark_lora.py -------------------------------------------------------------------------------- /benchmarks/kernels/benchmark_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/benchmark_moe.py -------------------------------------------------------------------------------- /benchmarks/kernels/benchmark_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/benchmark_rope.py -------------------------------------------------------------------------------- /benchmarks/kernels/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /benchmarks/kernels/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/utils.py -------------------------------------------------------------------------------- /benchmarks/kernels/weight_shapes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/kernels/weight_shapes.py -------------------------------------------------------------------------------- /benchmarks/multi_turn/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/multi_turn/README.md -------------------------------------------------------------------------------- /benchmarks/multi_turn/bench_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/multi_turn/bench_utils.py -------------------------------------------------------------------------------- /benchmarks/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/pyproject.toml -------------------------------------------------------------------------------- /benchmarks/sonnet.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/benchmarks/sonnet.txt -------------------------------------------------------------------------------- /cmake/cpu_extension.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/cmake/cpu_extension.cmake -------------------------------------------------------------------------------- /cmake/hipify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/cmake/hipify.py -------------------------------------------------------------------------------- /cmake/utils.cmake: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/cmake/utils.cmake -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/activation_kernels.cu -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/attention_dtypes.h -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/attention_generic.cuh -------------------------------------------------------------------------------- /csrc/attention/attention_kernels.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/attention_kernels.cuh -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/attention_utils.cuh -------------------------------------------------------------------------------- /csrc/attention/dtype_bfloat16.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/dtype_bfloat16.cuh -------------------------------------------------------------------------------- /csrc/attention/dtype_float16.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/dtype_float16.cuh -------------------------------------------------------------------------------- /csrc/attention/dtype_float32.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/dtype_float32.cuh -------------------------------------------------------------------------------- /csrc/attention/dtype_fp8.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/dtype_fp8.cuh -------------------------------------------------------------------------------- /csrc/attention/merge_attn_states.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/merge_attn_states.cu -------------------------------------------------------------------------------- /csrc/attention/paged_attention_v1.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/paged_attention_v1.cu -------------------------------------------------------------------------------- /csrc/attention/paged_attention_v2.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/attention/paged_attention_v2.cu -------------------------------------------------------------------------------- /csrc/cache.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cache.h -------------------------------------------------------------------------------- /csrc/cache_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cache_kernels.cu -------------------------------------------------------------------------------- /csrc/core/exception.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define VLLM_IMPLIES(p, q) (!(p) || (q)) 4 | -------------------------------------------------------------------------------- /csrc/core/math.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/core/math.hpp -------------------------------------------------------------------------------- /csrc/core/registration.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/core/registration.h -------------------------------------------------------------------------------- /csrc/core/scalar_type.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/core/scalar_type.hpp -------------------------------------------------------------------------------- /csrc/cpu/activation.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/activation.cpp -------------------------------------------------------------------------------- /csrc/cpu/attention.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/attention.cpp -------------------------------------------------------------------------------- /csrc/cpu/cache.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cache.cpp -------------------------------------------------------------------------------- /csrc/cpu/cpu_types.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cpu_types.hpp -------------------------------------------------------------------------------- /csrc/cpu/cpu_types_arm.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cpu_types_arm.hpp -------------------------------------------------------------------------------- /csrc/cpu/cpu_types_vsx.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cpu_types_vsx.hpp -------------------------------------------------------------------------------- /csrc/cpu/cpu_types_vxe.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cpu_types_vxe.hpp -------------------------------------------------------------------------------- /csrc/cpu/cpu_types_x86.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/cpu_types_x86.hpp -------------------------------------------------------------------------------- /csrc/cpu/dnnl_helper.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/dnnl_helper.cpp -------------------------------------------------------------------------------- /csrc/cpu/dnnl_helper.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/dnnl_helper.h -------------------------------------------------------------------------------- /csrc/cpu/dnnl_kernels.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/dnnl_kernels.cpp -------------------------------------------------------------------------------- /csrc/cpu/layernorm.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/layernorm.cpp -------------------------------------------------------------------------------- /csrc/cpu/mla_decode.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/mla_decode.cpp -------------------------------------------------------------------------------- /csrc/cpu/pos_encoding.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/pos_encoding.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/common.h -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/gemm.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/gemm.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/gemm.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/gemm.h -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/gemm_fp8.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/gemm_fp8.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/gemm_int8.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/gemm_int8.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/moe.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/moe.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/moe_fp8.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/moe_fp8.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/moe_int8.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/moe_int8.cpp -------------------------------------------------------------------------------- /csrc/cpu/sgl-kernels/vec.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/sgl-kernels/vec.h -------------------------------------------------------------------------------- /csrc/cpu/shm.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/shm.cpp -------------------------------------------------------------------------------- /csrc/cpu/torch_bindings.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/torch_bindings.cpp -------------------------------------------------------------------------------- /csrc/cpu/utils.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cpu/utils.cpp -------------------------------------------------------------------------------- /csrc/cub_helpers.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cub_helpers.h -------------------------------------------------------------------------------- /csrc/cuda_compat.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cuda_compat.h -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cuda_utils.h -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cuda_utils_kernels.cu -------------------------------------------------------------------------------- /csrc/cuda_view.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cuda_view.cu -------------------------------------------------------------------------------- /csrc/cumem_allocator.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cumem_allocator.cpp -------------------------------------------------------------------------------- /csrc/custom_all_reduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/custom_all_reduce.cu -------------------------------------------------------------------------------- /csrc/custom_all_reduce.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/custom_all_reduce.cuh -------------------------------------------------------------------------------- /csrc/custom_all_reduce_test.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/custom_all_reduce_test.cu -------------------------------------------------------------------------------- /csrc/custom_quickreduce.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/custom_quickreduce.cu -------------------------------------------------------------------------------- /csrc/cutlass_extensions/common.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cutlass_extensions/common.cpp -------------------------------------------------------------------------------- /csrc/cutlass_extensions/common.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/cutlass_extensions/common.hpp -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/dispatch_utils.h -------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/layernorm_kernels.cu -------------------------------------------------------------------------------- /csrc/layernorm_quant_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/layernorm_quant_kernels.cu -------------------------------------------------------------------------------- /csrc/mamba/mamba_ssm/static_switch.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/mamba/mamba_ssm/static_switch.h -------------------------------------------------------------------------------- /csrc/moe/grouped_topk_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/grouped_topk_kernels.cu -------------------------------------------------------------------------------- /csrc/moe/marlin_moe_wna16/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/moe/marlin_moe_wna16/kernel.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/marlin_moe_wna16/kernel.h -------------------------------------------------------------------------------- /csrc/moe/marlin_moe_wna16/ops.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/marlin_moe_wna16/ops.cu -------------------------------------------------------------------------------- /csrc/moe/moe_align_sum_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/moe_align_sum_kernels.cu -------------------------------------------------------------------------------- /csrc/moe/moe_ops.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/moe_ops.h -------------------------------------------------------------------------------- /csrc/moe/moe_permute_unpermute_op.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/moe_permute_unpermute_op.cu -------------------------------------------------------------------------------- /csrc/moe/moe_wna16.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/moe_wna16.cu -------------------------------------------------------------------------------- /csrc/moe/moe_wna16_utils.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/moe_wna16_utils.h -------------------------------------------------------------------------------- /csrc/moe/topk_softmax_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/topk_softmax_kernels.cu -------------------------------------------------------------------------------- /csrc/moe/torch_bindings.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/moe/torch_bindings.cpp -------------------------------------------------------------------------------- /csrc/ops.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/ops.h -------------------------------------------------------------------------------- /csrc/permute_cols.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/permute_cols.cu -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/pos_encoding_kernels.cu -------------------------------------------------------------------------------- /csrc/quantization/awq/dequantize.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/awq/dequantize.cuh -------------------------------------------------------------------------------- /csrc/quantization/fp8/common.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/fp8/common.cu -------------------------------------------------------------------------------- /csrc/quantization/fp8/common.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/fp8/common.cuh -------------------------------------------------------------------------------- /csrc/quantization/gguf/ggml-common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/ggml-common.h -------------------------------------------------------------------------------- /csrc/quantization/gguf/mmq.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/mmq.cuh -------------------------------------------------------------------------------- /csrc/quantization/gguf/mmvq.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/mmvq.cuh -------------------------------------------------------------------------------- /csrc/quantization/gguf/moe.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/moe.cuh -------------------------------------------------------------------------------- /csrc/quantization/gguf/moe_vec.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/moe_vec.cuh -------------------------------------------------------------------------------- /csrc/quantization/gguf/vecdotq.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gguf/vecdotq.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/compat.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/compat.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/q_gemm.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/q_gemm.cu -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_2.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/qdq_2.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_3.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/qdq_3.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_4.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/qdq_4.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/qdq_8.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq/qdq_util.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/gptq/qdq_util.cuh -------------------------------------------------------------------------------- /csrc/quantization/gptq_marlin/.gitignore: -------------------------------------------------------------------------------- 1 | kernel_*.cu -------------------------------------------------------------------------------- /csrc/quantization/machete/Readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/machete/Readme.md -------------------------------------------------------------------------------- /csrc/quantization/utils.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/utils.cuh -------------------------------------------------------------------------------- /csrc/quantization/vectorization.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quantization/vectorization.cuh -------------------------------------------------------------------------------- /csrc/quickreduce/base.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quickreduce/base.h -------------------------------------------------------------------------------- /csrc/quickreduce/quick_reduce.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/quickreduce/quick_reduce.h -------------------------------------------------------------------------------- /csrc/rocm/attention.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/rocm/attention.cu -------------------------------------------------------------------------------- /csrc/rocm/ops.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/rocm/ops.h -------------------------------------------------------------------------------- /csrc/rocm/skinny_gemms.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/rocm/skinny_gemms.cu -------------------------------------------------------------------------------- /csrc/rocm/torch_bindings.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/rocm/torch_bindings.cpp -------------------------------------------------------------------------------- /csrc/sampler.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/sampler.cu -------------------------------------------------------------------------------- /csrc/torch_bindings.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/torch_bindings.cpp -------------------------------------------------------------------------------- /csrc/type_convert.cuh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/csrc/type_convert.cuh -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile -------------------------------------------------------------------------------- /docker/Dockerfile.cpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.cpu -------------------------------------------------------------------------------- /docker/Dockerfile.nightly_torch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.nightly_torch -------------------------------------------------------------------------------- /docker/Dockerfile.ppc64le: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.ppc64le -------------------------------------------------------------------------------- /docker/Dockerfile.rocm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.rocm -------------------------------------------------------------------------------- /docker/Dockerfile.rocm_base: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.rocm_base -------------------------------------------------------------------------------- /docker/Dockerfile.s390x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.s390x -------------------------------------------------------------------------------- /docker/Dockerfile.tpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.tpu -------------------------------------------------------------------------------- /docker/Dockerfile.xpu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docker/Dockerfile.xpu -------------------------------------------------------------------------------- /docs/.nav.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/.nav.yml -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/api/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/api/README.md -------------------------------------------------------------------------------- /docs/api/vllm/.meta.yml: -------------------------------------------------------------------------------- 1 | search: 2 | boost: 0.5 3 | -------------------------------------------------------------------------------- /docs/assets/deployment/dify-chat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/deployment/dify-chat.png -------------------------------------------------------------------------------- /docs/assets/design/hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/assets/design/hierarchy.png -------------------------------------------------------------------------------- /docs/cli/.meta.yml: -------------------------------------------------------------------------------- 1 | toc_depth: 3 -------------------------------------------------------------------------------- /docs/cli/.nav.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/.nav.yml -------------------------------------------------------------------------------- /docs/cli/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/README.md -------------------------------------------------------------------------------- /docs/cli/bench/latency.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/bench/latency.md -------------------------------------------------------------------------------- /docs/cli/bench/serve.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/bench/serve.md -------------------------------------------------------------------------------- /docs/cli/bench/throughput.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/bench/throughput.md -------------------------------------------------------------------------------- /docs/cli/chat.md: -------------------------------------------------------------------------------- 1 | # vllm chat 2 | 3 | ## Options 4 | 5 | --8<-- "docs/argparse/chat.md" 6 | -------------------------------------------------------------------------------- /docs/cli/complete.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/complete.md -------------------------------------------------------------------------------- /docs/cli/json_tip.inc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/json_tip.inc.md -------------------------------------------------------------------------------- /docs/cli/run-batch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/run-batch.md -------------------------------------------------------------------------------- /docs/cli/serve.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/cli/serve.md -------------------------------------------------------------------------------- /docs/community/contact_us.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/community/contact_us.md -------------------------------------------------------------------------------- /docs/community/meetups.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/community/meetups.md -------------------------------------------------------------------------------- /docs/community/sponsors.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/community/sponsors.md -------------------------------------------------------------------------------- /docs/configuration/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/README.md -------------------------------------------------------------------------------- /docs/configuration/engine_args.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/engine_args.md -------------------------------------------------------------------------------- /docs/configuration/env_vars.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/env_vars.md -------------------------------------------------------------------------------- /docs/configuration/optimization.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/optimization.md -------------------------------------------------------------------------------- /docs/configuration/serve_args.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/serve_args.md -------------------------------------------------------------------------------- /docs/configuration/tpu.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/configuration/tpu.md -------------------------------------------------------------------------------- /docs/contributing/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/README.md -------------------------------------------------------------------------------- /docs/contributing/benchmarks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/benchmarks.md -------------------------------------------------------------------------------- /docs/contributing/ci/failures.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/ci/failures.md -------------------------------------------------------------------------------- /docs/contributing/model/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/model/README.md -------------------------------------------------------------------------------- /docs/contributing/model/basic.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/model/basic.md -------------------------------------------------------------------------------- /docs/contributing/model/tests.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/model/tests.md -------------------------------------------------------------------------------- /docs/contributing/profiling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/contributing/profiling.md -------------------------------------------------------------------------------- /docs/deployment/docker.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/docker.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/dify.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/dify.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/dstack.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/dstack.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/helm.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/helm.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/lws.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/lws.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/modal.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/modal.md -------------------------------------------------------------------------------- /docs/deployment/frameworks/triton.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/frameworks/triton.md -------------------------------------------------------------------------------- /docs/deployment/k8s.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/k8s.md -------------------------------------------------------------------------------- /docs/deployment/nginx.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/deployment/nginx.md -------------------------------------------------------------------------------- /docs/design/arch_overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/arch_overview.md -------------------------------------------------------------------------------- /docs/design/io_processor_plugins.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/io_processor_plugins.md -------------------------------------------------------------------------------- /docs/design/logits_processors.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/logits_processors.md -------------------------------------------------------------------------------- /docs/design/metrics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/metrics.md -------------------------------------------------------------------------------- /docs/design/mm_processing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/mm_processing.md -------------------------------------------------------------------------------- /docs/design/multiprocessing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/multiprocessing.md -------------------------------------------------------------------------------- /docs/design/p2p_nccl_connector.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/p2p_nccl_connector.md -------------------------------------------------------------------------------- /docs/design/paged_attention.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/paged_attention.md -------------------------------------------------------------------------------- /docs/design/plugin_system.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/plugin_system.md -------------------------------------------------------------------------------- /docs/design/prefix_caching.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/prefix_caching.md -------------------------------------------------------------------------------- /docs/design/torch_compile.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/design/torch_compile.md -------------------------------------------------------------------------------- /docs/examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/examples/README.md -------------------------------------------------------------------------------- /docs/features/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/README.md -------------------------------------------------------------------------------- /docs/features/custom_arguments.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/custom_arguments.md -------------------------------------------------------------------------------- /docs/features/custom_logitsprocs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/custom_logitsprocs.md -------------------------------------------------------------------------------- /docs/features/disagg_prefill.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/disagg_prefill.md -------------------------------------------------------------------------------- /docs/features/lora.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/lora.md -------------------------------------------------------------------------------- /docs/features/multimodal_inputs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/multimodal_inputs.md -------------------------------------------------------------------------------- /docs/features/prompt_embeds.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/prompt_embeds.md -------------------------------------------------------------------------------- /docs/features/quantization/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/README.md -------------------------------------------------------------------------------- /docs/features/quantization/bnb.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/bnb.md -------------------------------------------------------------------------------- /docs/features/quantization/fp8.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/fp8.md -------------------------------------------------------------------------------- /docs/features/quantization/gguf.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/gguf.md -------------------------------------------------------------------------------- /docs/features/quantization/inc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/inc.md -------------------------------------------------------------------------------- /docs/features/quantization/int4.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/int4.md -------------------------------------------------------------------------------- /docs/features/quantization/int8.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/int8.md -------------------------------------------------------------------------------- /docs/features/quantization/quark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/quantization/quark.md -------------------------------------------------------------------------------- /docs/features/reasoning_outputs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/reasoning_outputs.md -------------------------------------------------------------------------------- /docs/features/sleep_mode.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/sleep_mode.md -------------------------------------------------------------------------------- /docs/features/spec_decode.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/spec_decode.md -------------------------------------------------------------------------------- /docs/features/structured_outputs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/structured_outputs.md -------------------------------------------------------------------------------- /docs/features/tool_calling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/features/tool_calling.md -------------------------------------------------------------------------------- /docs/getting_started/quickstart.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/getting_started/quickstart.md -------------------------------------------------------------------------------- /docs/mkdocs/hooks/url_schemes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/mkdocs/hooks/url_schemes.py -------------------------------------------------------------------------------- /docs/mkdocs/javascript/mathjax.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/mkdocs/javascript/mathjax.js -------------------------------------------------------------------------------- /docs/mkdocs/overrides/main.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/mkdocs/overrides/main.html -------------------------------------------------------------------------------- /docs/mkdocs/stylesheets/extra.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/mkdocs/stylesheets/extra.css -------------------------------------------------------------------------------- /docs/models/extensions/tensorizer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/models/extensions/tensorizer.md -------------------------------------------------------------------------------- /docs/models/generative_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/models/generative_models.md -------------------------------------------------------------------------------- /docs/models/pooling_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/models/pooling_models.md -------------------------------------------------------------------------------- /docs/models/supported_models.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/models/supported_models.md -------------------------------------------------------------------------------- /docs/serving/offline_inference.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/serving/offline_inference.md -------------------------------------------------------------------------------- /docs/serving/parallelism_scaling.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/serving/parallelism_scaling.md -------------------------------------------------------------------------------- /docs/training/rlhf.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/training/rlhf.md -------------------------------------------------------------------------------- /docs/training/trl.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/training/trl.md -------------------------------------------------------------------------------- /docs/usage/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/README.md -------------------------------------------------------------------------------- /docs/usage/faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/faq.md -------------------------------------------------------------------------------- /docs/usage/metrics.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/metrics.md -------------------------------------------------------------------------------- /docs/usage/reproducibility.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/reproducibility.md -------------------------------------------------------------------------------- /docs/usage/security.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/security.md -------------------------------------------------------------------------------- /docs/usage/troubleshooting.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/troubleshooting.md -------------------------------------------------------------------------------- /docs/usage/usage_stats.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/usage_stats.md -------------------------------------------------------------------------------- /docs/usage/v1_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/docs/usage/v1_guide.md -------------------------------------------------------------------------------- /examples/offline_inference/rlhf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/offline_inference/rlhf.py -------------------------------------------------------------------------------- /examples/offline_inference/tpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/offline_inference/tpu.py -------------------------------------------------------------------------------- /examples/online_serving/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/online_serving/utils.py -------------------------------------------------------------------------------- /examples/others/lmcache/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/others/lmcache/README.md -------------------------------------------------------------------------------- /examples/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/pyproject.toml -------------------------------------------------------------------------------- /examples/template_alpaca.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_alpaca.jinja -------------------------------------------------------------------------------- /examples/template_baichuan.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_baichuan.jinja -------------------------------------------------------------------------------- /examples/template_chatglm.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_chatglm.jinja -------------------------------------------------------------------------------- /examples/template_chatglm2.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_chatglm2.jinja -------------------------------------------------------------------------------- /examples/template_chatml.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_chatml.jinja -------------------------------------------------------------------------------- /examples/template_dse_qwen2_vl.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_dse_qwen2_vl.jinja -------------------------------------------------------------------------------- /examples/template_falcon.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_falcon.jinja -------------------------------------------------------------------------------- /examples/template_falcon_180b.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_falcon_180b.jinja -------------------------------------------------------------------------------- /examples/template_inkbot.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_inkbot.jinja -------------------------------------------------------------------------------- /examples/template_teleflm.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_teleflm.jinja -------------------------------------------------------------------------------- /examples/template_vlm2vec.jinja: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/examples/template_vlm2vec.jinja -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/format.sh -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/mkdocs.yaml -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/pyproject.toml -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/build.txt -------------------------------------------------------------------------------- /requirements/common.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/common.txt -------------------------------------------------------------------------------- /requirements/cpu-build.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/cpu-build.txt -------------------------------------------------------------------------------- /requirements/cpu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/cpu.txt -------------------------------------------------------------------------------- /requirements/cuda.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/cuda.txt -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/dev.txt -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/docs.txt -------------------------------------------------------------------------------- /requirements/kv_connectors.txt: -------------------------------------------------------------------------------- 1 | lmcache 2 | nixl >= 0.5.1 # Required for disaggregated prefill 3 | -------------------------------------------------------------------------------- /requirements/lint.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | pre-commit==4.0.1 3 | -------------------------------------------------------------------------------- /requirements/nightly_torch_test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/nightly_torch_test.txt -------------------------------------------------------------------------------- /requirements/rocm-build.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/rocm-build.txt -------------------------------------------------------------------------------- /requirements/rocm-test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/rocm-test.txt -------------------------------------------------------------------------------- /requirements/rocm.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/rocm.txt -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/test.in -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/test.txt -------------------------------------------------------------------------------- /requirements/tpu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/tpu.txt -------------------------------------------------------------------------------- /requirements/xpu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/requirements/xpu.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/setup.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/basic_correctness/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/test_latency_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/benchmarks/test_latency_cli.py -------------------------------------------------------------------------------- /tests/benchmarks/test_serve_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/benchmarks/test_serve_cli.py -------------------------------------------------------------------------------- /tests/build_cython.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/build_cython.py -------------------------------------------------------------------------------- /tests/ci_envs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/ci_envs.py -------------------------------------------------------------------------------- /tests/compile/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/compile/backend.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/backend.py -------------------------------------------------------------------------------- /tests/compile/piecewise/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/compile/silly_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/silly_attention.py -------------------------------------------------------------------------------- /tests/compile/test_async_tp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_async_tp.py -------------------------------------------------------------------------------- /tests/compile/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_config.py -------------------------------------------------------------------------------- /tests/compile/test_decorator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_decorator.py -------------------------------------------------------------------------------- /tests/compile/test_full_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_full_graph.py -------------------------------------------------------------------------------- /tests/compile/test_fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_fusion.py -------------------------------------------------------------------------------- /tests/compile/test_fusion_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_fusion_attn.py -------------------------------------------------------------------------------- /tests/compile/test_pass_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_pass_manager.py -------------------------------------------------------------------------------- /tests/compile/test_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/compile/test_wrapper.py -------------------------------------------------------------------------------- /tests/config/test_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/config/test_config.yaml -------------------------------------------------------------------------------- /tests/config/test_mp_reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/config/test_mp_reducer.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/conftest.py -------------------------------------------------------------------------------- /tests/cuda/test_cuda_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/cuda/test_cuda_context.py -------------------------------------------------------------------------------- /tests/detokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/detokenizer/test_min_tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/detokenizer/test_min_tokens.py -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/distributed/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/conftest.py -------------------------------------------------------------------------------- /tests/distributed/test_comm_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_comm_ops.py -------------------------------------------------------------------------------- /tests/distributed/test_eplb_algo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_eplb_algo.py -------------------------------------------------------------------------------- /tests/distributed/test_events.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_events.py -------------------------------------------------------------------------------- /tests/distributed/test_kvlayout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_kvlayout.py -------------------------------------------------------------------------------- /tests/distributed/test_node_count.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_node_count.py -------------------------------------------------------------------------------- /tests/distributed/test_pynccl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_pynccl.py -------------------------------------------------------------------------------- /tests/distributed/test_same_node.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_same_node.py -------------------------------------------------------------------------------- /tests/distributed/test_shm_buffer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_shm_buffer.py -------------------------------------------------------------------------------- /tests/distributed/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/distributed/test_utils.py -------------------------------------------------------------------------------- /tests/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/engine/test_arg_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/engine/test_arg_utils.py -------------------------------------------------------------------------------- /tests/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/conftest.py -------------------------------------------------------------------------------- /tests/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/llm/test_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/llm/test_chat.py -------------------------------------------------------------------------------- /tests/entrypoints/offline_mode/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/openai/conftest.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/correctness/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/openai/test_uds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/openai/test_uds.py -------------------------------------------------------------------------------- /tests/entrypoints/openai/tool_parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/pooling/correctness/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/pooling/llm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/pooling/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/entrypoints/test_chat_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/test_chat_utils.py -------------------------------------------------------------------------------- /tests/entrypoints/test_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/test_context.py -------------------------------------------------------------------------------- /tests/entrypoints/test_renderer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/entrypoints/test_renderer.py -------------------------------------------------------------------------------- /tests/evals/gpt_oss/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gpt_oss/__init__.py -------------------------------------------------------------------------------- /tests/evals/gpt_oss/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gpt_oss/conftest.py -------------------------------------------------------------------------------- /tests/evals/gsm8k/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gsm8k/README.md -------------------------------------------------------------------------------- /tests/evals/gsm8k/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gsm8k/__init__.py -------------------------------------------------------------------------------- /tests/evals/gsm8k/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gsm8k/conftest.py -------------------------------------------------------------------------------- /tests/evals/gsm8k/gsm8k_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/evals/gsm8k/gsm8k_eval.py -------------------------------------------------------------------------------- /tests/fastsafetensors_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/kernels/allclose_default.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/allclose_default.py -------------------------------------------------------------------------------- /tests/kernels/attention/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/attention/conftest.py -------------------------------------------------------------------------------- /tests/kernels/core/test_layernorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/core/test_layernorm.py -------------------------------------------------------------------------------- /tests/kernels/core/test_mrope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/core/test_mrope.py -------------------------------------------------------------------------------- /tests/kernels/core/test_opcheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/core/test_opcheck.py -------------------------------------------------------------------------------- /tests/kernels/core/test_uva.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/core/test_uva.py -------------------------------------------------------------------------------- /tests/kernels/moe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/kernels/moe/modular_kernel_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/kernels/moe/parallel_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/parallel_utils.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_block_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_block_fp8.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_block_int8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_block_int8.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_deepep_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_deepep_moe.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_deepgemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_deepgemm.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_flashinfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_flashinfer.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_moe.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_mxfp4_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_mxfp4_moe.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_nvfp4_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_nvfp4_moe.py -------------------------------------------------------------------------------- /tests/kernels/moe/test_pplx_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/test_pplx_moe.py -------------------------------------------------------------------------------- /tests/kernels/moe/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/moe/utils.py -------------------------------------------------------------------------------- /tests/kernels/quant_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/quant_utils.py -------------------------------------------------------------------------------- /tests/kernels/test_flex_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/test_flex_attention.py -------------------------------------------------------------------------------- /tests/kernels/test_onednn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/test_onednn.py -------------------------------------------------------------------------------- /tests/kernels/test_shuffle_rows.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/test_shuffle_rows.py -------------------------------------------------------------------------------- /tests/kernels/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kernels/utils.py -------------------------------------------------------------------------------- /tests/kv_transfer/test_module.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kv_transfer/test_module.py -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kv_transfer/test_send_recv.py -------------------------------------------------------------------------------- /tests/kv_transfer/test_send_recv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/kv_transfer/test_send_recv.sh -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/lora/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/conftest.py -------------------------------------------------------------------------------- /tests/lora/test_add_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_add_lora.py -------------------------------------------------------------------------------- /tests/lora/test_chatglm3_tp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_chatglm3_tp.py -------------------------------------------------------------------------------- /tests/lora/test_default_mm_loras.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_default_mm_loras.py -------------------------------------------------------------------------------- /tests/lora/test_layers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_layers.py -------------------------------------------------------------------------------- /tests/lora/test_llama_tp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_llama_tp.py -------------------------------------------------------------------------------- /tests/lora/test_lora_checkpoints.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_lora_checkpoints.py -------------------------------------------------------------------------------- /tests/lora/test_lora_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_lora_functions.py -------------------------------------------------------------------------------- /tests/lora/test_lora_huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_lora_huggingface.py -------------------------------------------------------------------------------- /tests/lora/test_lora_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_lora_manager.py -------------------------------------------------------------------------------- /tests/lora/test_minicpmv_tp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_minicpmv_tp.py -------------------------------------------------------------------------------- /tests/lora/test_mixtral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_mixtral.py -------------------------------------------------------------------------------- /tests/lora/test_peft_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_peft_helper.py -------------------------------------------------------------------------------- /tests/lora/test_punica_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_punica_ops.py -------------------------------------------------------------------------------- /tests/lora/test_quant_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_quant_model.py -------------------------------------------------------------------------------- /tests/lora/test_qwen2vl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_qwen2vl.py -------------------------------------------------------------------------------- /tests/lora/test_resolver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_resolver.py -------------------------------------------------------------------------------- /tests/lora/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_utils.py -------------------------------------------------------------------------------- /tests/lora/test_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/test_worker.py -------------------------------------------------------------------------------- /tests/lora/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/lora/utils.py -------------------------------------------------------------------------------- /tests/mistral_tool_use/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/mistral_tool_use/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/mistral_tool_use/conftest.py -------------------------------------------------------------------------------- /tests/mistral_tool_use/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/mistral_tool_use/utils.py -------------------------------------------------------------------------------- /tests/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/model_executor/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/model_executor/conftest.py -------------------------------------------------------------------------------- /tests/model_executor/model_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/language/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/language/generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/language/generation_ppl_test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/language/pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/language/pooling_mteb_test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/multimodal/generation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/multimodal/generation/vlm_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/multimodal/pooling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/multimodal/processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/registry.py -------------------------------------------------------------------------------- /tests/models/test_initialization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_initialization.py -------------------------------------------------------------------------------- /tests/models/test_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_registry.py -------------------------------------------------------------------------------- /tests/models/test_terratorch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_terratorch.py -------------------------------------------------------------------------------- /tests/models/test_transformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_transformers.py -------------------------------------------------------------------------------- /tests/models/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_utils.py -------------------------------------------------------------------------------- /tests/models/test_vision.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/test_vision.py -------------------------------------------------------------------------------- /tests/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/models/utils.py -------------------------------------------------------------------------------- /tests/multimodal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/multimodal/assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/image1.png -------------------------------------------------------------------------------- /tests/multimodal/assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/image2.png -------------------------------------------------------------------------------- /tests/multimodal/assets/rgba.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/assets/rgba.png -------------------------------------------------------------------------------- /tests/multimodal/test_cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_cache.py -------------------------------------------------------------------------------- /tests/multimodal/test_hasher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_hasher.py -------------------------------------------------------------------------------- /tests/multimodal/test_image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_image.py -------------------------------------------------------------------------------- /tests/multimodal/test_inputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_inputs.py -------------------------------------------------------------------------------- /tests/multimodal/test_processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_processing.py -------------------------------------------------------------------------------- /tests/multimodal/test_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_registry.py -------------------------------------------------------------------------------- /tests/multimodal/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_utils.py -------------------------------------------------------------------------------- /tests/multimodal/test_video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/test_video.py -------------------------------------------------------------------------------- /tests/multimodal/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/multimodal/utils.py -------------------------------------------------------------------------------- /tests/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/prompts/example.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/prompts/example.txt -------------------------------------------------------------------------------- /tests/prompts/summary.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/prompts/summary.txt -------------------------------------------------------------------------------- /tests/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/quantization/test_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_configs.py -------------------------------------------------------------------------------- /tests/quantization/test_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_fp8.py -------------------------------------------------------------------------------- /tests/quantization/test_lm_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_lm_head.py -------------------------------------------------------------------------------- /tests/quantization/test_modelopt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_modelopt.py -------------------------------------------------------------------------------- /tests/quantization/test_ptpc_fp8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_ptpc_fp8.py -------------------------------------------------------------------------------- /tests/quantization/test_quark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_quark.py -------------------------------------------------------------------------------- /tests/quantization/test_rtn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_rtn.py -------------------------------------------------------------------------------- /tests/quantization/test_torchao.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/test_torchao.py -------------------------------------------------------------------------------- /tests/quantization/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/quantization/utils.py -------------------------------------------------------------------------------- /tests/reasoning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/reasoning/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/reasoning/utils.py -------------------------------------------------------------------------------- /tests/runai_model_streamer_test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/samplers/test_beam_search.py -------------------------------------------------------------------------------- /tests/samplers/test_ignore_eos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/samplers/test_ignore_eos.py -------------------------------------------------------------------------------- /tests/samplers/test_no_bad_words.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/samplers/test_no_bad_words.py -------------------------------------------------------------------------------- /tests/samplers/test_ranks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/samplers/test_ranks.py -------------------------------------------------------------------------------- /tests/tensorizer_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tensorizer_loader/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tensorizer_loader/conftest.py -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_config.py -------------------------------------------------------------------------------- /tests/test_embedded_commit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_embedded_commit.py -------------------------------------------------------------------------------- /tests/test_inputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_inputs.py -------------------------------------------------------------------------------- /tests/test_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_logger.py -------------------------------------------------------------------------------- /tests/test_outputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_outputs.py -------------------------------------------------------------------------------- /tests/test_pooling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_pooling_params.py -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_regression.py -------------------------------------------------------------------------------- /tests/test_routing_simulator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_routing_simulator.py -------------------------------------------------------------------------------- /tests/test_sampling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_sampling_params.py -------------------------------------------------------------------------------- /tests/test_scalartype.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_scalartype.py -------------------------------------------------------------------------------- /tests/test_seed_behavior.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_seed_behavior.py -------------------------------------------------------------------------------- /tests/test_sequence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_sequence.py -------------------------------------------------------------------------------- /tests/test_sharded_state_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_sharded_state_loader.py -------------------------------------------------------------------------------- /tests/test_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_test.py -------------------------------------------------------------------------------- /tests/test_triton_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_triton_utils.py -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_version.py -------------------------------------------------------------------------------- /tests/test_vllm_port.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/test_vllm_port.py -------------------------------------------------------------------------------- /tests/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tokenization/test_get_eos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tokenization/test_get_eos.py -------------------------------------------------------------------------------- /tests/tokenization/test_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tokenization/test_tokenizer.py -------------------------------------------------------------------------------- /tests/tool_use/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tool_use/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tool_use/conftest.py -------------------------------------------------------------------------------- /tests/tool_use/test_tool_calls.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tool_use/test_tool_calls.py -------------------------------------------------------------------------------- /tests/tool_use/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tool_use/utils.py -------------------------------------------------------------------------------- /tests/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tools/test_config_validator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tools/test_config_validator.py -------------------------------------------------------------------------------- /tests/tpu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tpu/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tpu/lora/test_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tpu/lora/test_lora.py -------------------------------------------------------------------------------- /tests/tpu/test_compilation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tpu/test_compilation.py -------------------------------------------------------------------------------- /tests/tpu/test_custom_dispatcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tpu/test_custom_dispatcher.py -------------------------------------------------------------------------------- /tests/tpu/test_moe_pallas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/tpu/test_moe_pallas.py -------------------------------------------------------------------------------- /tests/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/utils.py -------------------------------------------------------------------------------- /tests/utils_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/utils_/__init__.py -------------------------------------------------------------------------------- /tests/utils_/test_tensor_schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/utils_/test_tensor_schema.py -------------------------------------------------------------------------------- /tests/utils_/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/utils_/test_utils.py -------------------------------------------------------------------------------- /tests/v1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/attention/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/attention/utils.py -------------------------------------------------------------------------------- /tests/v1/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/core/test_kv_cache_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/core/test_kv_cache_utils.py -------------------------------------------------------------------------------- /tests/v1/core/test_prefix_caching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/core/test_prefix_caching.py -------------------------------------------------------------------------------- /tests/v1/core/test_scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/core/test_scheduler.py -------------------------------------------------------------------------------- /tests/v1/core/test_scheduler_e2e.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/core/test_scheduler_e2e.py -------------------------------------------------------------------------------- /tests/v1/core/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/core/utils.py -------------------------------------------------------------------------------- /tests/v1/cudagraph/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/e2e/test_min_tokens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/e2e/test_min_tokens.py -------------------------------------------------------------------------------- /tests/v1/e2e/test_spec_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/e2e/test_spec_decode.py -------------------------------------------------------------------------------- /tests/v1/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/engine/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/conftest.py -------------------------------------------------------------------------------- /tests/v1/engine/test_async_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/test_async_llm.py -------------------------------------------------------------------------------- /tests/v1/engine/test_engine_args.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/test_engine_args.py -------------------------------------------------------------------------------- /tests/v1/engine/test_engine_core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/test_engine_core.py -------------------------------------------------------------------------------- /tests/v1/engine/test_llm_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/test_llm_engine.py -------------------------------------------------------------------------------- /tests/v1/engine/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/engine/utils.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/entrypoints/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/entrypoints/conftest.py -------------------------------------------------------------------------------- /tests/v1/entrypoints/llm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/entrypoints/openai/responses/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/executor/test_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/executor/test_executor.py -------------------------------------------------------------------------------- /tests/v1/kv_connector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/kv_connector/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/kv_connector/unit/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/kv_connector/unit/utils.py -------------------------------------------------------------------------------- /tests/v1/logits_processors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/logits_processors/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/logits_processors/utils.py -------------------------------------------------------------------------------- /tests/v1/metrics/test_ray_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/metrics/test_ray_metrics.py -------------------------------------------------------------------------------- /tests/v1/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/sample/test_logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/sample/test_logprobs.py -------------------------------------------------------------------------------- /tests/v1/sample/test_logprobs_e2e.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/sample/test_logprobs_e2e.py -------------------------------------------------------------------------------- /tests/v1/sample/test_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/sample/test_sampler.py -------------------------------------------------------------------------------- /tests/v1/sample/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/sample/utils.py -------------------------------------------------------------------------------- /tests/v1/shutdown/test_delete.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/shutdown/test_delete.py -------------------------------------------------------------------------------- /tests/v1/shutdown/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/shutdown/utils.py -------------------------------------------------------------------------------- /tests/v1/spec_decode/test_eagle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/spec_decode/test_eagle.py -------------------------------------------------------------------------------- /tests/v1/spec_decode/test_max_len.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/spec_decode/test_max_len.py -------------------------------------------------------------------------------- /tests/v1/spec_decode/test_ngram.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/spec_decode/test_ngram.py -------------------------------------------------------------------------------- /tests/v1/structured_output/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/test_async_llm_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_async_llm_dp.py -------------------------------------------------------------------------------- /tests/v1/test_external_lb_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_external_lb_dp.py -------------------------------------------------------------------------------- /tests/v1/test_hybrid_lb_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_hybrid_lb_dp.py -------------------------------------------------------------------------------- /tests/v1/test_internal_lb_dp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_internal_lb_dp.py -------------------------------------------------------------------------------- /tests/v1/test_kv_sharing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_kv_sharing.py -------------------------------------------------------------------------------- /tests/v1/test_metrics_reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_metrics_reader.py -------------------------------------------------------------------------------- /tests/v1/test_oracle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_oracle.py -------------------------------------------------------------------------------- /tests/v1/test_request.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_request.py -------------------------------------------------------------------------------- /tests/v1/test_serial_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_serial_utils.py -------------------------------------------------------------------------------- /tests/v1/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/test_utils.py -------------------------------------------------------------------------------- /tests/v1/tpu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/tpu/test_basic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_basic.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_mha_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_mha_attn.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_multimodal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_multimodal.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_pallas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_pallas.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_perf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_perf.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_sampler.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_tpu_int8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_tpu_int8.py -------------------------------------------------------------------------------- /tests/v1/tpu/test_tpu_qkv_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tpu/test_tpu_qkv_linear.py -------------------------------------------------------------------------------- /tests/v1/tpu/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/tracing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/v1/tracing/test_tracing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/v1/tracing/test_tracing.py -------------------------------------------------------------------------------- /tests/v1/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/vllm_test_utils/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/vllm_test_utils/setup.py -------------------------------------------------------------------------------- /tests/weight_loading/models.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tests/weight_loading/models.txt -------------------------------------------------------------------------------- /tools/check_init_lazy_imports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/check_init_lazy_imports.py -------------------------------------------------------------------------------- /tools/check_pickle_imports.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/check_pickle_imports.py -------------------------------------------------------------------------------- /tools/check_repo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/check_repo.sh -------------------------------------------------------------------------------- /tools/check_spdx_header.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/check_spdx_header.py -------------------------------------------------------------------------------- /tools/check_triton_import.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/check_triton_import.py -------------------------------------------------------------------------------- /tools/enforce_regex_import.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/enforce_regex_import.py -------------------------------------------------------------------------------- /tools/ep_kernels/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/ep_kernels/README.md -------------------------------------------------------------------------------- /tools/generate_cmake_presets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/generate_cmake_presets.py -------------------------------------------------------------------------------- /tools/generate_nightly_torch_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/generate_nightly_torch_test.py -------------------------------------------------------------------------------- /tools/install_deepgemm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/install_deepgemm.sh -------------------------------------------------------------------------------- /tools/install_gdrcopy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/install_gdrcopy.sh -------------------------------------------------------------------------------- /tools/mypy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/mypy.sh -------------------------------------------------------------------------------- /tools/png-lint.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/png-lint.sh -------------------------------------------------------------------------------- /tools/report_build_time_ninja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/report_build_time_ninja.py -------------------------------------------------------------------------------- /tools/shellcheck.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/shellcheck.sh -------------------------------------------------------------------------------- /tools/update-dockerfile-graph.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/update-dockerfile-graph.sh -------------------------------------------------------------------------------- /tools/validate_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/tools/validate_config.py -------------------------------------------------------------------------------- /use_existing_torch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/use_existing_torch.py -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/__init__.py -------------------------------------------------------------------------------- /vllm/_bc_linter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/_bc_linter.py -------------------------------------------------------------------------------- /vllm/_custom_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/_custom_ops.py -------------------------------------------------------------------------------- /vllm/_ipex_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/_ipex_ops.py -------------------------------------------------------------------------------- /vllm/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/assets/audio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/assets/audio.py -------------------------------------------------------------------------------- /vllm/assets/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/assets/base.py -------------------------------------------------------------------------------- /vllm/assets/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/assets/image.py -------------------------------------------------------------------------------- /vllm/assets/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/assets/video.py -------------------------------------------------------------------------------- /vllm/attention/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/__init__.py -------------------------------------------------------------------------------- /vllm/attention/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/attention/backends/abstract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/backends/abstract.py -------------------------------------------------------------------------------- /vllm/attention/backends/flashmla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/backends/flashmla.py -------------------------------------------------------------------------------- /vllm/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/attention/backends/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/backends/utils.py -------------------------------------------------------------------------------- /vllm/attention/backends/xformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/backends/xformers.py -------------------------------------------------------------------------------- /vllm/attention/layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/layer.py -------------------------------------------------------------------------------- /vllm/attention/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/attention/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/attention/ops/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/ops/common.py -------------------------------------------------------------------------------- /vllm/attention/ops/flashmla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/ops/flashmla.py -------------------------------------------------------------------------------- /vllm/attention/ops/paged_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/ops/paged_attn.py -------------------------------------------------------------------------------- /vllm/attention/ops/prefix_prefill.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/ops/prefix_prefill.py -------------------------------------------------------------------------------- /vllm/attention/ops/rocm_aiter_mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/ops/rocm_aiter_mla.py -------------------------------------------------------------------------------- /vllm/attention/selector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/selector.py -------------------------------------------------------------------------------- /vllm/attention/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/attention/utils/fa_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/attention/utils/fa_utils.py -------------------------------------------------------------------------------- /vllm/beam_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/beam_search.py -------------------------------------------------------------------------------- /vllm/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/benchmarks/datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/datasets.py -------------------------------------------------------------------------------- /vllm/benchmarks/latency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/latency.py -------------------------------------------------------------------------------- /vllm/benchmarks/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/lib/__init__.py -------------------------------------------------------------------------------- /vllm/benchmarks/lib/ready_checker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/lib/ready_checker.py -------------------------------------------------------------------------------- /vllm/benchmarks/lib/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/lib/utils.py -------------------------------------------------------------------------------- /vllm/benchmarks/serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/serve.py -------------------------------------------------------------------------------- /vllm/benchmarks/throughput.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/benchmarks/throughput.py -------------------------------------------------------------------------------- /vllm/collect_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/collect_env.py -------------------------------------------------------------------------------- /vllm/compilation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/compilation/backends.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/backends.py -------------------------------------------------------------------------------- /vllm/compilation/counter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/counter.py -------------------------------------------------------------------------------- /vllm/compilation/cuda_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/cuda_graph.py -------------------------------------------------------------------------------- /vllm/compilation/decorators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/decorators.py -------------------------------------------------------------------------------- /vllm/compilation/fusion.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/fusion.py -------------------------------------------------------------------------------- /vllm/compilation/fusion_attn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/fusion_attn.py -------------------------------------------------------------------------------- /vllm/compilation/fx_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/fx_utils.py -------------------------------------------------------------------------------- /vllm/compilation/inductor_pass.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/inductor_pass.py -------------------------------------------------------------------------------- /vllm/compilation/monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/monitor.py -------------------------------------------------------------------------------- /vllm/compilation/noop_elimination.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/noop_elimination.py -------------------------------------------------------------------------------- /vllm/compilation/pass_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/pass_manager.py -------------------------------------------------------------------------------- /vllm/compilation/wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/compilation/wrapper.py -------------------------------------------------------------------------------- /vllm/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/__init__.py -------------------------------------------------------------------------------- /vllm/config/cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/cache.py -------------------------------------------------------------------------------- /vllm/config/compilation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/compilation.py -------------------------------------------------------------------------------- /vllm/config/kv_events.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/kv_events.py -------------------------------------------------------------------------------- /vllm/config/kv_transfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/kv_transfer.py -------------------------------------------------------------------------------- /vllm/config/load.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/load.py -------------------------------------------------------------------------------- /vllm/config/lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/lora.py -------------------------------------------------------------------------------- /vllm/config/multimodal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/multimodal.py -------------------------------------------------------------------------------- /vllm/config/parallel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/parallel.py -------------------------------------------------------------------------------- /vllm/config/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/scheduler.py -------------------------------------------------------------------------------- /vllm/config/speculative.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/speculative.py -------------------------------------------------------------------------------- /vllm/config/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/config/utils.py -------------------------------------------------------------------------------- /vllm/connections.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/connections.py -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/core/block/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/core/block/block_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block/block_table.py -------------------------------------------------------------------------------- /vllm/core/block/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block/common.py -------------------------------------------------------------------------------- /vllm/core/block/interfaces.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block/interfaces.py -------------------------------------------------------------------------------- /vllm/core/block/naive_block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block/naive_block.py -------------------------------------------------------------------------------- /vllm/core/block/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block/utils.py -------------------------------------------------------------------------------- /vllm/core/block_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/block_manager.py -------------------------------------------------------------------------------- /vllm/core/evictor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/evictor.py -------------------------------------------------------------------------------- /vllm/core/interfaces.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/interfaces.py -------------------------------------------------------------------------------- /vllm/core/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/core/scheduler.py -------------------------------------------------------------------------------- /vllm/device_allocator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/device_allocator/cumem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/device_allocator/cumem.py -------------------------------------------------------------------------------- /vllm/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/communication_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/communication_op.py -------------------------------------------------------------------------------- /vllm/distributed/device_communicators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/distributed/eplb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/eplb/__init__.py -------------------------------------------------------------------------------- /vllm/distributed/eplb/eplb_state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/eplb/eplb_state.py -------------------------------------------------------------------------------- /vllm/distributed/kv_events.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/kv_events.py -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/distributed/kv_transfer/kv_pipe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/distributed/parallel_state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/parallel_state.py -------------------------------------------------------------------------------- /vllm/distributed/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/distributed/utils.py -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/arg_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/arg_utils.py -------------------------------------------------------------------------------- /vllm/engine/async_llm_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/async_llm_engine.py -------------------------------------------------------------------------------- /vllm/engine/async_timeout.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/async_timeout.py -------------------------------------------------------------------------------- /vllm/engine/llm_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/llm_engine.py -------------------------------------------------------------------------------- /vllm/engine/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/metrics.py -------------------------------------------------------------------------------- /vllm/engine/metrics_types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/metrics_types.py -------------------------------------------------------------------------------- /vllm/engine/output_processor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/engine/protocol.py -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/api_server.py -------------------------------------------------------------------------------- /vllm/entrypoints/chat_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/chat_utils.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/cli/collect_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/collect_env.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/main.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/openai.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/run_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/run_batch.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/serve.py -------------------------------------------------------------------------------- /vllm/entrypoints/cli/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/cli/types.py -------------------------------------------------------------------------------- /vllm/entrypoints/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/constants.py -------------------------------------------------------------------------------- /vllm/entrypoints/context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/context.py -------------------------------------------------------------------------------- /vllm/entrypoints/harmony_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/harmony_utils.py -------------------------------------------------------------------------------- /vllm/entrypoints/launcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/launcher.py -------------------------------------------------------------------------------- /vllm/entrypoints/llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/llm.py -------------------------------------------------------------------------------- /vllm/entrypoints/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/logger.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/cli_args.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/openai/cli_args.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/openai/protocol.py -------------------------------------------------------------------------------- /vllm/entrypoints/openai/run_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/openai/run_batch.py -------------------------------------------------------------------------------- /vllm/entrypoints/renderer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/renderer.py -------------------------------------------------------------------------------- /vllm/entrypoints/score_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/score_utils.py -------------------------------------------------------------------------------- /vllm/entrypoints/ssl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/ssl.py -------------------------------------------------------------------------------- /vllm/entrypoints/tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/tool.py -------------------------------------------------------------------------------- /vllm/entrypoints/tool_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/tool_server.py -------------------------------------------------------------------------------- /vllm/entrypoints/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/entrypoints/utils.py -------------------------------------------------------------------------------- /vllm/env_override.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/env_override.py -------------------------------------------------------------------------------- /vllm/envs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/envs.py -------------------------------------------------------------------------------- /vllm/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/executor/executor_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/executor/executor_base.py -------------------------------------------------------------------------------- /vllm/executor/msgspec_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/executor/msgspec_utils.py -------------------------------------------------------------------------------- /vllm/executor/ray_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/executor/ray_utils.py -------------------------------------------------------------------------------- /vllm/executor/uniproc_executor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/executor/uniproc_executor.py -------------------------------------------------------------------------------- /vllm/forward_context.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/forward_context.py -------------------------------------------------------------------------------- /vllm/inputs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/inputs/__init__.py -------------------------------------------------------------------------------- /vllm/inputs/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/inputs/data.py -------------------------------------------------------------------------------- /vllm/inputs/parse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/inputs/parse.py -------------------------------------------------------------------------------- /vllm/inputs/preprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/inputs/preprocess.py -------------------------------------------------------------------------------- /vllm/inputs/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/inputs/registry.py -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logger.py -------------------------------------------------------------------------------- /vllm/logging_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logging_utils/__init__.py -------------------------------------------------------------------------------- /vllm/logging_utils/dump_input.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logging_utils/dump_input.py -------------------------------------------------------------------------------- /vllm/logging_utils/formatter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logging_utils/formatter.py -------------------------------------------------------------------------------- /vllm/logits_process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logits_process.py -------------------------------------------------------------------------------- /vllm/logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/logprobs.py -------------------------------------------------------------------------------- /vllm/lora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/lora/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/layers/__init__.py -------------------------------------------------------------------------------- /vllm/lora/layers/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/layers/base.py -------------------------------------------------------------------------------- /vllm/lora/layers/base_linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/layers/base_linear.py -------------------------------------------------------------------------------- /vllm/lora/layers/logits_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/layers/logits_processor.py -------------------------------------------------------------------------------- /vllm/lora/layers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/layers/utils.py -------------------------------------------------------------------------------- /vllm/lora/lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/lora.py -------------------------------------------------------------------------------- /vllm/lora/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/models.py -------------------------------------------------------------------------------- /vllm/lora/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/lora/ops/ipex_ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/ipex_ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/ipex_ops/lora_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/ipex_ops/lora_ops.py -------------------------------------------------------------------------------- /vllm/lora/ops/torch_ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/torch_ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/torch_ops/lora_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/torch_ops/lora_ops.py -------------------------------------------------------------------------------- /vllm/lora/ops/triton_ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/triton_ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/triton_ops/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/triton_ops/utils.py -------------------------------------------------------------------------------- /vllm/lora/ops/xla_ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/xla_ops/__init__.py -------------------------------------------------------------------------------- /vllm/lora/ops/xla_ops/lora_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/ops/xla_ops/lora_ops.py -------------------------------------------------------------------------------- /vllm/lora/peft_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/peft_helper.py -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/punica_wrapper/__init__.py -------------------------------------------------------------------------------- /vllm/lora/punica_wrapper/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/punica_wrapper/utils.py -------------------------------------------------------------------------------- /vllm/lora/request.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/request.py -------------------------------------------------------------------------------- /vllm/lora/resolver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/resolver.py -------------------------------------------------------------------------------- /vllm/lora/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/utils.py -------------------------------------------------------------------------------- /vllm/lora/worker_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/lora/worker_manager.py -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/custom_op.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/custom_op.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/linear.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/layers/linear.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mamba/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/mla.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/layers/mla.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/pooler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/layers/pooler.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/quark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/layers/utils.py -------------------------------------------------------------------------------- /vllm/model_executor/models/aimv2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/aimv2.py -------------------------------------------------------------------------------- /vllm/model_executor/models/arcee.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/arcee.py -------------------------------------------------------------------------------- /vllm/model_executor/models/arctic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/arctic.py -------------------------------------------------------------------------------- /vllm/model_executor/models/aria.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/aria.py -------------------------------------------------------------------------------- /vllm/model_executor/models/bamba.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/bamba.py -------------------------------------------------------------------------------- /vllm/model_executor/models/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/bert.py -------------------------------------------------------------------------------- /vllm/model_executor/models/blip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/blip.py -------------------------------------------------------------------------------- /vllm/model_executor/models/blip2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/blip2.py -------------------------------------------------------------------------------- /vllm/model_executor/models/bloom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/bloom.py -------------------------------------------------------------------------------- /vllm/model_executor/models/clip.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/clip.py -------------------------------------------------------------------------------- /vllm/model_executor/models/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/config.py -------------------------------------------------------------------------------- /vllm/model_executor/models/dbrx.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/dbrx.py -------------------------------------------------------------------------------- /vllm/model_executor/models/dots1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/dots1.py -------------------------------------------------------------------------------- /vllm/model_executor/models/exaone.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/exaone.py -------------------------------------------------------------------------------- /vllm/model_executor/models/falcon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/falcon.py -------------------------------------------------------------------------------- /vllm/model_executor/models/glm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/glm.py -------------------------------------------------------------------------------- /vllm/model_executor/models/mpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/mpt.py -------------------------------------------------------------------------------- /vllm/model_executor/models/opt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/opt.py -------------------------------------------------------------------------------- /vllm/model_executor/models/phi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/phi.py -------------------------------------------------------------------------------- /vllm/model_executor/models/rvl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/models/rvl.py -------------------------------------------------------------------------------- /vllm/model_executor/parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/parameter.py -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/model_executor/utils.py -------------------------------------------------------------------------------- /vllm/model_executor/warmup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/multimodal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/__init__.py -------------------------------------------------------------------------------- /vllm/multimodal/audio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/audio.py -------------------------------------------------------------------------------- /vllm/multimodal/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/base.py -------------------------------------------------------------------------------- /vllm/multimodal/cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/cache.py -------------------------------------------------------------------------------- /vllm/multimodal/hasher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/hasher.py -------------------------------------------------------------------------------- /vllm/multimodal/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/image.py -------------------------------------------------------------------------------- /vllm/multimodal/inputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/inputs.py -------------------------------------------------------------------------------- /vllm/multimodal/parse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/parse.py -------------------------------------------------------------------------------- /vllm/multimodal/processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/processing.py -------------------------------------------------------------------------------- /vllm/multimodal/profiling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/profiling.py -------------------------------------------------------------------------------- /vllm/multimodal/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/registry.py -------------------------------------------------------------------------------- /vllm/multimodal/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/utils.py -------------------------------------------------------------------------------- /vllm/multimodal/video.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/multimodal/video.py -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/outputs.py -------------------------------------------------------------------------------- /vllm/platforms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/__init__.py -------------------------------------------------------------------------------- /vllm/platforms/cpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/cpu.py -------------------------------------------------------------------------------- /vllm/platforms/cuda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/cuda.py -------------------------------------------------------------------------------- /vllm/platforms/interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/interface.py -------------------------------------------------------------------------------- /vllm/platforms/rocm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/rocm.py -------------------------------------------------------------------------------- /vllm/platforms/tpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/tpu.py -------------------------------------------------------------------------------- /vllm/platforms/xpu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/platforms/xpu.py -------------------------------------------------------------------------------- /vllm/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/plugins/__init__.py -------------------------------------------------------------------------------- /vllm/plugins/lora_resolvers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/pooling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/pooling_params.py -------------------------------------------------------------------------------- /vllm/profiler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/profiler/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/profiler/utils.py -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 3 | -------------------------------------------------------------------------------- /vllm/ray/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/ray/lazy_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/ray/lazy_utils.py -------------------------------------------------------------------------------- /vllm/ray/ray_env.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/ray/ray_env.py -------------------------------------------------------------------------------- /vllm/reasoning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/reasoning/__init__.py -------------------------------------------------------------------------------- /vllm/sampling_params.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/sampling_params.py -------------------------------------------------------------------------------- /vllm/scalar_type.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/scalar_type.py -------------------------------------------------------------------------------- /vllm/scripts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/scripts.py -------------------------------------------------------------------------------- /vllm/sequence.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/sequence.py -------------------------------------------------------------------------------- /vllm/tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/tasks.py -------------------------------------------------------------------------------- /vllm/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/test_utils.py -------------------------------------------------------------------------------- /vllm/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/third_party/pynvml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/third_party/pynvml.py -------------------------------------------------------------------------------- /vllm/tracing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/tracing.py -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/transformers_utils/config.py -------------------------------------------------------------------------------- /vllm/transformers_utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/transformers_utils/utils.py -------------------------------------------------------------------------------- /vllm/triton_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/triton_utils/__init__.py -------------------------------------------------------------------------------- /vllm/triton_utils/importing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/triton_utils/importing.py -------------------------------------------------------------------------------- /vllm/usage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/usage/usage_lib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/usage/usage_lib.py -------------------------------------------------------------------------------- /vllm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/utils/__init__.py -------------------------------------------------------------------------------- /vllm/utils/deep_gemm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/utils/deep_gemm.py -------------------------------------------------------------------------------- /vllm/utils/flashinfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/utils/flashinfer.py -------------------------------------------------------------------------------- /vllm/utils/jsontree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/utils/jsontree.py -------------------------------------------------------------------------------- /vllm/utils/tensor_schema.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/utils/tensor_schema.py -------------------------------------------------------------------------------- /vllm/v1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/attention/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/attention/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/attention/backends/mla/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/core/block_pool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/block_pool.py -------------------------------------------------------------------------------- /vllm/v1/core/kv_cache_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/kv_cache_manager.py -------------------------------------------------------------------------------- /vllm/v1/core/kv_cache_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/kv_cache_utils.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/core/sched/interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/sched/interface.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/sched/output.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/sched/scheduler.py -------------------------------------------------------------------------------- /vllm/v1/core/sched/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/core/sched/utils.py -------------------------------------------------------------------------------- /vllm/v1/cudagraph_dispatcher.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/cudagraph_dispatcher.py -------------------------------------------------------------------------------- /vllm/v1/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/__init__.py -------------------------------------------------------------------------------- /vllm/v1/engine/async_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/async_llm.py -------------------------------------------------------------------------------- /vllm/v1/engine/coordinator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/coordinator.py -------------------------------------------------------------------------------- /vllm/v1/engine/core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/core.py -------------------------------------------------------------------------------- /vllm/v1/engine/core_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/core_client.py -------------------------------------------------------------------------------- /vllm/v1/engine/detokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/detokenizer.py -------------------------------------------------------------------------------- /vllm/v1/engine/exceptions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/exceptions.py -------------------------------------------------------------------------------- /vllm/v1/engine/llm_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/llm_engine.py -------------------------------------------------------------------------------- /vllm/v1/engine/logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/logprobs.py -------------------------------------------------------------------------------- /vllm/v1/engine/processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/processor.py -------------------------------------------------------------------------------- /vllm/v1/engine/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/engine/utils.py -------------------------------------------------------------------------------- /vllm/v1/executor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/executor/abstract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/executor/abstract.py -------------------------------------------------------------------------------- /vllm/v1/executor/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/executor/utils.py -------------------------------------------------------------------------------- /vllm/v1/kv_cache_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/kv_cache_interface.py -------------------------------------------------------------------------------- /vllm/v1/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/metrics/loggers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/metrics/loggers.py -------------------------------------------------------------------------------- /vllm/v1/metrics/prometheus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/metrics/prometheus.py -------------------------------------------------------------------------------- /vllm/v1/metrics/ray_wrappers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/metrics/ray_wrappers.py -------------------------------------------------------------------------------- /vllm/v1/metrics/reader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/metrics/reader.py -------------------------------------------------------------------------------- /vllm/v1/metrics/stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/metrics/stats.py -------------------------------------------------------------------------------- /vllm/v1/outputs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/outputs.py -------------------------------------------------------------------------------- /vllm/v1/pool/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/pool/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/pool/metadata.py -------------------------------------------------------------------------------- /vllm/v1/request.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/request.py -------------------------------------------------------------------------------- /vllm/v1/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/sample/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/metadata.py -------------------------------------------------------------------------------- /vllm/v1/sample/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/sample/ops/bad_words.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/ops/bad_words.py -------------------------------------------------------------------------------- /vllm/v1/sample/ops/logprobs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/ops/logprobs.py -------------------------------------------------------------------------------- /vllm/v1/sample/ops/penalties.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/ops/penalties.py -------------------------------------------------------------------------------- /vllm/v1/sample/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/sampler.py -------------------------------------------------------------------------------- /vllm/v1/sample/tpu/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/sample/tpu/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/tpu/metadata.py -------------------------------------------------------------------------------- /vllm/v1/sample/tpu/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/sample/tpu/sampler.py -------------------------------------------------------------------------------- /vllm/v1/serial_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/serial_utils.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/spec_decode/eagle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/spec_decode/eagle.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/medusa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/spec_decode/medusa.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/spec_decode/metadata.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/spec_decode/metrics.py -------------------------------------------------------------------------------- /vllm/v1/spec_decode/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/spec_decode/utils.py -------------------------------------------------------------------------------- /vllm/v1/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/utils.py -------------------------------------------------------------------------------- /vllm/v1/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/v1/worker/block_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/block_table.py -------------------------------------------------------------------------------- /vllm/v1/worker/cpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/cpu_worker.py -------------------------------------------------------------------------------- /vllm/v1/worker/gpu_input_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/gpu_input_batch.py -------------------------------------------------------------------------------- /vllm/v1/worker/gpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/gpu_worker.py -------------------------------------------------------------------------------- /vllm/v1/worker/tpu_input_batch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/tpu_input_batch.py -------------------------------------------------------------------------------- /vllm/v1/worker/tpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/tpu_worker.py -------------------------------------------------------------------------------- /vllm/v1/worker/ubatch_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/ubatch_utils.py -------------------------------------------------------------------------------- /vllm/v1/worker/ubatching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/ubatching.py -------------------------------------------------------------------------------- /vllm/v1/worker/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/utils.py -------------------------------------------------------------------------------- /vllm/v1/worker/worker_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/worker_base.py -------------------------------------------------------------------------------- /vllm/v1/worker/xpu_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/v1/worker/xpu_worker.py -------------------------------------------------------------------------------- /vllm/version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/version.py -------------------------------------------------------------------------------- /vllm/vllm_flash_attn/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/cache_engine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/worker/cache_engine.py -------------------------------------------------------------------------------- /vllm/worker/model_runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/worker/model_runner.py -------------------------------------------------------------------------------- /vllm/worker/model_runner_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/worker/model_runner_base.py -------------------------------------------------------------------------------- /vllm/worker/worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/worker/worker.py -------------------------------------------------------------------------------- /vllm/worker/worker_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fyabc/vllm/HEAD/vllm/worker/worker_base.py --------------------------------------------------------------------------------