├── .coveragerc ├── docs ├── assets │ ├── torchax.png │ ├── get_model.png │ ├── model-fn.png │ ├── sandwich.png │ ├── wrap_model.png │ ├── tpu_inference-light.png │ ├── tpu_inference_dark.png │ ├── tpu_inference_light.png │ ├── tpu-inference-banner.png │ ├── tpu_inference_dark_2.png │ ├── tpu_inference_dark_mode.png │ ├── tpu_inference_light_mode.png │ ├── tpu_header_new_preview_v1.png │ ├── tpu_header_new_preview_v2.png │ ├── tpu_inference_dark_20px_space.png │ ├── tpu_inference_dark_mode_short.png │ ├── tpu_inference_dark_more_space.png │ └── tpu_inference_light_mode_short.png ├── requirements.txt ├── recommended_models_features.md └── getting_started │ ├── installation.md │ └── quickstart.md ├── MANIFEST.in ├── support_matrices ├── multimodal_model_support_matrix.csv ├── parallelism_support_matrix.csv ├── nightly │ ├── parallelism_support_matrix.csv │ ├── multimodal_model_support_matrix.csv │ ├── kernel_support_matrix.csv │ ├── quantization_support_matrix.csv │ ├── text_only_model_support_matrix.csv │ └── feature_support_matrix.csv ├── text_only_model_support_matrix.csv ├── kernel_support_matrix.csv ├── quantization_support_matrix.csv └── feature_support_matrix.csv ├── requirements_benchmarking.txt ├── .buildkite ├── features │ ├── default_features.txt │ ├── sampling_params.yml │ ├── Single-Host-P-D-disaggregation.yml │ ├── Structured_Decoding.yml │ ├── Hybrid_kvcache.yml │ ├── runai_model_streamer_loader.yml │ ├── MLA.yml │ ├── MoE.yml │ ├── Quantized_Matmul.yml │ ├── Multimodal_Inputs.yml │ ├── Quantized_KV_Cache.yml │ ├── Quantized_Attention.yml │ ├── data_parallelism.yml │ ├── async_scheduler.yml │ ├── LoRA_Torch.yml │ ├── KV_Cache_Host_Offloading.yml │ ├── DCN-Based_P-D_disaggregation.yml │ └── Collective_Communication_Matmul.yml ├── pipeline_generation │ ├── constant.py │ ├── feature_template.yml │ └── vllm_native_model_template.yml ├── scripts │ ├── notify_test_results.sh │ ├── check_results.sh │ ├── run_disagg.sh │ ├── record_step_result.sh │ ├── 
commit_verified_commit_hashes.sh │ └── commit_support_matrices.sh ├── parallelism │ ├── CP.yml │ ├── EP.yml │ ├── SP.yml │ ├── TP.yml │ └── DP.yml └── quantization │ ├── AWQ_INT4.yml │ ├── FP8_W8A8.yml │ ├── FP4_W4A16.yml │ ├── FP8_W8A16.yml │ ├── INT8_W8A8.yml │ └── INT4_W4A16.yml ├── requirements.txt ├── tpu_inference ├── logger.py ├── env_override.py ├── core │ ├── __init__.py │ ├── sched │ │ └── __init__.py │ └── disagg_utils.py ├── kernels │ ├── __init__.py │ ├── mla │ │ ├── __init__.py │ │ └── v1 │ │ │ └── __init__.py │ ├── fused_moe │ │ ├── __init__.py │ │ └── v1 │ │ │ └── __init__.py │ ├── megablox │ │ ├── __init__.py │ │ └── common.py │ ├── collectives │ │ ├── __init__.py │ │ ├── all_gather_matmul_tuned_block_sizes.py │ │ └── util.py │ ├── flash_attention │ │ └── __init__.py │ ├── quantized_matmul │ │ ├── __init__.py │ │ └── util.py │ └── ragged_paged_attention │ │ ├── __init__.py │ │ ├── v2 │ │ └── __init__.py │ │ └── v3 │ │ ├── __init__.py │ │ └── util.py ├── layers │ ├── __init__.py │ ├── jax │ │ ├── __init__.py │ │ ├── moe │ │ │ └── __init__.py │ │ ├── sample │ │ │ └── __init__.py │ │ ├── attention │ │ │ └── __init__.py │ │ ├── misc.py │ │ ├── glossary.md │ │ └── pp_utils.py │ ├── vllm │ │ ├── __init__.py │ │ └── quantization │ │ │ └── compressed_tensors │ │ │ ├── __init__.py │ │ │ └── schemes │ │ │ └── __init__.py │ └── common │ │ ├── __init__.py │ │ ├── quant_methods.py │ │ └── attention_metadata.py ├── lora │ └── __init__.py ├── models │ ├── __init__.py │ ├── jax │ │ ├── __init__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── qwix │ │ │ ├── __init__.py │ │ │ └── configs │ │ │ ├── int8_all_modules_w_only.yaml │ │ │ ├── fp8_all_modules_w_only.yaml │ │ │ ├── int8_default.yaml │ │ │ └── fp8_default.yaml │ ├── vllm │ │ ├── __init__.py │ │ └── vllm_model_wrapper_context.py │ └── common │ │ └── __init__.py ├── runner │ └── __init__.py ├── worker │ └── __init__.py ├── distributed │ └── __init__.py ├── executors │ └── __init__.py ├── experimental │ └── 
__init__.py ├── spec_decode │ ├── __init__.py │ └── jax │ │ └── __init__.py └── platforms │ └── __init__.py ├── requirements_v7x.txt ├── .dockerignore ├── pyproject.toml ├── tests ├── __init__.py ├── core │ ├── __init__.py │ ├── test_disagg_executor.py │ └── test_init.py ├── e2e │ ├── __init__.py │ ├── test_structured_decoding.py │ └── benchmarking │ │ └── bench_utils.sh ├── lora │ ├── __init__.py │ ├── conftest.py │ ├── test_bgmv.py │ └── test_lora_perf.py ├── executors │ └── __init__.py ├── kernels │ ├── __init__.py │ └── collectives │ │ └── __init__.py ├── layers │ ├── __init__.py │ ├── jax │ │ ├── __init__.py │ │ ├── moe │ │ │ └── __init__.py │ │ ├── attention │ │ │ └── __init__.py │ │ └── sample │ │ │ └── __init__.py │ ├── common │ │ └── __init__.py │ └── vllm │ │ ├── __init__.py │ │ └── test_fp8.py ├── models │ ├── __init__.py │ ├── jax │ │ ├── __init__.py │ │ └── utils │ │ │ └── __init__.py │ └── common │ │ └── __init__.py ├── platforms │ ├── __init__.py │ └── test_tpu_platform.py ├── runner │ └── __init__.py ├── worker │ └── __init__.py ├── distributed │ └── __init__.py ├── experimental │ └── __init__.py ├── spec_decode │ └── __init__.py └── scripts │ └── run_rpa_v3_tests.sh ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ ├── pre-commit.yml │ ├── check_ready_label.yml │ └── release.yml ├── PULL_REQUEST_TEMPLATE │ └── MODELING_CODE_PR.md ├── ISSUE_TEMPLATE │ ├── 100-documentation.yml │ ├── 500-feature-request.yml │ └── 200-installation.yml ├── scripts │ └── determine_release_vars.sh └── CODEOWNERS ├── .readthedocs.yaml ├── DCO ├── scripts └── vllm │ └── integration │ └── conftest.py └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | tpu_inference/kernels/* 4 | -------------------------------------------------------------------------------- /docs/assets/torchax.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/torchax.png -------------------------------------------------------------------------------- /docs/assets/get_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/get_model.png -------------------------------------------------------------------------------- /docs/assets/model-fn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/model-fn.png -------------------------------------------------------------------------------- /docs/assets/sandwich.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/sandwich.png -------------------------------------------------------------------------------- /docs/assets/wrap_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/wrap_model.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include README.md 3 | include tpu_inference/models/jax/utils/quantization/configs/*.yaml 4 | -------------------------------------------------------------------------------- /docs/assets/tpu_inference-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference-light.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light.png -------------------------------------------------------------------------------- /docs/assets/tpu-inference-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu-inference-banner.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_2.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_mode.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_light_mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light_mode.png -------------------------------------------------------------------------------- /docs/assets/tpu_header_new_preview_v1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_header_new_preview_v1.png -------------------------------------------------------------------------------- /docs/assets/tpu_header_new_preview_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_header_new_preview_v2.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark_20px_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_20px_space.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark_mode_short.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_mode_short.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_dark_more_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_more_space.png -------------------------------------------------------------------------------- /docs/assets/tpu_inference_light_mode_short.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light_mode_short.png -------------------------------------------------------------------------------- /support_matrices/multimodal_model_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Model,UnitTest,IntegrationTest,Benchmark 2 | 
"meta-llama/Llama-4-Maverick-17B-128E-Instruct",unverified,unverified,unverified 3 | "Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅ 4 | -------------------------------------------------------------------------------- /requirements_benchmarking.txt: -------------------------------------------------------------------------------- 1 | # These are required for running any benchmarking 2 | # See more information at scripts/vllm/benchmarking/README.md 3 | nltk 4 | evaluate 5 | datasets 6 | rouge-score 7 | scikit-learn 8 | pandas 9 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-api-autonav 3 | mkdocs-material 4 | mkdocs-awesome-nav 5 | mkdocs-glightbox 6 | python-markdown-math 7 | mkdocs-same-dir 8 | mkdocs-open-in-new-tab 9 | mkdocs-table-reader-plugin 10 | -------------------------------------------------------------------------------- /.buildkite/features/default_features.txt: -------------------------------------------------------------------------------- 1 | Chunked Prefill (feature support matrix) 2 | Prefix Caching (feature support matrix) 3 | Ragged Paged Attention V3 (kernel support matrix) 4 | Single Program Multi Data (feature support matrix) 5 | -------------------------------------------------------------------------------- /support_matrices/parallelism_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 | "CP",unverified,unverified 3 | "DP",❌,N/A 4 | "EP",unverified,unverified 5 | "PP",✅,✅ 6 | "SP",unverified,unverified 7 | "TP",unverified,unverified 8 | -------------------------------------------------------------------------------- /support_matrices/nightly/parallelism_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 
| "CP",unverified,unverified 3 | "DP",✅,unverified 4 | "EP",unverified,unverified 5 | "PP",✅,✅ 6 | "SP",unverified,unverified 7 | "TP",unverified,unverified 8 | -------------------------------------------------------------------------------- /support_matrices/nightly/multimodal_model_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Model,UnitTest,IntegrationTest,Benchmark 2 | "meta-llama/Llama-4-Maverick-17B-128E-Instruct",unverified,unverified,unverified 3 | "Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅ 4 | "Qwen/Qwen3-Omni-30B-A3B-Instruct",unverified,unverified,unverified 5 | -------------------------------------------------------------------------------- /support_matrices/text_only_model_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Model,UnitTest,IntegrationTest,Benchmark 2 | "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅ 3 | "Qwen/Qwen3-4B",✅,✅,✅ 4 | "google/gemma-3-27b-it",✅,✅,✅ 5 | "Qwen/Qwen3-32B",✅,✅,✅ 6 | "meta-llama/Llama-Guard-4-12B",✅,✅,✅ 7 | "meta-llama/Llama-3.1-8B-Instruct",✅,✅,✅ 8 | "Qwen/Qwen3-30B-A3B",✅,✅,✅ 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tpu-info==0.7.1 2 | yapf==0.43.0 3 | pytest 4 | pytest-mock 5 | absl-py 6 | numpy 7 | google-cloud-storage 8 | jax[tpu]==0.8.0 9 | jaxlib==0.8.0 10 | jaxtyping 11 | flax==0.11.1 12 | torchax==0.0.10 13 | qwix==0.1.1 14 | torchvision==0.24.0 15 | pathwaysutils 16 | parameterized 17 | numba==0.62.1 18 | runai-model-streamer[s3,gcs]==0.15.0 19 | -------------------------------------------------------------------------------- /support_matrices/kernel_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 | "Collective Communication Matmul",✅,unverified 3 | 
"MLA",unverified,unverified 4 | "MoE",unverified,unverified 5 | "Quantized Attention",unverified,unverified 6 | "Quantized KV Cache",unverified,unverified 7 | "Quantized Matmul",unverified,unverified 8 | "Ragged Paged Attention V3",✅,✅ 9 | -------------------------------------------------------------------------------- /support_matrices/quantization_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,Recommended TPU Generations,CorrectnessTest,PerformanceTest 2 | "AWQ INT4","v5, v6",unverified,unverified 3 | "FP4 W4A16",v7,unverified,unverified 4 | "FP8 W8A8",v7,unverified,unverified 5 | "FP8 W8A16",v7,unverified,unverified 6 | "INT4 W4A16","v5, v6",unverified,unverified 7 | "INT8 W8A8","v5, v6",unverified,unverified 8 | -------------------------------------------------------------------------------- /support_matrices/nightly/kernel_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 | "Collective Communication Matmul",✅,unverified 3 | "MLA",unverified,unverified 4 | "MoE",unverified,unverified 5 | "Quantized Attention",unverified,unverified 6 | "Quantized KV Cache",unverified,unverified 7 | "Quantized Matmul",unverified,unverified 8 | "Ragged Paged Attention V3",✅,✅ 9 | -------------------------------------------------------------------------------- /support_matrices/nightly/quantization_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,Recommended TPU Generations,CorrectnessTest,PerformanceTest 2 | "AWQ INT4","v5, v6",unverified,unverified 3 | "FP4 W4A16",v7,unverified,unverified 4 | "FP8 W8A8",v7,unverified,unverified 5 | "FP8 W8A16",v7,unverified,unverified 6 | "INT4 W4A16","v5, v6",unverified,unverified 7 | "INT8 W8A8","v5, v6",unverified,unverified 8 | -------------------------------------------------------------------------------- 
/tpu_inference/logger.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from vllm.logger import _VllmLogger 4 | from vllm.logger import init_logger as init_vllm_logger 5 | 6 | 7 | def init_logger(name: str) -> _VllmLogger: 8 | # Prepend the root "vllm" to the module path to use vllm's configured logger. 9 | patched_name = "vllm." + name 10 | return init_vllm_logger(patched_name) 11 | -------------------------------------------------------------------------------- /requirements_v7x.txt: -------------------------------------------------------------------------------- 1 | # This file contains additional dependencies needed for TPU v7x support. 2 | # It is expected to be used in conjunction with the main requirements.txt file. 3 | --pre 4 | -i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ 5 | -f https://storage.googleapis.com/jax-releases/libtpu_releases.html 6 | jax==0.8.1 7 | jaxlib==0.8.1 8 | jaxtyping==0.3.2 9 | libtpu==0.0.31 10 | -------------------------------------------------------------------------------- /tpu_inference/env_override.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # SPDX-FileCopyrightText: Copyright contributors to the tpu-inference project 3 | 4 | import os 5 | 6 | # Disable CUDA-specific shared experts stream for TPU 7 | # This prevents errors when trying to create CUDA streams on TPU hardware 8 | # The issue was introduced by vllm-project/vllm#26440 9 | os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1" 10 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.venv 2 | /build 3 | dist 4 | vllm/*.so 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | .mypy_cache 12 | 13 | # 
Distribution / packaging 14 | .Python 15 | /build/ 16 | cmake-build-*/ 17 | CMakeUserPresets.json 18 | develop-eggs/ 19 | /dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | -------------------------------------------------------------------------------- /support_matrices/feature_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 | "Chunked Prefill",✅,✅ 3 | "DCN-based P/D disaggregation",unverified,unverified 4 | "KV cache host offloading",unverified,unverified 5 | "LoRA_Torch",✅,unverified 6 | "Multimodal Inputs",✅,✅ 7 | "Out-of-tree model support",✅,✅ 8 | "Prefix Caching",✅,✅ 9 | "Single Program Multi Data",✅,✅ 10 | "Speculative Decoding: Eagle3",✅,✅ 11 | "Speculative Decoding: Ngram",✅,✅ 12 | "async scheduler",✅,✅ 13 | "runai_model_streamer_loader",✅,N/A 14 | "sampling_params",✅,N/A 15 | "structured_decoding",✅,N/A 16 | -------------------------------------------------------------------------------- /support_matrices/nightly/text_only_model_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Model,UnitTest,IntegrationTest,Benchmark 2 | "moonshotai/Kimi-K2-Thinking",unverified,unverified,unverified 3 | "Qwen/Qwen3-Coder-480B-A35B-Instruct",unverified,unverified,unverified 4 | "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅ 5 | "Qwen/Qwen3-4B",✅,✅,✅ 6 | "google/gemma-3-27b-it",✅,✅,✅ 7 | "Qwen/Qwen3-32B",✅,✅,✅ 8 | "deepseek-ai/DeepSeek-V3.1",unverified,unverified,unverified 9 | "meta-llama/Llama-Guard-4-12B",✅,✅,✅ 10 | "openai/gpt-oss-120b",unverified,unverified,unverified 11 | "meta-llama/Llama-3.1-8B-Instruct",✅,✅,✅ 12 | "Qwen/Qwen3-30B-A3B",✅,✅,✅ 13 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /support_matrices/nightly/feature_support_matrix.csv: -------------------------------------------------------------------------------- 1 | Feature,CorrectnessTest,PerformanceTest 2 | "Chunked Prefill",✅,✅ 3 | "DCN-based P/D disaggregation",unverified,✅ 4 | "KV cache host offloading",unverified,unverified 5 | "LoRA_Torch",✅,✅ 6 | "Multimodal Inputs",✅,✅ 7 | "Out-of-tree model support",✅,✅ 8 | "Prefix Caching",✅,✅ 9 | "Single Program Multi Data",✅,✅ 10 | "Single-Host-P-D-disaggregation",N/A,N/A 11 | "Speculative Decoding: Eagle3",✅,✅ 12 | "Speculative Decoding: Ngram",✅,✅ 13 | "async scheduler",✅,✅ 14 | "data_parallelism",✅,❌ 15 | "runai_model_streamer_loader",✅,N/A 16 | "sampling_params",✅,N/A 17 | "structured_decoding",✅,N/A 18 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/executors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/layers/jax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/models/jax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/platforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/runner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/layers/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/layers/jax/moe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/layers/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/models/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/spec_decode/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/runner/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/worker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/kernels/collectives/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/layers/jax/attention/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/layers/jax/sample/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/models/jax/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/core/sched/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/executors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/mla/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/layers/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/models/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/spec_decode/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/fused_moe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/megablox/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/mla/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/moe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/sample/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/models/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/spec_decode/jax/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/collectives/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/fused_moe/v1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/attention/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/qwix/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/flash_attention/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/quantized_matmul/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/ragged_paged_attention/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/ragged_paged_attention/v2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/kernels/ragged_paged_attention/v3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tpu_inference/platforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # ruff: noqa 16 | from tpu_inference.platforms.tpu_platform import TpuPlatform 17 | -------------------------------------------------------------------------------- /tests/layers/vllm/test_fp8.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | 17 | pytest.skip("FP8 implementation not complete yet", allow_module_level=True) 18 | -------------------------------------------------------------------------------- /.buildkite/pipeline_generation/constant.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | QUEUE_TO_TENSOR_PARALLEL_SIZE_MAP = { 16 | "tpu_v6e_queue": 1, 17 | "tpu_v6e_8_queue": 8, 18 | } 19 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/qwix/configs/int8_all_modules_w_only.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | qwix: 16 | rules: 17 | # NOTE: each entry corresponds to a qwix.QuantizationRule 18 | - module_path: '.*' 19 | weight_qtype: 'int8' 20 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/qwix/configs/fp8_all_modules_w_only.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | qwix: 16 | rules: 17 | # NOTE: each entry corresponds to a qwix.QuantizationRule 18 | - module_path: '.*' 19 | weight_qtype: 'float8_e4m3fn' 20 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/qwix/configs/int8_default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | qwix: 16 | rules: 17 | # NOTE: each entry corresponds to a qwix.QuantizationRule 18 | - module_path: '.*' 19 | weight_qtype: 'int8' 20 | act_qtype: 'int8' 21 | -------------------------------------------------------------------------------- /tpu_inference/models/jax/utils/qwix/configs/fp8_default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
UNQUANTIZED = "unquantized"
MXFP4 = "mxfp4"
AWQ = "awq"
COMPRESSED_TENSORS = "compressed-tensors"
FP8 = "fp8"


def get_tpu_quant_method(quant_method: str) -> str:
    """Return the TPU-specific name of a quantization method.

    Args:
        quant_method: Base quantization method name, e.g. one of the
            module-level constants such as ``FP8`` or ``AWQ``.

    Returns:
        ``quant_method`` with the ``"tpu-"`` prefix prepended.
    """
    tpu_prefix = "tpu-"
    return tpu_prefix + quant_method
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | ANY_FAILED=$(buildkite-agent meta-data get "CI_TESTS_FAILED") 19 | FAILURE_LABEL="Not all models and/or features passed" 20 | 21 | echo "--- Checking test outcomes" 22 | 23 | if [ "${ANY_FAILED}" = "true" ] ; then 24 | echo "${FAILURE_LABEL}" 25 | exit 1 26 | else 27 | echo "All models & features passed." 28 | fi 29 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Start with a short description of what the PR does and how this is a change from 4 | the past. 5 | 6 | The rest of the description includes relevant details and context, examples: 7 | - why is this change being made, 8 | - the problem being solved and any relevant context, 9 | - why this is a good solution, 10 | - some information about the specific implementation, 11 | - shortcomings of the solution and possible future improvements. 12 | 13 | If the change fixes a Github issue, please include a link, e.g.,: 14 | FIXES: #123456 15 | 16 | # Tests 17 | 18 | Please describe how you tested this change, and include any instructions and/or 19 | commands to reproduce. 20 | 21 | # Checklist 22 | 23 | Before submitting this PR, please make sure: 24 | - I have performed a self-review of my code. 25 | - I have necessary comments in my code, particularly in hard-to-understand areas. 26 | - I have made or will make corresponding changes to any relevant documentation. 
27 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: pre-commit 16 | 17 | on: 18 | pull_request: 19 | push: 20 | branches: [main] 21 | 22 | jobs: 23 | pre-commit: 24 | runs-on: ubuntu-latest 25 | steps: 26 | - uses: actions/checkout@v4 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: '3.12' 31 | - name: Install pre-commit 32 | run: pip install pre-commit 33 | - name: Run pre-commit 34 | uses: pre-commit/action@v3.0.1 35 | with: 36 | extra_args: --all-files 37 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import math 16 | from typing import Tuple 17 | 18 | import jax 19 | from jax.sharding import NamedSharding 20 | from jax.sharding import PartitionSpec as P 21 | 22 | 23 | # TODO(xiang): move this to weight_utils.py 24 | def shard_put(x: jax.Array, sharding_names: Tuple[str, ...] | P, 25 | mesh: jax.sharding.Mesh) -> jax.Array: 26 | # Single device sharding requires this special handling 27 | # to avoid the recursive jit error. 28 | if math.prod(mesh.axis_sizes) == 1: 29 | return jax.device_put(x, mesh.devices.flatten()[0]) 30 | return jax.device_put(x, NamedSharding(mesh, P(*sharding_names))) 31 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Read the Docs configuration file 16 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 17 | 18 | # Required 19 | version: 2 20 | 21 | # Set the OS, Python version, and other tools you might need 22 | build: 23 | os: ubuntu-24.04 24 | tools: 25 | python: "3.13" 26 | 27 | # Build documentation with Mkdocs 28 | mkdocs: 29 | configuration: mkdocs.yml 30 | 31 | # Optionally, but recommended, 32 | # declare the Python requirements required to build your documentation 33 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 34 | python: 35 | install: 36 | - requirements: docs/requirements.txt 37 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/MODELING_CODE_PR.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Start with a short description of what the PR does and how this is a change from 4 | the past. 5 | 6 | The rest of the description includes relevant details and context, examples: 7 | - why is this change being made, 8 | - the problem being solved and any relevant context, 9 | - why this is a good solution, 10 | - some information about the specific implementation, 11 | - shortcomings of the solution and possible future improvements. 12 | 13 | If the change fixes a Github issue, please include a link, e.g.,: 14 | FIXES: #123456 15 | 16 | # Tests 17 | 18 | Please describe how you tested this change, and include any instructions and/or 19 | commands to reproduce. 20 | 21 | # Checklist 22 | 23 | Before submitting this PR, please make sure (put X in square brackets): 24 | - [ ] I have performed a self-review of my code. 25 | - [ ] I have necessary comments in my code, particularly in hard-to-understand areas. 26 | - [ ] I have made or will make corresponding changes to any relevant documentation. 
27 | - [ ] I have reviewed the uLLM modeling code checklist: https://docs.google.com/document/d/1DGQBVvr2bh4G8tBUO1YH8pO7Dd_myw5rfEMfVDymEk8/edit?resourcekey=0-V7MGHu3aQjJH6YrI3-y8Hg&tab=t.t91cyovog2mr#heading=h.cqdzv8mlszca 28 | - [ ] I have received at least 1 readability approval and 1 correctness approval. 29 | -------------------------------------------------------------------------------- /tests/scripts/run_rpa_v3_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | # Install dependencies 18 | pip install -U --pre jax jaxlib libtpu requests -i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ -f https://storage.googleapis.com/jax-releases/libtpu_releases.html 19 | 20 | TPU_INFERENCE_DIR="/workspace/tpu_inference/" 21 | 22 | # RPA v3 test files - add new tests here 23 | RPA_V3_TESTS=( 24 | "tests/kernels/ragged_paged_attention_kernel_v3_test.py" 25 | "tests/layers/attention/test_deepseek_v3_attention.py" 26 | ) 27 | 28 | # Convert array to space-separated string for pytest 29 | FULL_PATHS=() 30 | for test in "${RPA_V3_TESTS[@]}"; do 31 | FULL_PATHS+=("$TPU_INFERENCE_DIR/$test") 32 | done 33 | 34 | # Run all tests in a single pytest command 35 | pytest "${FULL_PATHS[@]}" 36 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 6 | Everyone is permitted to copy and distribute verbatim copies of this 7 | license document, but changing it is not allowed. 
8 | 9 | 10 | Developer's Certificate of Origin 1.1 11 | 12 | By making a contribution to this project, I certify that: 13 | 14 | (a) The contribution was created in whole or in part by me and I 15 | have the right to submit it under the open source license 16 | indicated in the file; or 17 | 18 | (b) The contribution is based upon previous work that, to the best 19 | of my knowledge, is covered under an appropriate open source 20 | license and I have the right under that license to submit that 21 | work with modifications, whether created in whole or in part 22 | by me, under the same open source license (unless I am 23 | permitted to submit under a different license), as indicated 24 | in the file; or 25 | 26 | (c) The contribution was provided directly to me by some other 27 | person who certified (a), (b) or (c) and I have not modified 28 | it. 29 | 30 | (d) I understand and agree that this project and the contribution 31 | are public and that a record of the contribution (including all 32 | personal information I submit with it, including my sign-off) is 33 | maintained indefinitely and may be redistributed consistent with 34 | this project or the open source license(s) involved. 35 | -------------------------------------------------------------------------------- /.buildkite/scripts/check_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | ANY_FAILED=false 19 | if [ "$#" -lt 2 ]; then 20 | echo "Usage: $0 ..." 21 | exit 1 22 | fi 23 | 24 | FAILURE_LABEL="$1" 25 | shift 26 | 27 | echo "--- Checking Test Outcomes" 28 | 29 | for KEY in "$@"; do 30 | OUTCOME=$(buildkite-agent step get "outcome" --step "${KEY}" || echo "skipped") 31 | echo "Step ${KEY} outcome: ${OUTCOME}" 32 | 33 | if [ "${OUTCOME}" != "passed" ] && [ "${OUTCOME}" != "skipped" ] ; then 34 | ANY_FAILED=true 35 | fi 36 | done 37 | 38 | if [ "${ANY_FAILED}" = "true" ] ; then 39 | cat <<- YAML | buildkite-agent pipeline upload 40 | steps: 41 | - label: "${FAILURE_LABEL}" 42 | agents: 43 | queue: cpu 44 | command: echo "${FAILURE_LABEL}" 45 | YAML 46 | exit 1 47 | else 48 | echo "All relevant TPU tests passed (or were skipped)." 49 | fi 50 | -------------------------------------------------------------------------------- /.github/workflows/check_ready_label.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Enforce Ready Label 16 | 17 | on: 18 | pull_request: 19 | types: [opened, synchronize, reopened, labeled, unlabeled] 20 | 21 | jobs: 22 | check-ready-label: 23 | name: Check for Ready Label 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: Verify label existence 27 | shell: bash 28 | run: | 29 | echo "Checking for 'ready' label..." 30 | LABELS_JSON='${{ toJSON(github.event.pull_request.labels) }}' 31 | echo "Current labels: $LABELS_JSON" 32 | MATCHED_LABEL=$(echo "$LABELS_JSON" | jq -r '.[] | select(.name=="ready") | .name') 33 | 34 | if [ "$MATCHED_LABEL" == "ready" ]; then 35 | echo "'ready' label found. Check passed." 36 | exit 0 37 | else 38 | echo "'ready' label NOT found." 39 | echo "Blocking merge until 'ready' label is applied." 40 | exit 1 41 | fi 42 | -------------------------------------------------------------------------------- /.buildkite/scripts/run_disagg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | set -ex 18 | 19 | IMAGE_NAME='vllm-tpu' 20 | SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) 21 | # Source the environment setup script 22 | # shellcheck disable=SC1091 23 | source "$SCRIPT_DIR/setup_docker_env.sh" 24 | setup_environment $IMAGE_NAME 25 | 26 | SCRIPT_DIR=$SCRIPT_DIR/../../examples/disagg 27 | 28 | # call the /examples/disagg/run_disagg_multi_host.sh script 29 | CONTAINER_PREFIX="disagg-node" 30 | 31 | CONTAINER_PREFIX=${CONTAINER_PREFIX} \ 32 | RUN_IN_BUILDKITE=true \ 33 | MODEL=${MODEL:="Qwen/Qwen3-0.6B"} \ 34 | DOCKER_IMAGE=${IMAGE_NAME}:${BUILDKITE_COMMIT} \ 35 | "$SCRIPT_DIR/run_disagg_multi_host.sh" "$@" 36 | 37 | # clear existing containers 38 | CONTAINERS=$(docker ps -a --filter "name=${CONTAINER_PREFIX}*" -q) 39 | if [ -n "$CONTAINERS" ]; then 40 | # shellcheck disable=SC2086 41 | docker stop $CONTAINERS 42 | # shellcheck disable=SC2086 43 | docker rm -f $CONTAINERS 44 | fi 45 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/glossary.md: -------------------------------------------------------------------------------- 1 | ### Variable Glossary 2 | 3 | | Variable | Full Name | Description | 4 | | :--- | :--- | :--- | 5 | | **B** | Batch Size | The number of samples processed at once. | 6 | | **T** | Sequence Length | The number of tokens in the Query sequence. | 7 | | **S** | Sequence Length | The number of tokens in the Key/Value sequence. | 8 | | **D** | $d_{model}$ | The embedding dimension of the model. | 9 | | **F** | $d_{ff}$ | The hidden dimension of the feed-forward MLP layers. | 10 | | **V** | Vocab Size | The size of the vocabulary. | 11 | | **H** | Head Dimension | The dimension of each attention head, typically $D/N$. | 12 | | **N** | Number of Query Heads | The total number of query heads in multi-head attention. | 13 | | **Q** | Number of Query Heads | Synonymous with **N**. 
| **K** | Number of Key/Value Heads | The total number of key/value heads. |
| **C** | Expert Capacity | The maximum number of tokens an expert can process in an MoE layer. |
| **X** | Activated Experts | The number of activated experts per token in MoE. |
| **G** | Number of Groups | The number of groups for grouped-query attention. |
| **E** | Total Experts | The total number of experts in the MoE layer. |
| **M** | Experts per Group | The number of experts within each group, where $M = E/G$. |
| **A** | Q Lora Rank | Used for DeepSeek models. |
| **L** | Product of QK NoPE Head Dim and V Head Dim | Used for DeepSeek models. |
| **P** | Product of Total (NoPE + RoPE) QK Head Dim and V Head Dim | Used for DeepSeek models. |
| **R** | Product of Number of Attention Heads and V Head Dim | Used for DeepSeek models. |
# Register the dataclass with JAX's pytree machinery. The five array fields
# are listed as `data_fields`; there are no `meta_fields`. The two `*_cpu`
# fields are listed under `drop_fields`.
# NOTE(review): presumably dropped fields are excluded from flatten/unflatten
# and are lost across a jit/tree_map round trip -- confirm against the JAX
# `register_dataclass` documentation.
@functools.partial(
    jax.tree_util.register_dataclass,
    data_fields=[
        "input_positions",
        "block_tables",
        "seq_lens",
        "query_start_loc",
        "request_distribution",
    ],
    meta_fields=[],
    drop_fields=["query_start_loc_cpu", "seq_lens_cpu"],
)
@dataclass
class AttentionMetadata(object):
    """Per-step attention inputs, registered as a JAX pytree.

    Only `input_positions` is required by the constructor; the remaining
    array fields default to None. The shape comments below are the ones
    recorded by the original authors.
    """
    # (padded_total_num_scheduled_tokens,)
    input_positions: jax.Array
    # (max_num_seqs * max_num_blocks_per_req,)
    block_tables: jax.Array = None
    # (max_num_seqs,)
    seq_lens: jax.Array = None
    # (max_num_seqs + 1,)
    query_start_loc: jax.Array = None
    # (3,)
    request_distribution: jax.Array = None

    # Declared with init=False and no default: these are not constructor
    # arguments and do not exist on an instance until assigned explicitly
    # after construction (reading them earlier raises AttributeError).
    query_start_loc_cpu: Any = field(init=False)
    seq_lens_cpu: Any = field(init=False)
def is_disagg_enabled() -> bool:
    """Return True when `envs.PREFILL_SLICES` is set to a non-empty value."""
    # We trigger our code path as long as prefill slices are set. This
    # allows us to test interleave mode effectively with the code path
    # for comparison purposes.
    return bool(envs.PREFILL_SLICES)


def _parse_slices(slices_str: str) -> Tuple[int, ...]:
    """Parse a comma-separated slices string into a tuple of slice specs.

    Each comma-separated entry is either `N`, kept as the int `N`, or `NxM`,
    kept as the pair `(N, M)`. The two dimensions are NOT multiplied.

    For example, `2x2,2x1,4` returns `((2, 2), (2, 1), 4)`.

    NOTE(review): the return annotation says `Tuple[int, ...]`, but `NxM`
    entries are returned as 2-tuples; confirm which form callers expect
    before tightening either the annotation or the code.

    Returns:
        An empty tuple when `slices_str` is empty, otherwise one element per
        comma-separated entry.

    Raises:
        ValueError: if an entry has more than two `x`-separated parts or a
            part is not an integer. The original error is chained as the
            cause of a "Malformed slice string" error.
    """
    if not slices_str:
        return ()

    try:
        slice_sizes = []
        for s in slices_str.split(','):
            dims = s.split('x')
            if len(dims) == 1:
                slice_sizes.append(int(dims[0]))
            elif len(dims) == 2:
                slice_sizes.append((int(dims[0]), int(dims[1])))
            else:
                # Caught by the handler below and re-raised with the full
                # input string for context.
                raise ValueError("Each slice must be in 'N' or 'NxM' format.")
        return tuple(slice_sizes)
    except ValueError as e:
        raise ValueError(f"Malformed slice string: '{slices_str}'") from e


def get_prefill_slices() -> Tuple[int, ...]:
    """Return the parsed `envs.PREFILL_SLICES`, or `()` when it is unset/empty."""
    if not envs.PREFILL_SLICES:
        return ()
    return _parse_slices(envs.PREFILL_SLICES)


def get_decode_slices() -> Tuple[int, ...]:
    """Return the parsed `envs.DECODE_SLICES`, or `()` when it is unset/empty."""
    if not envs.DECODE_SLICES:
        return ()
    return _parse_slices(envs.DECODE_SLICES)
14 | 15 | name: 📚 Documentation 16 | description: Report an issue related to https://github.com/vllm-project/tpu-inference/tree/main/docs 17 | title: "[Doc]: " 18 | labels: ["documentation"] 19 | 20 | body: 21 | - type: textarea 22 | attributes: 23 | label: 📚 The doc issue 24 | description: > 25 | A clear and concise description of what content in https://github.com/vllm-project/tpu-inference/tree/main/docs is an issue. 26 | validations: 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: Suggest a potential alternative/fix 31 | description: > 32 | Tell us how we could improve the documentation in this regard. 33 | - type: markdown 34 | attributes: 35 | value: > 36 | Thanks for contributing 🎉! 37 | - type: checkboxes 38 | id: askllm 39 | attributes: 40 | label: Before submitting a new issue... 41 | options: 42 | - label: Make sure you already searched for relevant issues and checked the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs). 43 | required: true 44 | -------------------------------------------------------------------------------- /tests/lora/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
@pytest.fixture
def dist_init():
    """Initialise a single-process gloo distributed environment for a test.

    Builds an engine config for a small Qwen2 model, makes it the current
    vLLM config, initialises the distributed environment (world size 1,
    rank 0) through a temporary rendezvous file and yields the config.

    Yields:
        The vLLM config created from the engine arguments.
    """
    # Local imports: only this fixture needs them.
    import contextlib
    import os

    engine_args = EngineArgs(
        model="Qwen/Qwen2-1.5B-Instruct",
        max_model_len=64,
        max_num_batched_tokens=64,
        max_num_seqs=4,
    )

    vllm_config = engine_args.create_engine_config()

    # mkstemp() returns an *open* descriptor together with the path. Only the
    # path is needed for the file:// init method, so close the descriptor
    # right away instead of leaking one per test.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    try:
        with set_current_vllm_config(vllm_config):
            init_distributed_environment(
                1,
                0,
                local_rank=0,
                distributed_init_method=f"file://{temp_file}",
                backend="gloo")
            ensure_model_parallel_initialized(1, 1)
            yield vllm_config
        cleanup_dist_env_and_memory(shutdown_ray=True)
    finally:
        # The rendezvous file may already have been removed by the backend.
        with contextlib.suppress(FileNotFoundError):
            os.remove(temp_file)
set -e

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <step_key>" >&2
    exit 1
fi

STEP_KEY="$1"

# These are used to build the meta-data keys below; abort early instead of
# silently writing keys such as "_category" or ":" when one is missing.
: "${CI_TARGET:?CI_TARGET must be set}"
: "${CI_STAGE:?CI_STAGE must be set}"
: "${CI_CATEGORY:?CI_CATEGORY must be set}"

echo "--- Checking ${STEP_KEY} Outcome"

# Try to get the custom string you saved
CUSTOM_STATUS=$(buildkite-agent meta-data get "${STEP_KEY}" --default "")

if [ -n "$CUSTOM_STATUS" ]; then
    OUTCOME="$CUSTOM_STATUS"
else
    # Fall back to the step's own outcome; treat a failed lookup as "skipped".
    OUTCOME=$(buildkite-agent step get "outcome" --step "${STEP_KEY}" || echo "skipped")
fi

echo "Step ${STEP_KEY} outcome: ${OUTCOME}"
message=""

# Map the outcome to the cell text recorded for the support matrix.
case "$OUTCOME" in
    "passed")
        message="✅"
        ;;
    "skipped")
        message="N/A"
        ;;
    "unverified")
        message="unverified"
        ;;
    *)
        message="❌"
        ;;
esac

buildkite-agent meta-data set "${CI_TARGET}_category" "${CI_CATEGORY}"
buildkite-agent meta-data set "${CI_TARGET}:${CI_STAGE}" "${message}"

# Any outcome other than passed/skipped/unverified fails this recording step.
if [ "${OUTCOME}" != "passed" ] && [ "${OUTCOME}" != "skipped" ] && [ "${OUTCOME}" != "unverified" ]; then
    exit 1
fi
# Lookup table of tuned block sizes.
#
# key:   (tpu_version, m, n, k, dtype, tp_size)
# value: (bn, bk)
TUNED_BLOCK_SIZES = {
    # go/keep-sorted start
    (6, 1024, 51200, 5120, 'bfloat16', 8): (6400, 2560),
    (6, 1024, 57344, 8192, 'bfloat16', 8): (7168, 8192),
    (6, 2048, 51200, 5120, 'bfloat16', 8): (1280, 5120),
    (6, 2048, 57344, 8192, 'bfloat16', 8): (1024, 8192),
    (6, 4096, 51200, 5120, 'bfloat16', 8): (3200, 5120),
    (6, 8192, 51200, 5120, 'bfloat16', 8): (1280, 5120),
    # go/keep-sorted end
}


def get_tpu_version() -> int:
    """Return the TPU's numeric version, or -1 when the first device is not a TPU."""
    device_kind = jax.devices()[0].device_kind
    if 'TPU' not in device_kind:
        return -1

    # Strip a trailing " lite" marker before extracting the version.
    lite_suffix = ' lite'
    if device_kind.endswith(lite_suffix):
        device_kind = device_kind[:-len(lite_suffix)]

    # Known spellings: "TPU v6" (v6) and "TPU7x" (v7).
    assert device_kind.startswith('TPU'), device_kind
    first_number = re.search(r'\d+', device_kind)
    return int(first_number.group())


def get_key(
    m,
    n,
    k,
    dtype,
    tp_size,
):
    """Build the `TUNED_BLOCK_SIZES` key for the given problem on this device."""
    tpu_version = get_tpu_version()
    return (tpu_version, m, n, k, dtype, tp_size)


def get_tuned_block_sizes(m, n, k, dtype_name, tp_size):
    """Return the tuned `(bn, bk)` pair, or `(None, None)` when nothing is tuned."""
    lookup_key = get_key(m, n, k, dtype_name, tp_size)
    if lookup_key in TUNED_BLOCK_SIZES:
        return TUNED_BLOCK_SIZES[lookup_key]
    return (None, None)
# pipeline-name: Single-Host-P-D-disaggregation
# pipeline-type: feature support matrix
# NOTE: the category string is spelled "feature support matrix" to match the
# other feature pipelines, so results are grouped under the same category.
steps:
  - label: "Correctness tests for Single-Host P-D disaggregation"
    key: "SingleHostPDDisaggregation_CorrectnessTest"
    soft_fail: true
    agents:
      queue: tpu_v6e_8_queue
    commands:
      - |
        .buildkite/scripts/run_in_docker.sh \
          python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_local_disagg.py::test_disaggregated_serving \
          /workspace/tpu_inference/tests/e2e/test_local_disagg.py::test_disaggregated_serving_correctness
  - label: "Record correctness test result for Single-Host P-D disaggregation"
    key: "record_SingleHostPDDisaggregation_CorrectnessTest"
    depends_on: "SingleHostPDDisaggregation_CorrectnessTest"
    env:
      CI_TARGET: "SingleHostPDDisaggregation"
      CI_STAGE: "CorrectnessTest"
      CI_CATEGORY: "feature support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh SingleHostPDDisaggregation_CorrectnessTest
def is_tpu() -> bool:
    """Report whether the first attached device identifies itself as a TPU."""
    first_device = jax.devices()[0]
    return "TPU" in first_device.device_kind


def tpu_kind() -> str:
    """Return the `device_kind` string of the first attached device."""
    first_device = jax.devices()[0]
    return first_device.device_kind


# Most TPU devices follow the pattern "TPU v{version}{variant}", e.g. "TPU v5p"
# TPU v7 has a different pattern (i.e. "TPU7x")
_TPU_KIND_PATTERN = re.compile(r"TPU( v)?(\d+)")


def tpu_generation() -> int:
    """Return the generation number parsed from the attached device's kind.

    Raises:
        NotImplementedError: if the device kind does not match the TPU pattern.
    """
    matched = _TPU_KIND_PATTERN.match(tpu_kind())
    if matched is None:
        raise NotImplementedError("only TPU devices are supported")
    return int(matched.group(2))


def assert_is_supported_dtype(dtype: jnp.dtype) -> None:
    """Raise ValueError unless `dtype` is one of the dtypes listed below."""
    supported_dtypes = (
        jnp.bfloat16,
        jnp.float32,
        jnp.float8_e4m3fn,
        jnp.float8_e5m2,
        jnp.int8,
        jnp.int4,
        jnp.float4_e2m1fn,
        jnp.uint4,
    )
    if dtype in supported_dtypes:
        return
    raise ValueError(f"No support for {dtype=}.")
14 | 15 | # pipeline-name: structured_decoding 16 | # pipeline-type: feature support matrix 17 | # Structured decoding allows constraining the model's output to follow a 18 | # specific format, such as choosing from a predefined set of options or 19 | # following a JSON schema. This is useful for classification tasks, 20 | # structured data extraction, and ensuring outputs conform to expected formats. 21 | steps: 22 | - label: "Correctness tests for structured_decoding" 23 | key: "structured_decoding_CorrectnessTest" 24 | soft_fail: true 25 | agents: 26 | queue: tpu_v6e_queue 27 | commands: 28 | - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_structured_decoding.py::test_structured_decoding 29 | - label: "Record correctness test result for structured_decoding" 30 | key: "record_structured_decoding_CorrectnessTest" 31 | depends_on: "structured_decoding_CorrectnessTest" 32 | env: 33 | CI_TARGET: "structured_decoding" 34 | CI_STAGE: "CorrectnessTest" 35 | CI_CATEGORY: "feature support matrix" 36 | agents: 37 | queue: cpu 38 | commands: 39 | - | 40 | .buildkite/scripts/record_step_result.sh structured_decoding_CorrectnessTest 41 | -------------------------------------------------------------------------------- /docs/recommended_models_features.md: -------------------------------------------------------------------------------- 1 | # Recommended Model and Feature Matrices 2 | 3 | Although vLLM TPU’s new unified backend makes out-of-the-box high performance serving possible with any model supported in vLLM, the reality is that we're still in the process of implementing a few core components. 4 | For this reason, until we land more capabilities, we recommend starting from this list of stress tested models and features below. 5 | 6 | We are still landing components in tpu-inference that will improve performance for larger scale, higher complexity models (XL MoE, +vision encoders, MLA, etc.). 
7 | 8 | If you’d like us to prioritize something specific, please submit a GitHub feature request [here](https://github.com/vllm-project/tpu-inference/issues/new/choose). 9 | 10 | ## Recommended Models 11 | 12 | These tables show the models currently tested for accuracy and performance. 13 | 14 | ### Text-Only Models 15 | 16 | {{ read_csv('../support_matrices/text_only_model_support_matrix.csv', keep_default_na=False) }} 17 | 18 | ### Multimodal Models 19 | 20 | {{ read_csv('../support_matrices/multimodal_model_support_matrix.csv', keep_default_na=False) }} 21 | 22 | ## Recommended Features 23 | 24 | This table shows the features currently tested for accuracy and performance. 25 | 26 | {{ read_csv('../support_matrices/feature_support_matrix.csv', keep_default_na=False) }} 27 | 28 | ## Kernel Support 29 | 30 | This table shows the current kernel support status. 31 | 32 | {{ read_csv('../support_matrices/kernel_support_matrix.csv', keep_default_na=False) }} 33 | 34 | ## Parallelism Support 35 | 36 | This table shows the current parallelism support status. 37 | 38 | {{ read_csv('../support_matrices/parallelism_support_matrix.csv', keep_default_na=False) }} 39 | 40 | ## Quantization Support 41 | 42 | This table shows the current quantization support status. 43 | 44 | {{ read_csv('../support_matrices/quantization_support_matrix.csv', keep_default_na=False) }} 45 | -------------------------------------------------------------------------------- /tpu_inference/layers/jax/pp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class PPMissingLayer(nnx.Module):
    """Stand-in for a layer that is absent in a pipeline-parallel model.

    Calling it hands back its first input unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Accepts and ignores any constructor arguments.
        pass

    def __call__(self, *args, **kwargs):
        """Return the first positional argument, else the first keyword value."""
        if args:
            return args[0]
        return next(iter(kwargs.values()))


class LayerFn(Protocol):
    """Callable taking no arguments and returning an `nnx.Module`."""

    def __call__(self) -> nnx.Module:
        ...
# `-o pipefail` must be spelled with -o. The previous `set -eu pipefail`
# enabled only -e and -u and assigned the word "pipefail" to $1.
set -euo pipefail

# --- SCHEDULE TRIGGER ---
if [[ "$GH_EVENT_NAME" == "schedule" ]]; then
  echo "Trigger: Schedule - Generating nightly build"

  # --- Get Base Version from Tag ---
  echo "Fetching latest tags..."
  git fetch --tags --force
  echo "Finding the latest stable version tag (vX.Y.Z)..."
  # `|| true`: with pipefail on, a grep that matches nothing (or a producer
  # cut off by `head`) would otherwise abort the script here, before the
  # explicit empty-result check below can print its message.
  LATEST_STABLE_TAG=$(git tag --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n 1 || true)
  if [[ -z "$LATEST_STABLE_TAG" ]]; then
    echo "Warning: No stable tag found."
    exit 1
  fi
  # Strip the leading "v" from the tag to get the base version.
  BASE_VERSION=${LATEST_STABLE_TAG#v}
  echo "Using BASE_VERSION=${BASE_VERSION}"

  # --- Generate Nightly Version ---
  DATETIME_STR=$(date -u +%Y%m%d)
  VERSION="${BASE_VERSION}.dev${DATETIME_STR}"

# --- PUSH TAG TRIGGER ---
elif [[ "$GH_EVENT_NAME" == "push" && "$GH_REF" == refs/tags/* ]]; then
  echo "Trigger: Push Tag - Generating stable build"
  TAG_NAME="$GH_REF_NAME"
  VERSION=${TAG_NAME#v}

else
  echo "Error: Unknown or unsupported trigger."
  exit 1
fi

# --- output ---
echo "Final determined values: VERSION=${VERSION}"
echo "VERSION=${VERSION}" >> "$GITHUB_OUTPUT"
21 | steps: 22 | - label: "Correctness tests for hybrid kv cache allocation" 23 | key: "hybrid_kvcache_CorrectnessTest" 24 | soft_fail: true 25 | agents: 26 | queue: tpu_v6e_8_queue 27 | commands: 28 | - | 29 | .buildkite/scripts/run_in_docker.sh \ 30 | python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_hybrid_kvcache.py::test_hybrid_kv_cache \ 31 | /workspace/tpu_inference/tests/e2e/test_hybrid_kvcache.py::test_hybrid_kv_cache_correctness 32 | - label: "Record correctness test result for hybrid kv cache allocation" 33 | key: "record_hybrid_kvcache_CorrectnessTest" 34 | depends_on: "hybrid_kvcache_CorrectnessTest" 35 | env: 36 | CI_TARGET: "hybrid_kvcache" 37 | CI_STAGE: "CorrectnessTest" 38 | CI_CATEGORY: "feature support matrix" 39 | agents: 40 | queue: cpu 41 | commands: 42 | - | 43 | .buildkite/scripts/record_step_result.sh hybrid_kvcache_CorrectnessTest 44 | -------------------------------------------------------------------------------- /.buildkite/features/runai_model_streamer_loader.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: runai_model_streamer_loader 16 | # pipeline-type: feature support matrix 17 | # The RunAI Model Streamer is a high-performance model loader that serves as an 18 | # alternative to the default Hugging Face loader. 
Instead of downloading a model 19 | # to local disk, it streams the weights from object storage (like GCS) into 20 | # GPU memory. This streaming process is significantly faster than the traditional 21 | # disk-based loading method. 22 | steps: 23 | - label: "Correctness tests for runai_model_streamer_loader" 24 | key: "runai_model_streamer_loader_CorrectnessTest" 25 | soft_fail: true 26 | agents: 27 | queue: tpu_v6e_queue 28 | commands: 29 | - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_runai_model_streamer_loader.py::test_correctness 30 | - label: "Record correctness test result for runai_model_streamer_loader" 31 | key: "record_runai_model_streamer_loader_CorrectnessTest" 32 | depends_on: "runai_model_streamer_loader_CorrectnessTest" 33 | env: 34 | CI_TARGET: "runai_model_streamer_loader" 35 | CI_STAGE: "CorrectnessTest" 36 | CI_CATEGORY: "feature support matrix" 37 | agents: 38 | queue: cpu 39 | commands: 40 | - | 41 | .buildkite/scripts/record_step_result.sh runai_model_streamer_loader_CorrectnessTest 42 | -------------------------------------------------------------------------------- /tpu_inference/models/vllm/vllm_model_wrapper_context.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
@dataclass
class VllmModelWrapperContext:
    """Values made available to code running inside `set_vllm_model_wrapper_context`."""
    # KV cache arrays supplied by the caller of `set_vllm_model_wrapper_context`.
    kv_caches: List[jax.Array]
    # Device mesh supplied by the caller.
    mesh: Mesh
    # Maps a layer name to an integer index (presumably a position in
    # `kv_caches` -- confirm against callers). None when no mapping was given.
    layer_name_to_kvcache_index: Optional[Dict[str, int]] = None


# Module-level slot holding the currently active context; None outside of any
# `set_vllm_model_wrapper_context` block.
_vllm_model_wrapper_context: Optional[VllmModelWrapperContext] = None


def get_vllm_model_wrapper_context() -> VllmModelWrapperContext:
    """Return the currently active `VllmModelWrapperContext`.

    Raises:
        AssertionError: If called outside of a
            `set_vllm_model_wrapper_context` block.
    """
    assert _vllm_model_wrapper_context is not None, (
        "VllmModelWrapperContext is not set. "
        "Please use `set_vllm_model_wrapper_context` to set the VllmModelWrapperContext."
    )
    return _vllm_model_wrapper_context


@contextmanager
def set_vllm_model_wrapper_context(
    *,
    kv_caches: List[jax.Array],
    mesh: Mesh,
    layer_name_to_kvcache_index: Optional[Dict[str, int]] = None,
):
    """Install a `VllmModelWrapperContext` for the duration of a `with` block.

    The previously active context (possibly None) is remembered and restored
    when the block exits, including when the body raises, so these blocks can
    be nested.

    Args:
        kv_caches: KV cache arrays to expose through the context.
        mesh: Device mesh to expose through the context.
        layer_name_to_kvcache_index: Optional mapping from layer name to an
            integer index; stored on the context unchanged (None if omitted).
    """
    global _vllm_model_wrapper_context
    prev_context = _vllm_model_wrapper_context
    _vllm_model_wrapper_context = VllmModelWrapperContext(
        kv_caches=kv_caches,
        mesh=mesh,
        layer_name_to_kvcache_index=layer_name_to_kvcache_index,
    )

    try:
        yield
    finally:
        # Always restore the outer context, even if the body raised.
        _vllm_model_wrapper_context = prev_context
def test_structured_decoding():
    """Check that choice-constrained decoding emits one of the allowed labels."""
    allowed_labels = ['Positive', 'Negative']

    engine = LLM(
        model='meta-llama/Llama-3.2-1B-Instruct',
        max_model_len=1024,
        max_num_seqs=1,
        enable_prefix_caching=False,
    )

    # Restrict generation to the allowed labels via structured outputs.
    constrained_sampling = SamplingParams(
        structured_outputs=StructuredOutputsParams(choice=allowed_labels))

    results = engine.generate(
        prompts="Classify this sentiment: tpu-inference is wonderful!",
        sampling_params=constrained_sampling,
    )

    generated_text = results[0].outputs[0].text
    assert generated_text in allowed_labels
# --- tpu_inference/kernels/collectives/util.py ---


def local_barrier(left_neighbor, right_neighbor, double_barrier=True):
    """Performs a barrier with neighbors on the global barrier semaphore.

    Optionally performs a second barrier, which prevents a potential race
    when reusing the same collective_id across kernel invocations.

    Args:
      left_neighbor: Left neighbor device id.
      right_neighbor: Right neighbor device id.
      double_barrier: Whether to perform a second barrier.
    """
    barrier_sem = pltpu.get_barrier_semaphore()
    # Signal each neighbor once on the barrier semaphore.
    for neighbor in [left_neighbor, right_neighbor]:
        pltpu.semaphore_signal(
            barrier_sem,
            inc=1,
            device_id=(neighbor, ),
            device_id_type=pltpu.DeviceIdType.MESH,
        )
    # Wait for a count of 2 on this device's semaphore (presumably one signal
    # from each neighbor).
    pltpu.semaphore_wait(barrier_sem, 2)
    if double_barrier:
        # The double-barrier prevents a race condition where one neighbor can
        # re-enter the kernel again on a subsequent call and increment the
        # barrier semaphore a second time. This would unblock the current device
        # even if the other neighbor is not ready yet.
        # To implement a double-barrier, we stack-allocate a second REGULAR
        # semaphore using run_scoped.
        @functools.partial(pl.run_scoped,
                           second_barrier=pltpu.SemaphoreType.REGULAR)
        def _(second_barrier):
            # Same signal-then-wait sequence as above, on the scoped semaphore.
            for neighbor in [left_neighbor, right_neighbor]:
                pltpu.semaphore_signal(
                    second_barrier,
                    inc=1,
                    device_id=(neighbor, ),
                    device_id_type=pltpu.DeviceIdType.MESH,
                )
            pltpu.semaphore_wait(second_barrier, 2)


# --- tpu_inference/kernels/ragged_paged_attention/v3/util.py ---


def cdiv(a, b):
    """Return the ceiling of `a / b` using integer arithmetic; `b` must be non-zero."""
    assert b != 0
    return (a + b - 1) // b


def align_to(x, a):
    """Round `x` up to the nearest multiple of `a`."""
    return cdiv(x, a) * a


def get_dtype_bitwidth(dtype):
    """Return the width of `dtype` in bits.

    Uses `dtypes.bit_width` when the imported `jax._src.dtypes` module provides
    it and falls back to `dtypes.itemsize_bits` otherwise (presumably to cope
    with different JAX versions).
    """
    return (dtypes.bit_width(dtype)
            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))


def get_dtype_packing(dtype):
    """Return `32 // bitwidth(dtype)`, i.e. how many `dtype` values fit in 32 bits."""
    bits = get_dtype_bitwidth(dtype)
    return 32 // bits


def next_power_of_2(x: int):
    """Finds the smallest power of 2 >= x using bit manipulation.

    Args:
      x: The input number (should be an integer).

    Returns:
      The smallest integer power of 2 that is >= x.
    """
    assert x > 0
    if x == 1:
        return 1
    # (x - 1).bit_length() is the exponent of the smallest power of 2 >= x.
    return 1 << (x - 1).bit_length()


def get_tpu_version() -> int:
    """Returns the numeric version of the TPU, or -1 if not on TPU."""
    # Only the first device reported by JAX is inspected.
    kind = jax.devices()[0].device_kind
    if 'TPU' not in kind:
        return -1
    # Drop a trailing ' lite' and then a single trailing 'p' or 'e' so that
    # only the base name remains.
    if kind.endswith(' lite'):
        kind = kind[:-len(' lite')]
    if kind.endswith('p') or kind.endswith('e'):
        kind = kind[:-1]
    # 'TPU7x' does not follow the 'TPU v<digit>' pattern and is special-cased.
    if kind == 'TPU7x':
        return 7
    # Everything else must look like 'TPU v<single digit>'.
    assert kind[:-1] == 'TPU v', kind
    return int(kind[-1])
14 | 15 | # pipeline-name: MLA 16 | # pipeline-type: kernel support matrix 17 | steps: 18 | - label: "Correctness tests for MLA" 19 | key: "MLA_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "MLA_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for MLA" 27 | key: "record_MLA_CorrectnessTest" 28 | depends_on: "MLA_CorrectnessTest" 29 | env: 30 | CI_TARGET: "MLA" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "kernel support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh MLA_CorrectnessTest 38 | 39 | - label: "Performance tests for MLA" 40 | key: "MLA_PerformanceTest" 41 | depends_on: "record_MLA_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "MLA_PerformanceTest" "unverified" 48 | - label: "Record performance test result for MLA" 49 | key: "record_MLA_PerformanceTest" 50 | depends_on: "MLA_PerformanceTest" 51 | env: 52 | CI_TARGET: "MLA" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "kernel support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh MLA_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/features/MoE.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: MoE 16 | # pipeline-type: kernel support matrix 17 | steps: 18 | - label: "Correctness tests for MoE" 19 | key: "MoE_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "MoE_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for MoE" 27 | key: "record_MoE_CorrectnessTest" 28 | depends_on: "MoE_CorrectnessTest" 29 | env: 30 | CI_TARGET: "MoE" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "kernel support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh MoE_CorrectnessTest 38 | 39 | - label: "Performance tests for MoE" 40 | key: "MoE_PerformanceTest" 41 | depends_on: "record_MoE_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "MoE_PerformanceTest" "unverified" 48 | - label: "Record performance test result for MoE" 49 | key: "record_MoE_PerformanceTest" 50 | depends_on: "MoE_PerformanceTest" 51 | env: 52 | CI_TARGET: "MoE" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "kernel support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh MoE_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/parallelism/CP.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | 
# Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: CP 16 | # pipeline-type: parallelism support matrix 17 | steps: 18 | - label: "Correctness tests for CP" 19 | key: "CP_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "CP_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for CP" 27 | key: "record_CP_CorrectnessTest" 28 | depends_on: "CP_CorrectnessTest" 29 | env: 30 | CI_TARGET: "CP" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "parallelism support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh CP_CorrectnessTest 38 | 39 | - label: "Performance tests for CP" 40 | key: "CP_PerformanceTest" 41 | depends_on: "record_CP_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "CP_PerformanceTest" "unverified" 48 | - label: "Record performance test result for CP" 49 | key: "record_CP_PerformanceTest" 50 | depends_on: "CP_PerformanceTest" 51 | env: 52 | CI_TARGET: "CP" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "parallelism support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh CP_PerformanceTest 60 | -------------------------------------------------------------------------------- 
/.buildkite/parallelism/EP.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: EP 16 | # pipeline-type: parallelism support matrix 17 | steps: 18 | - label: "Correctness tests for EP" 19 | key: "EP_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "EP_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for EP" 27 | key: "record_EP_CorrectnessTest" 28 | depends_on: "EP_CorrectnessTest" 29 | env: 30 | CI_TARGET: "EP" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "parallelism support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh EP_CorrectnessTest 38 | 39 | - label: "Performance tests for EP" 40 | key: "EP_PerformanceTest" 41 | depends_on: "record_EP_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "EP_PerformanceTest" "unverified" 48 | - label: "Record performance test result for EP" 49 | key: "record_EP_PerformanceTest" 50 | depends_on: "EP_PerformanceTest" 51 | env: 52 | CI_TARGET: "EP" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "parallelism support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | 
.buildkite/scripts/record_step_result.sh EP_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/parallelism/SP.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: SP 16 | # pipeline-type: parallelism support matrix 17 | steps: 18 | - label: "Correctness tests for SP" 19 | key: "SP_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "SP_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for SP" 27 | key: "record_SP_CorrectnessTest" 28 | depends_on: "SP_CorrectnessTest" 29 | env: 30 | CI_TARGET: "SP" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "parallelism support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh SP_CorrectnessTest 38 | 39 | - label: "Performance tests for SP" 40 | key: "SP_PerformanceTest" 41 | depends_on: "record_SP_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "SP_PerformanceTest" "unverified" 48 | - label: "Record performance test result for SP" 49 | key: "record_SP_PerformanceTest" 50 | depends_on: "SP_PerformanceTest" 51 | env: 52 | CI_TARGET: "SP" 53 
| CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "parallelism support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh SP_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/parallelism/TP.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: TP 16 | # pipeline-type: parallelism support matrix 17 | steps: 18 | - label: "Correctness tests for TP" 19 | key: "TP_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "TP_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for TP" 27 | key: "record_TP_CorrectnessTest" 28 | depends_on: "TP_CorrectnessTest" 29 | env: 30 | CI_TARGET: "TP" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "parallelism support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh TP_CorrectnessTest 38 | 39 | - label: "Performance tests for TP" 40 | key: "TP_PerformanceTest" 41 | depends_on: "record_TP_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "TP_PerformanceTest" "unverified" 48 | - label: "Record performance test result for TP" 49 | key: "record_TP_PerformanceTest" 50 | depends_on: "TP_PerformanceTest" 51 | env: 52 | CI_TARGET: "TP" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "parallelism support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh TP_PerformanceTest 60 | -------------------------------------------------------------------------------- /tests/lora/test_bgmv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# --- tests/lora/test_bgmv.py ---


def test_bgmv_torch():
    """Compare `bgmv_torch` with the loop-based reference on random inputs."""
    num_tokens = 16
    hidden_size = 128
    max_loras = 9
    max_lora_rank = 8

    with torchax.default_env(), jax.default_device(jax.devices("tpu")[0]):
        inputs = torch.rand(num_tokens, hidden_size, device='jax')
        loras = torch.rand(max_loras,
                           1,
                           max_lora_rank,
                           hidden_size,
                           device='jax')
        idxs = torch.randint(0, max_loras, (num_tokens, ), device='jax')

        actual = bgmv_torch(inputs, loras, idxs)
        expected = _ref_bgmv_torch(inputs, loras, idxs)
        torch.testing.assert_close(actual, expected, atol=3e-2, rtol=1e-3)


def _ref_bgmv_torch(inputs, loras, idxs):
    """Reference implementation: one matmul per token with its selected LoRA.

    Args:
        inputs: 2-D tensor; row `i` is the input for token `i`.
        loras: LoRA weights; a 4-D tensor has its axis 1 squeezed first.
        idxs: For each token, the index of the LoRA to apply.

    Returns:
        The per-token results stacked along axis 0.
    """
    if len(loras.shape) == 4:
        loras = loras.squeeze(axis=1)

    # Another equivalent ref impl is as the 2 lines below.
    # selected_loras = loras[idxs]
    # return torch.einsum('td,tld->tl', inputs, selected_loras)
    num_tokens, _ = inputs.shape
    outputs = []
    for i in range(num_tokens):
        # Named `token_input` rather than `input` to avoid shadowing the builtin.
        token_input = inputs[i]  # [hidden_size]
        lora = loras[idxs[i]]  # [max_lora_rank, hidden_size]
        out = torch.matmul(lora, token_input)
        outputs.append(out)

    return torch.stack(outputs, axis=0)


# --- tpu_inference/kernels/quantized_matmul/util.py ---


def unfold_args(
    conditions: tuple[jax.Array | bool, ...],
    fn_conditions: tuple[bool, ...],
    fn: Callable[..., Any],
):
    """Minimize run-time branching of fn by converting jnp.bool to python bool.

    Consumes `conditions` left to right. A Python bool is appended to
    `fn_conditions` as-is. An array condition (asserted to be a single bool)
    is dispatched through `jax.lax.cond`, recursing once with True and once
    with False. When no conditions remain, `fn(*fn_conditions)` is called.
    The return value of `fn` is discarded; this function returns None.
    """
    if conditions:
        arg = conditions[0]
        if isinstance(arg, bool):
            unfold_args(conditions[1:], fn_conditions + (arg, ), fn)
        else:
            assert arg.dtype == jnp.bool and arg.size == 1
            jax.lax.cond(
                arg,
                lambda: unfold_args(conditions[1:], fn_conditions +
                                    (True, ), fn),
                lambda: unfold_args(conditions[1:], fn_conditions +
                                    (False, ), fn),
            )
    else:
        fn(*fn_conditions)


def quantize_tensor(x: jax.Array, dtype: jnp.dtype, dim: int = -1):
    """Quantize `x` to `dtype` with one abs-max scale per slice along `dim`.

    Args:
        x: Array to quantize.
        dtype: Target integer or floating-point dtype.
        dim: Axis over which the absolute maximum is taken.

    Returns:
        A tuple `(x_q, scale)`: `x_q` is `x / scale` clipped to the range of
        `dtype` and cast to it; `scale` is `abs_max / dtype_max` as float32
        with `dim` kept as a size-1 axis. For an all-zero slice `scale` is 0
        and `x_q` is 0.
    """
    if jnp.issubdtype(dtype, jnp.integer):
        dtype_info = jnp.iinfo(dtype)
        max_val = int(dtype_info.max)
        min_val = int(dtype_info.min)
    else:
        dtype_info = jnp.finfo(dtype)
        max_val = float(dtype_info.max)
        min_val = float(dtype_info.min)

    x_abs_max = jnp.max(jnp.abs(x), axis=dim, keepdims=True)
    scale = x_abs_max / max_val
    # An all-zero slice gives scale == 0, and 0 / 0 would put NaN into x_q.
    # Divide by 1 there instead; the returned scale is left untouched.
    safe_scale = jnp.where(scale == 0, jnp.ones_like(scale), scale)
    x_q = jnp.clip(x / safe_scale, min_val, max_val).astype(dtype)
    return x_q, scale.astype(jnp.float32)


def next_multiple(x, multiple):
    """Round `x` up to the nearest multiple of `multiple`."""
    return ((x + multiple - 1) // multiple) * multiple


def get_kernel_name(tuned_value: TunedValue):
    """Build the kernel name string from the three block sizes in `tuned_value`."""
    batch_block_size = tuned_value.batch_block_size
    out_block_size = tuned_value.out_block_size
    in_block_size = tuned_value.in_block_size
    return f'quantized_matmul_kernel_{batch_block_size}_{out_block_size}_{in_block_size}'
14 | 15 | # pipeline-name: AWQ INT4 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for AWQ INT4" 19 | key: "AWQ_INT4_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "AWQ_INT4_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for AWQ INT4" 27 | key: "record_AWQ_INT4_CorrectnessTest" 28 | depends_on: "AWQ_INT4_CorrectnessTest" 29 | env: 30 | CI_TARGET: "AWQ INT4" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh AWQ_INT4_CorrectnessTest 38 | 39 | - label: "Performance tests for AWQ INT4" 40 | key: "AWQ_INT4_PerformanceTest" 41 | depends_on: "record_AWQ_INT4_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "AWQ_INT4_PerformanceTest" "unverified" 48 | - label: "Record performance test result for AWQ INT4" 49 | key: "record_AWQ_INT4_PerformanceTest" 50 | depends_on: "AWQ_INT4_PerformanceTest" 51 | env: 52 | CI_TARGET: "AWQ INT4" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh AWQ_INT4_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/quantization/FP8_W8A8.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: FP8 W8A8 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for FP8 W8A8" 19 | key: "FP8_W8A8_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "FP8_W8A8_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for FP8 W8A8" 27 | key: "record_FP8_W8A8_CorrectnessTest" 28 | depends_on: "FP8_W8A8_CorrectnessTest" 29 | env: 30 | CI_TARGET: "FP8 W8A8" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh FP8_W8A8_CorrectnessTest 38 | 39 | - label: "Performance tests for FP8 W8A8" 40 | key: "FP8_W8A8_PerformanceTest" 41 | depends_on: "record_FP8_W8A8_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "FP8_W8A8_PerformanceTest" "unverified" 48 | - label: "Record performance test result for FP8 W8A8" 49 | key: "record_FP8_W8A8_PerformanceTest" 50 | depends_on: "FP8_W8A8_PerformanceTest" 51 | env: 52 | CI_TARGET: "FP8 W8A8" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh FP8_W8A8_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/parallelism/DP.yml: 
-------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: DP 16 | # pipeline-type: parallelism support matrix 17 | steps: 18 | - label: "Correctness tests for DP" 19 | key: "DP_CorrectnessTest" 20 | soft_fail: true 21 | env: 22 | NEW_MODEL_DESIGN: "1" 23 | agents: 24 | queue: tpu_v6e_8_queue 25 | commands: 26 | - | 27 | .buildkite/scripts/run_in_docker.sh \ 28 | bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_data_parallel.py' 29 | - label: "Record correctness test result for DP" 30 | key: "record_DP_CorrectnessTest" 31 | depends_on: "DP_CorrectnessTest" 32 | env: 33 | CI_TARGET: "DP" 34 | CI_STAGE: "CorrectnessTest" 35 | CI_CATEGORY: "parallelism support matrix" 36 | agents: 37 | queue: cpu 38 | commands: 39 | - | 40 | .buildkite/scripts/record_step_result.sh DP_CorrectnessTest 41 | 42 | - label: "Performance tests for DP" 43 | key: "DP_PerformanceTest" 44 | depends_on: "record_DP_CorrectnessTest" 45 | soft_fail: true 46 | agents: 47 | queue: tpu_v6e_queue 48 | commands: 49 | - | 50 | buildkite-agent meta-data set "DP_PerformanceTest" "unverified" 51 | - label: "Record performance test result for DP" 52 | key: "record_DP_PerformanceTest" 53 | depends_on: "DP_PerformanceTest" 54 | env: 55 | CI_TARGET: "DP" 56 | CI_STAGE: "PerformanceTest" 57 | CI_CATEGORY: "parallelism 
support matrix" 58 | agents: 59 | queue: cpu 60 | commands: 61 | - | 62 | .buildkite/scripts/record_step_result.sh DP_PerformanceTest 63 | -------------------------------------------------------------------------------- /.buildkite/quantization/FP4_W4A16.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: FP4 W4A16 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for FP4 W4A16" 19 | key: "FP4_W4A16_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "FP4_W4A16_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for FP4 W4A16" 27 | key: "record_FP4_W4A16_CorrectnessTest" 28 | depends_on: "FP4_W4A16_CorrectnessTest" 29 | env: 30 | CI_TARGET: "FP4 W4A16" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh FP4_W4A16_CorrectnessTest 38 | 39 | - label: "Performance tests for FP4 W4A16" 40 | key: "FP4_W4A16_PerformanceTest" 41 | depends_on: "record_FP4_W4A16_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "FP4_W4A16_PerformanceTest" 
"unverified" 48 | - label: "Record performance test result for FP4 W4A16" 49 | key: "record_FP4_W4A16_PerformanceTest" 50 | depends_on: "FP4_W4A16_PerformanceTest" 51 | env: 52 | CI_TARGET: "FP4 W4A16" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh FP4_W4A16_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/quantization/FP8_W8A16.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: FP8 W8A16 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for FP8 W8A16" 19 | key: "FP8_W8A16_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "FP8_W8A16_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for FP8 W8A16" 27 | key: "record_FP8_W8A16_CorrectnessTest" 28 | depends_on: "FP8_W8A16_CorrectnessTest" 29 | env: 30 | CI_TARGET: "FP8 W8A16" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh FP8_W8A16_CorrectnessTest 38 | 39 | - label: "Performance tests for FP8 W8A16" 40 | key: "FP8_W8A16_PerformanceTest" 41 | depends_on: "record_FP8_W8A16_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "FP8_W8A16_PerformanceTest" "unverified" 48 | - label: "Record performance test result for FP8 W8A16" 49 | key: "record_FP8_W8A16_PerformanceTest" 50 | depends_on: "FP8_W8A16_PerformanceTest" 51 | env: 52 | CI_TARGET: "FP8 W8A16" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh FP8_W8A16_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/quantization/INT8_W8A8.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: INT8 W8A8 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for INT8 W8A8" 19 | key: "INT8_W8A8_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "INT8_W8A8_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for INT8 W8A8" 27 | key: "record_INT8_W8A8_CorrectnessTest" 28 | depends_on: "INT8_W8A8_CorrectnessTest" 29 | env: 30 | CI_TARGET: "INT8 W8A8" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh INT8_W8A8_CorrectnessTest 38 | 39 | - label: "Performance tests for INT8 W8A8" 40 | key: "INT8_W8A8_PerformanceTest" 41 | depends_on: "record_INT8_W8A8_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "INT8_W8A8_PerformanceTest" "unverified" 48 | - label: "Record performance test result for INT8 W8A8" 49 | key: "record_INT8_W8A8_PerformanceTest" 50 | depends_on: "INT8_W8A8_PerformanceTest" 51 | env: 52 | CI_TARGET: "INT8 W8A8" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh INT8_W8A8_PerformanceTest 60 | -------------------------------------------------------------------------------- 
/.buildkite/quantization/INT4_W4A16.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: INT4 W4A16 16 | # pipeline-type: quantization support matrix 17 | steps: 18 | - label: "Correctness tests for INT4 W4A16" 19 | key: "INT4_W4A16_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "INT4_W4A16_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for INT4 W4A16" 27 | key: "record_INT4_W4A16_CorrectnessTest" 28 | depends_on: "INT4_W4A16_CorrectnessTest" 29 | env: 30 | CI_TARGET: "INT4 W4A16" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "quantization support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh INT4_W4A16_CorrectnessTest 38 | 39 | - label: "Performance tests for INT4 W4A16" 40 | key: "INT4_W4A16_PerformanceTest" 41 | depends_on: "record_INT4_W4A16_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "INT4_W4A16_PerformanceTest" "unverified" 48 | - label: "Record performance test result for INT4 W4A16" 49 | key: "record_INT4_W4A16_PerformanceTest" 50 | depends_on: "INT4_W4A16_PerformanceTest" 51 | env: 52 | CI_TARGET: "INT4 W4A16" 53 
| CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "quantization support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh INT4_W4A16_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/pipeline_generation/feature_template.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: {FEATURE_NAME} 16 | # pipeline-type: {CATEGORY} 17 | steps: 18 | - label: "Correctness tests for {FEATURE_NAME}" 19 | key: "{SANITIZED_FEATURE_NAME}_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: {QUEUE} 23 | commands: 24 | - echo "placeholder" # TODO : replace with your correctness test command 25 | - label: "Record correctness test result for {FEATURE_NAME}" 26 | key: "record_{SANITIZED_FEATURE_NAME}_CorrectnessTest" 27 | depends_on: "{SANITIZED_FEATURE_NAME}_CorrectnessTest" 28 | env: 29 | CI_TARGET: "{FEATURE_NAME}" 30 | CI_STAGE: "CorrectnessTest" 31 | CI_CATEGORY: "{CATEGORY}" 32 | agents: 33 | queue: cpu 34 | commands: 35 | - | 36 | .buildkite/scripts/record_step_result.sh {SANITIZED_FEATURE_NAME}_CorrectnessTest 37 | 38 | - label: "Performance tests for {FEATURE_NAME}" 39 | key: "{SANITIZED_FEATURE_NAME}_PerformanceTest" 40 | depends_on: "record_{SANITIZED_FEATURE_NAME}_CorrectnessTest" 41 | soft_fail: true 42 | agents: 43 | queue: {QUEUE} 44 | commands: 45 | - echo "placeholder" # TODO : replace with your performance test command 46 | - label: "Record performance test result for {FEATURE_NAME}" 47 | key: "record_{SANITIZED_FEATURE_NAME}_PerformanceTest" 48 | depends_on: "{SANITIZED_FEATURE_NAME}_PerformanceTest" 49 | env: 50 | CI_TARGET: "{FEATURE_NAME}" 51 | CI_STAGE: "PerformanceTest" 52 | CI_CATEGORY: "{CATEGORY}" 53 | agents: 54 | queue: cpu 55 | commands: 56 | - | 57 | .buildkite/scripts/record_step_result.sh {SANITIZED_FEATURE_NAME}_PerformanceTest 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # SPDX-License-Identifier: Apache-2.0 3 | import os 4 | from typing import List 5 | 6 | from setuptools import find_packages, setup 7 | 8 | ROOT_DIR = os.path.dirname(__file__) 9 | 10 | 11 | def get_path(*filepath) -> str: 12 | return os.path.join(ROOT_DIR, 
*filepath) 13 | 14 | 15 | def get_requirements() -> List[str]: 16 | """Get Python package dependencies from requirements.txt.""" 17 | 18 | def _read_requirements(filename: str) -> List[str]: 19 | with open(get_path(filename)) as f: 20 | requirements = f.read().strip().split("\n") 21 | resolved_requirements = [] 22 | for line in requirements: 23 | if line.startswith("-r "): 24 | resolved_requirements += _read_requirements(line.split()[1]) 25 | elif line.startswith("--"): 26 | continue 27 | else: 28 | resolved_requirements.append(line) 29 | return resolved_requirements 30 | 31 | try: 32 | requirements = _read_requirements("requirements.txt") 33 | except ValueError: 34 | print("Failed to read requirements.txt in vllm_tpu.") 35 | return requirements 36 | 37 | 38 | def get_version(): 39 | if env_version := os.getenv("VLLM_VERSION_OVERRIDE"): 40 | return env_version 41 | return "0.0.0" 42 | 43 | 44 | setup( 45 | name="tpu_inference", 46 | version=get_version(), 47 | description="", 48 | long_description=open("README.md").read() if hasattr( 49 | open("README.md"), "read") else "", 50 | long_description_content_type="text/markdown", 51 | author="tpu_inference Contributors", 52 | packages=find_packages(), 53 | python_requires=">=3.10", 54 | install_requires=get_requirements(), 55 | include_package_data=True, 56 | classifiers=[ 57 | "Development Status :: 3 - Alpha", 58 | "Intended Audience :: Developers", 59 | "Intended Audience :: Education", 60 | "Intended Audience :: Science/Research", 61 | "License :: OSI Approved :: Apache Software License", 62 | "Programming Language :: Python :: 3.10", 63 | "Programming Language :: Python :: 3.11", 64 | "Programming Language :: Python :: 3.12", 65 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 66 | ], 67 | ) 68 | -------------------------------------------------------------------------------- /.buildkite/features/Quantized_Matmul.yml: -------------------------------------------------------------------------------- 1 
| # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: Quantized Matmul 16 | # pipeline-type: kernel support matrix 17 | steps: 18 | - label: "Correctness tests for Quantized Matmul" 19 | key: "Quantized_Matmul_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "Quantized_Matmul_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for Quantized Matmul" 27 | key: "record_Quantized_Matmul_CorrectnessTest" 28 | depends_on: "Quantized_Matmul_CorrectnessTest" 29 | env: 30 | CI_TARGET: "Quantized Matmul" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "kernel support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh Quantized_Matmul_CorrectnessTest 38 | 39 | - label: "Performance tests for Quantized Matmul" 40 | key: "Quantized_Matmul_PerformanceTest" 41 | depends_on: "record_Quantized_Matmul_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "Quantized_Matmul_PerformanceTest" "unverified" 48 | - label: "Record performance test result for Quantized Matmul" 49 | key: "record_Quantized_Matmul_PerformanceTest" 50 | depends_on: "Quantized_Matmul_PerformanceTest" 51 | env: 52 | CI_TARGET: "Quantized Matmul" 53 | CI_STAGE: "PerformanceTest" 54 
| CI_CATEGORY: "kernel support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh Quantized_Matmul_PerformanceTest 60 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # CODEOWNERS file for tpu-inference 2 | # This file defines code ownership for different parts of the repository. 3 | # Each line is a file pattern followed by one or more owners. 4 | # Owners are notified when PRs modify code in their areas. 5 | # 6 | # Order matters - the last matching pattern takes precedence. 7 | # Analysis includes full history from tpu_commons and tpu_inference paths. 8 | 9 | # Default owners for everything in the repo (fallback) 10 | * @vipannalla 11 | 12 | # CI/CD and Build Configuration 13 | /.buildkite/ @jcyang43 @QiliangCui 14 | /.github/ @jcyang43 @QiliangCui 15 | 16 | # Documentation 17 | /docs/ @bvrockwell 18 | /README.md @bvrockwell 19 | /CONTRIBUTING.md @jrplatin @bvrockwell 20 | 21 | # Distributed Computing 22 | /tpu_inference/distributed/ @mrjunwan-lang @sixiang-google 23 | 24 | # Kernel Implementations (Performance-critical) 25 | /tpu_inference/kernels/ @kyuyeunk @yaochengji @bythew3i 26 | 27 | # JAX Model Layers - Attention 28 | /tpu_inference/layers/jax/ @py4 @bzgoogle @jrplatin @gpolovets1 29 | /tpu_inference/layers/vllm/ @kyuyeunk @hfan @vanbasten23 30 | 31 | # JAX Model Implementations 32 | /tpu_inference/models/jax/qwen2_5_vl.py @hfan @kwang3939 33 | /tpu_inference/models/jax/gpt_oss.py @bzgoogle 34 | /tpu_inference/models/jax/deepseek_v3.py @bzgoogle @gpolovets1 @jrplatin 35 | /tpu_inference/models/vllm/ @kyuyeunk @hfan @vanbasten23 36 | 37 | # Runner and Execution 38 | /tpu_inference/runner/ @kyuyeunk @py4 @wenxindongwork @sixiang-google @mrjunwan-lang 39 | /tpu_inference/runner/tpu_runner.py @py4 @kyuyeunk @wenxindongwork @sixiang-google 40 | 
/tpu_inference/runner/persistent_batch_manager.py @py4 @wenxindongwork 41 | /tpu_inference/runner/speculative_decoding_manager.py @py4 @Lumosis 42 | /tpu_inference/executors/ @sixiang-google @mrjunwan-lang 43 | /tpu_inference/core/ @sixiang-google @mrjunwan-lang @wenxindongwork 44 | 45 | # Worker Management 46 | /tpu_inference/worker/ @sixiang-google @mrjunwan-lang @py4 @vanbasten23 @wenxindongwork 47 | 48 | # Speculative Decoding 49 | /tpu_inference/spec_decode/ @py4 @Lumosis 50 | 51 | # Platform Support 52 | /tpu_inference/platforms/ @sixiang-google @mrjunwan-lang 53 | 54 | # LoRA and Adapters 55 | /tpu_inference/lora/ @vanbasten23 56 | /tpu_inference/runner/lora_utils.py @vanbasten23 57 | 58 | # Docker Configuration 59 | /docker/ @jrplatin @QiliangCui 60 | -------------------------------------------------------------------------------- /docs/getting_started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | This guide provides instructions for installing and running `tpu-inference`. 4 | 5 | There are three ways to install `tpu-inference`: 6 | 7 | 1. **[Install with pip](#install-using-pip)** 8 | 2. **[Run with Docker](#run-with-docker)** 9 | 3. **[Install from source](#install-from-source)** 10 | 11 | ## Install using pip 12 | 13 | 1. Create a working directory: 14 | 15 | ```shell 16 | mkdir ~/work-dir 17 | cd ~/work-dir 18 | ``` 19 | 20 | 2. Set up a Python virtual environment: 21 | 22 | ```shell 23 | python3.12 -m venv vllm_env --symlinks 24 | source vllm_env/bin/activate 25 | ``` 26 | 27 | 3. Use the following command to install `vllm-tpu` using `pip`: 28 | 29 | ```shell 30 | pip install vllm-tpu 31 | ``` 32 | 33 | ## Run with Docker 34 | 35 | Include the `--privileged`, `--net=host`, and `--shm-size=150gb` options to enable TPU interaction and shared memory.
36 | 37 | ```shell 38 | export DOCKER_URI=vllm/vllm-tpu:latest 39 | sudo docker run -it --rm --name $USER-vllm --privileged --net=host \ 40 | -v /dev/shm:/dev/shm \ 41 | --shm-size 150gb \ 42 | -p 8000:8000 \ 43 | --entrypoint /bin/bash ${DOCKER_URI} 44 | ``` 45 | 46 | ## Install from source 47 | 48 | For debugging or development purposes, you can install `tpu-inference` from source. `tpu-inference` is a plugin for `vllm`, so you need to install both from source. 49 | 50 | 1. Install system dependencies: 51 | 52 | ```shell 53 | sudo apt-get update && sudo apt-get install -y libopenblas-base libopenmpi-dev libomp-dev 54 | ``` 55 | 56 | 1. Clone the `vllm` and `tpu-inference` repositories: 57 | 58 | ```shell 59 | git clone https://github.com/vllm-project/vllm.git 60 | git clone https://github.com/vllm-project/tpu-inference.git 61 | ``` 62 | 63 | 1. Set up a Python virtual environment: 64 | 65 | ```shell 66 | python3.12 -m venv vllm_env --symlinks 67 | source vllm_env/bin/activate 68 | ``` 69 | 70 | 1. Install `vllm` from source, targeting the TPU device: 71 | 72 | ```shell 73 | cd vllm 74 | pip install -r requirements/tpu.txt 75 | VLLM_TARGET_DEVICE="tpu" pip install -e . 76 | cd .. 77 | ``` 78 | 79 | 1. Install `tpu-inference` from source: 80 | 81 | ```shell 82 | cd tpu-inference 83 | pip install -e . 84 | cd .. 85 | ``` 86 | -------------------------------------------------------------------------------- /tests/platforms/test_tpu_platform.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from unittest.mock import MagicMock, patch
16 |
17 | import pytest
18 | import torch
19 | from vllm.config import CacheConfig, VllmConfig
20 |
21 | from tpu_inference.platforms.tpu_platform import TpuPlatform
22 |
23 |
24 | class TestTpuPlatform:
25 |     """Tests for ``TpuPlatform``."""
26 |
27 |     @pytest.fixture
28 |     def vllm_config(self):
29 |         """Return a ``MagicMock(spec=VllmConfig)`` with test sub-configs.
30 |
31 |         Only ``cache_config`` is a real ``CacheConfig``; the model,
32 |         scheduler, parallel and compilation configs are plain mocks and
33 |         ``kv_transfer_config`` is ``None``.
34 |         """
35 |         cache_config = CacheConfig(block_size=16,
36 |                                    gpu_memory_utilization=0.9,
37 |                                    swap_space=4,
38 |                                    cache_dtype="fp8")
39 |
40 |         vllm_config = MagicMock(spec=VllmConfig)
41 |         vllm_config.cache_config = cache_config
42 |         vllm_config.model_config = MagicMock(dtype='bfloat16')
43 |         vllm_config.scheduler_config = MagicMock(is_multimodal_model=False)
44 |         vllm_config.parallel_config = MagicMock()
45 |         vllm_config.compilation_config = MagicMock(mode="dynamo_trace_once",
46 |                                                    backend="openxla")
47 |         vllm_config.kv_transfer_config = None
48 |         return vllm_config
49 |
50 |     @pytest.mark.parametrize("chip_name,expected_dtype", [
51 |         ("v6e", torch.float8_e5m2),
52 |         ("v5e", torch.float8_e4m3fn),
53 |     ])
54 |     def test_fp8_dtype(self, chip_name, expected_dtype):
55 |         """Check ``TpuPlatform.fp8_dtype()`` for each parametrized chip name."""
56 |         mock_chip_type = MagicMock()
57 |         # Assigned after construction: ``MagicMock(name=...)`` names the mock
58 |         # itself instead of creating a ``name`` attribute.
59 |         mock_chip_type.name = chip_name
60 |
61 |         # Patch init_logger, make get_local_chips return the mocked chip, and
62 |         # force VLLM_TPU_USING_PATHWAYS to False for the duration of the call.
63 |         with patch('tpu_inference.platforms.tpu_platform.init_logger'), \
64 |              patch('tpu_inference.platforms.tpu_platform.device.get_local_chips', return_value=(mock_chip_type, None)), \
65 |              patch('vllm.envs.VLLM_TPU_USING_PATHWAYS', False):
66 |             assert TpuPlatform.fp8_dtype() == expected_dtype
67 |
--------------------------------------------------------------------------------
/.buildkite/features/Multimodal_Inputs.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: Multimodal Inputs 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for Multimodal Inputs" 19 | key: "Multimodal_Inputs_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py 25 | - label: "Record correctness test result for Multimodal Inputs" 26 | key: "record_Multimodal_Inputs_CorrectnessTest" 27 | depends_on: "Multimodal_Inputs_CorrectnessTest" 28 | env: 29 | CI_TARGET: Multimodal Inputs 30 | CI_STAGE: "CorrectnessTest" 31 | CI_CATEGORY: "feature support matrix" 32 | agents: 33 | queue: cpu 34 | commands: 35 | - | 36 | .buildkite/scripts/record_step_result.sh Multimodal_Inputs_CorrectnessTest 37 | 38 | - label: "Performance tests for Multimodal Inputs" 39 | key: "Multimodal_Inputs_PerformanceTest" 40 | depends_on: "record_Multimodal_Inputs_CorrectnessTest" 41 | soft_fail: true 42 | agents: 43 | queue: tpu_v6e_queue 44 | commands: 45 | - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mm_bench.sh 46 | - label: "Record performance test result for Multimodal Inputs" 47 | 
key: "record_Multimodal_Inputs_PerformanceTest" 48 | depends_on: "Multimodal_Inputs_PerformanceTest" 49 | env: 50 | CI_TARGET: Multimodal Inputs 51 | CI_STAGE: "PerformanceTest" 52 | CI_CATEGORY: "feature support matrix" 53 | agents: 54 | queue: cpu 55 | commands: 56 | - | 57 | .buildkite/scripts/record_step_result.sh Multimodal_Inputs_PerformanceTest 58 | -------------------------------------------------------------------------------- /.buildkite/features/Quantized_KV_Cache.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: Quantized KV Cache 16 | # pipeline-type: kernel support matrix 17 | steps: 18 | - label: "Correctness tests for Quantized KV Cache" 19 | key: "Quantized_KV_Cache_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "Quantized_KV_Cache_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for Quantized KV Cache" 27 | key: "record_Quantized_KV_Cache_CorrectnessTest" 28 | depends_on: "Quantized_KV_Cache_CorrectnessTest" 29 | env: 30 | CI_TARGET: "Quantized KV Cache" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "kernel support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh Quantized_KV_Cache_CorrectnessTest 38 | 39 | - label: "Performance tests for Quantized KV Cache" 40 | key: "Quantized_KV_Cache_PerformanceTest" 41 | depends_on: "record_Quantized_KV_Cache_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "Quantized_KV_Cache_PerformanceTest" "unverified" 48 | - label: "Record performance test result for Quantized KV Cache" 49 | key: "record_Quantized_KV_Cache_PerformanceTest" 50 | depends_on: "Quantized_KV_Cache_PerformanceTest" 51 | env: 52 | CI_TARGET: "Quantized KV Cache" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "kernel support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh Quantized_KV_Cache_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/features/Quantized_Attention.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: Quantized Attention 16 | # pipeline-type: kernel support matrix 17 | steps: 18 | - label: "Correctness tests for Quantized Attention" 19 | key: "Quantized_Attention_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "Quantized_Attention_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for Quantized Attention" 27 | key: "record_Quantized_Attention_CorrectnessTest" 28 | depends_on: "Quantized_Attention_CorrectnessTest" 29 | env: 30 | CI_TARGET: "Quantized Attention" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "kernel support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh Quantized_Attention_CorrectnessTest 38 | 39 | - label: "Performance tests for Quantized Attention" 40 | key: "Quantized_Attention_PerformanceTest" 41 | depends_on: "record_Quantized_Attention_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "Quantized_Attention_PerformanceTest" "unverified" 48 | - label: "Record performance test result for Quantized Attention" 49 | key: "record_Quantized_Attention_PerformanceTest" 50 | depends_on: "Quantized_Attention_PerformanceTest" 51 | env: 52 | CI_TARGET: "Quantized Attention" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "kernel support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | 
.buildkite/scripts/record_step_result.sh Quantized_Attention_PerformanceTest 60 | -------------------------------------------------------------------------------- /.buildkite/features/data_parallelism.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: data_parallelism 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for data_parallelism" 19 | key: "data_parallelism_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_8_queue 23 | commands: 24 | - | 25 | .buildkite/scripts/run_in_docker.sh \ 26 | bash -c 'NEW_MODEL_DESIGN=1 python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_data_parallel.py' 27 | - label: "Record correctness test result for data_parallelism" 28 | key: "record_data_parallelism_CorrectnessTest" 29 | depends_on: "data_parallelism_CorrectnessTest" 30 | env: 31 | CI_TARGET: "data_parallelism" 32 | CI_STAGE: "CorrectnessTest" 33 | CI_CATEGORY: "feature support matrix" 34 | agents: 35 | queue: cpu 36 | commands: 37 | - | 38 | .buildkite/scripts/record_step_result.sh data_parallelism_CorrectnessTest 39 | 40 | - label: "Performance tests for data_parallelism" 41 | key: "data_parallelism_PerformanceTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_8_queue 45 | commands: 46 | - | 47 | buildkite-agent 
meta-data set "data_parallelism_PerformanceTest" "run together with correctness test" 48 | 49 | - label: "Record performance test result for data_parallelism" 50 | key: "record_data_parallelism_PerformanceTest" 51 | depends_on: "data_parallelism_PerformanceTest" 52 | env: 53 | CI_TARGET: "data_parallelism" 54 | CI_STAGE: "PerformanceTest" 55 | CI_CATEGORY: "feature support matrix" 56 | agents: 57 | queue: cpu 58 | commands: 59 | - | 60 | .buildkite/scripts/record_step_result.sh data_parallelism_PerformanceTest 61 | -------------------------------------------------------------------------------- /.buildkite/features/async_scheduler.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: async scheduler 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for async scheduler" 19 | key: "async_scheduler_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_async_scheduler.py::test_async_correctness 25 | - label: "Record correctness test result for async scheduler" 26 | key: "record_async_scheduler_CorrectnessTest" 27 | depends_on: "async_scheduler_CorrectnessTest" 28 | env: 29 | CI_TARGET: "async scheduler" 30 | CI_STAGE: "CorrectnessTest" 31 | CI_CATEGORY: "feature support matrix" 32 | agents: 33 | queue: cpu 34 | commands: 35 | - | 36 | .buildkite/scripts/record_step_result.sh async_scheduler_CorrectnessTest 37 | 38 | - label: "Performance tests for async scheduler" 39 | key: "async_scheduler_PerformanceTest" 40 | depends_on: "record_async_scheduler_CorrectnessTest" 41 | soft_fail: true 42 | agents: 43 | queue: tpu_v6e_queue 44 | commands: 45 | - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_async_scheduler.py::test_performance 46 | - label: "Record performance test result for async scheduler" 47 | key: "record_async_scheduler_PerformanceTest" 48 | depends_on: "async_scheduler_PerformanceTest" 49 | env: 50 | CI_TARGET: "async scheduler" 51 | CI_STAGE: "PerformanceTest" 52 | CI_CATEGORY: "feature support matrix" 53 | agents: 54 | queue: cpu 55 | commands: 56 | - | 57 | .buildkite/scripts/record_step_result.sh async_scheduler_PerformanceTest 58 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/500-feature-request.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file 
except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: 🚀 Feature request 16 | description: Submit a proposal/request for a new TPU Inference feature 17 | title: "[Feature]: " 18 | labels: ["feature request"] 19 | 20 | body: 21 | - type: markdown 22 | attributes: 23 | value: > 24 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/tpu-inference/issues?q=is%3Aissue+sort%3Acreated-desc+). 25 | - type: textarea 26 | attributes: 27 | label: 🚀 The feature, motivation and pitch 28 | description: > 29 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 30 | validations: 31 | required: true 32 | - type: textarea 33 | attributes: 34 | label: Alternatives 35 | description: > 36 | A description of any alternative solutions or features you've considered, if any. 37 | - type: textarea 38 | attributes: 39 | label: Additional context 40 | description: > 41 | Add any other context or screenshots about the feature request. 42 | - type: markdown 43 | attributes: 44 | value: > 45 | Thanks for contributing 🎉! 46 | - type: checkboxes 47 | id: askllm 48 | attributes: 49 | label: Before submitting a new issue... 
50 | options: 51 | - label: Make sure you already searched for relevant issues and checked the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs). 52 | required: true 53 | -------------------------------------------------------------------------------- /.buildkite/features/LoRA_Torch.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | # pipeline-name: LoRA_Torch 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for LoRA_Torch" 19 | key: "LoRA_Torch_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | .buildkite/scripts/run_in_docker.sh \ 26 | bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py' 27 | - label: "Record correctness test result for LoRA_Torch" 28 | key: "record_LoRA_Torch_CorrectnessTest" 29 | depends_on: "LoRA_Torch_CorrectnessTest" 30 | env: 31 | CI_TARGET: "LoRA_Torch" 32 | CI_STAGE: "CorrectnessTest" 33 | CI_CATEGORY: "feature support matrix" 34 | agents: 35 | queue: cpu 36 | commands: 37 | - | 38 | .buildkite/scripts/record_step_result.sh LoRA_Torch_CorrectnessTest 39 | 40 | - label: "Performance tests for LoRA_Torch" 41 | key: "LoRA_Torch_PerformanceTest" 42 | depends_on: "record_LoRA_Torch_CorrectnessTest" 43 | soft_fail: true 44 | agents: 45 | queue: tpu_v6e_queue 46 | commands: 47 | - | 48 | .buildkite/scripts/run_in_docker.sh \ 49 | bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora_perf.py' 50 | - label: "Record performance test result for LoRA_Torch" 51 | key: "record_LoRA_Torch_PerformanceTest" 52 | depends_on: "LoRA_Torch_PerformanceTest" 53 | env: 54 | CI_TARGET: "LoRA_Torch" 55 | CI_STAGE: "PerformanceTest" 56 | CI_CATEGORY: "feature support matrix" 57 | agents: 58 | queue: cpu 59 | commands: 60 | - | 61 | .buildkite/scripts/record_step_result.sh LoRA_Torch_PerformanceTest 62 | -------------------------------------------------------------------------------- /.buildkite/pipeline_generation/vllm_native_model_template.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may 
not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: {MODEL_NAME} 16 | # pipeline-type: {CATEGORY} 17 | steps: 18 | - label: "Unit tests for {MODEL_NAME}" 19 | key: "{SANITIZED_MODEL_NAME}_UnitTest" 20 | agents: 21 | queue: {QUEUE} 22 | soft_fail: true 23 | commands: 24 | - echo "placeholder" # TODO: replace with your unit test command 25 | - label: "Record unit test result for {MODEL_NAME}" 26 | key: "record_{SANITIZED_MODEL_NAME}_UnitTest" 27 | depends_on: "{SANITIZED_MODEL_NAME}_UnitTest" 28 | env: 29 | CI_TARGET: {MODEL_NAME} 30 | CI_STAGE: "UnitTest" 31 | CI_CATEGORY: "{CATEGORY}" 32 | agents: 33 | queue: cpu 34 | commands: 35 | - | 36 | .buildkite/scripts/record_step_result.sh {SANITIZED_MODEL_NAME}_UnitTest 37 | 38 | - label: "Integration tests for {MODEL_NAME}" 39 | key: "{SANITIZED_MODEL_NAME}_IntegrationTest" 40 | depends_on: "record_{SANITIZED_MODEL_NAME}_UnitTest" 41 | agents: 42 | queue: {QUEUE} 43 | soft_fail: true 44 | env: 45 | TEST_MODEL: {MODEL_NAME} 46 | TENSOR_PARALLEL_SIZE: {TENSOR_PARALLEL_SIZE} 47 | MINIMUM_ACCURACY_THRESHOLD: 0 # TODO : replace 0 with your accuracy threshold 48 | commands: 49 | - | 50 | .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/test_accuracy.sh 51 | - label: "Record integration test result for {MODEL_NAME}" 52 | key: "record_{SANITIZED_MODEL_NAME}_IntegrationTest" 53 | depends_on: "{SANITIZED_MODEL_NAME}_IntegrationTest" 54 | env: 55 | CI_TARGET: {MODEL_NAME} 56 | CI_STAGE: "IntegrationTest" 57 | CI_CATEGORY: 
"{CATEGORY}" 58 | agents: 59 | queue: cpu 60 | commands: 61 | - | 62 | .buildkite/scripts/record_step_result.sh {SANITIZED_MODEL_NAME}_IntegrationTest 63 | -------------------------------------------------------------------------------- /.buildkite/features/KV_Cache_Host_Offloading.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pipeline-name: KV cache host offloading 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for KV cache host offloading" 19 | key: "KV_Cache_Host_Offloading_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "KV_Cache_Host_Offloading_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for KV cache host offloading" 27 | key: "record_KV_Cache_Host_Offloading_CorrectnessTest" 28 | depends_on: "KV_Cache_Host_Offloading_CorrectnessTest" 29 | env: 30 | CI_TARGET: "KV cache host offloading" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "feature support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_CorrectnessTest 38 | 39 | - label: "Performance tests for KV cache host offloading" 40 | key: "KV_Cache_Host_Offloading_PerformanceTest" 41 | depends_on: 
"record_KV_Cache_Host_Offloading_CorrectnessTest" 42 | soft_fail: true 43 | agents: 44 | queue: tpu_v6e_queue 45 | commands: 46 | - | 47 | buildkite-agent meta-data set "KV_Cache_Host_Offloading_PerformanceTest" "unverified" 48 | - label: "Record performance test result for KV cache host offloading" 49 | key: "record_KV_Cache_Host_Offloading_PerformanceTest" 50 | depends_on: "KV_Cache_Host_Offloading_PerformanceTest" 51 | env: 52 | CI_TARGET: "KV cache host offloading" 53 | CI_STAGE: "PerformanceTest" 54 | CI_CATEGORY: "feature support matrix" 55 | agents: 56 | queue: cpu 57 | commands: 58 | - | 59 | .buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_PerformanceTest 60 | -------------------------------------------------------------------------------- /docs/getting_started/quickstart.md: -------------------------------------------------------------------------------- 1 | # Get started with vLLM TPU 2 | 3 | Google Cloud TPUs (Tensor Processing Units) accelerate machine learning workloads. vLLM supports TPU v6e and v5e. For architecture, supported topologies, and more, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture) and specific TPU version pages ([v5e](https://cloud.google.com/tpu/docs/v5e) and [v6e](https://cloud.google.com/tpu/docs/v6e)). 4 | 5 | --- 6 | 7 | ## Requirements 8 | 9 | * **Google Cloud TPU VM:** Access to a TPU VM. For setup instructions, see the [Cloud TPU Setup guide](tpu_setup.md). 10 | * **TPU versions:** v6e, v5e 11 | * **Python:** 3.11 or newer (3.12 used in examples). 12 | 13 | --- 14 | 15 | ## Installation 16 | 17 | For detailed steps on installing `vllm-tpu` with `pip` or running it as a Docker image, please see the [**Installation Guide**](installation.md). 18 | 19 | ## Run the vLLM Server 20 | 21 | After installing `vllm-tpu`, you can start the API server. 22 | 23 | 1. **Log in to Hugging Face:** 24 | You'll need a Hugging Face token to download models. 
25 | 26 | ```shell 27 | export TOKEN=YOUR_TOKEN 28 | git config --global credential.helper store 29 | huggingface-cli login --token $TOKEN 30 | ``` 31 | 32 | 2. **Launch the Server:** 33 | The following command starts the server with the Llama-3.1-8B model. 34 | 35 | ```shell 36 | vllm serve "meta-llama/Llama-3.1-8B" \ 37 | --download_dir /tmp \ 38 | --disable-log-requests \ 39 | --tensor_parallel_size=1 \ 40 | --max-model-len=2048 41 | ``` 42 | 43 | 3. **Send a Request:** 44 | 45 | Once the server is running, you can send it a request using `curl`: 46 | 47 | ```shell 48 | curl http://localhost:8000/v1/completions \ 49 | -H "Content-Type: application/json" \ 50 | -d '{ 51 | "model": "meta-llama/Llama-3.1-8B", 52 | "prompt": "Hello, my name is", 53 | "max_tokens": 20, 54 | "temperature": 0.7 55 | }' 56 | ``` 57 | 58 | ## Next steps: 59 | 60 | Check out complete, end-to-end example recipes in the [tpu-recipes repository](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM) 61 | 62 | ## For further reading 63 | 64 | * [Examples](https://github.com/vllm-project/tpu-inference/tree/main/examples) 65 | * [Recipes](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM) 66 | * [GKE serving with vLLM TPU](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-vllm-tpu) 67 | -------------------------------------------------------------------------------- /tests/core/test_disagg_executor.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | import unittest 3 | from unittest.mock import MagicMock, patch 4 | 5 | from vllm.config import ModelConfig, VllmConfig 6 | 7 | from tpu_inference.core.disagg_executor import DisaggExecutor 8 | 9 | 10 | class DisaggExecutorTest(unittest.TestCase): 11 | 12 | def setUp(self): 13 | """Set up the test environment by mocking dependencies.""" 14 | # Mock configurations 15 | self.mock_vllm_config = 
MagicMock(spec=VllmConfig) 16 | self.mock_vllm_config.model_config = ModelConfig( 17 | tokenizer_mode="auto", 18 | trust_remote_code=False, 19 | seed=0, 20 | dtype='bfloat16') 21 | self.mock_vllm_config.cache_config = MagicMock() 22 | self.mock_vllm_config.scheduler_config = MagicMock() 23 | self.mock_vllm_config.load_config = MagicMock() 24 | self.mock_vllm_config.lora_config = None 25 | self.mock_vllm_config.parallel_config = MagicMock() 26 | self.mock_vllm_config.device_config = MagicMock() 27 | self.mock_vllm_config.speculative_config = None 28 | self.mock_vllm_config.prompt_adapter_config = None 29 | self.mock_vllm_config.observability_config = MagicMock() 30 | 31 | # Patch the collective_rpc method to avoid actual RPC calls 32 | self.patcher = patch( 33 | "tpu_inference.core.disagg_executor.DisaggExecutor.collective_rpc") 34 | self.mock_collective_rpc = self.patcher.start() 35 | self.addCleanup(self.patcher.stop) 36 | 37 | # Create a DisaggExecutor instance with the mock config 38 | self.executor = DisaggExecutor(vllm_config=self.mock_vllm_config) 39 | 40 | def test_init_with_devices(self): 41 | """Test init_with_devices.""" 42 | self.executor._init_executor() 43 | 44 | # Check that collective_rpc was called with the expected arguments 45 | self.mock_collective_rpc.assert_called() 46 | calls = self.mock_collective_rpc.call_args_list 47 | 48 | # Asserts for init_worker 49 | self.assertEqual(calls[0][0][0], "init_worker") 50 | self.assertEqual(calls[1][0][0], "init_device") 51 | self.assertEqual(calls[2][0][0], "load_model") 52 | 53 | def test_check_health(self): 54 | """Test check_health.""" 55 | # Call check_health (it should always pass) 56 | self.executor.check_health() 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /tests/core/test_init.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | 
# 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib 16 | import unittest 17 | from unittest.mock import patch 18 | 19 | 20 | class TestPathwaysInit(unittest.TestCase): 21 | 22 | @patch.dict("os.environ", {"JAX_PLATFORMS": "proxy,cpu"}) 23 | def test_VLLM_TPU_USING_PATHWAYS_enabled(self): 24 | """Test when JAX_PLATFORMS contains 'proxy'.""" 25 | 26 | # Import vllm.envs to test the VLLM_TPU_USING_PATHWAYS logic 27 | import vllm.envs as envs 28 | 29 | # Reload the module to ensure fresh import 30 | importlib.reload(envs) 31 | 32 | # Check that VLLM_TPU_USING_PATHWAYS is True when JAX_PLATFORMS contains "proxy" 33 | self.assertTrue(envs.VLLM_TPU_USING_PATHWAYS) 34 | 35 | @patch.dict("os.environ", {"JAX_PLATFORMS": "cpu"}) 36 | def test_VLLM_TPU_USING_PATHWAYS_not_enabled(self): 37 | """Test when JAX_PLATFORMS does not contain 'proxy'.""" 38 | 39 | # Import vllm.envs to test the VLLM_TPU_USING_PATHWAYS logic 40 | import vllm.envs as envs 41 | 42 | # Reload the module to ensure fresh import 43 | importlib.reload(envs) 44 | 45 | # Check that VLLM_TPU_USING_PATHWAYS is False when JAX_PLATFORMS doesn't contain "proxy" 46 | self.assertFalse(envs.VLLM_TPU_USING_PATHWAYS) 47 | 48 | @patch.dict("os.environ", {"JAX_PLATFORMS": "PROXY,CPU"}) 49 | def test_VLLM_TPU_USING_PATHWAYS_case_insensitive(self): 50 | """Test that JAX_PLATFORMS check is case insensitive.""" 51 | 52 | # Import vllm.envs to test the 
VLLM_TPU_USING_PATHWAYS logic 53 | import vllm.envs as envs 54 | 55 | # Reload the module to ensure fresh import 56 | importlib.reload(envs) 57 | 58 | # Check that VLLM_TPU_USING_PATHWAYS is True even with uppercase "PROXY" 59 | self.assertTrue(envs.VLLM_TPU_USING_PATHWAYS) 60 | 61 | 62 | if __name__ == "__main__": 63 | unittest.main() 64 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/200-installation.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: 🛠️ Installation 16 | description: Report an issue here when you hit errors during installation. 17 | title: "[Installation]: " 18 | labels: ["installation"] 19 | 20 | body: 21 | - type: markdown 22 | attributes: 23 | value: > 24 | #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/tpu-inference/issues?q=is%3Aissue+sort%3Acreated-desc+). 25 | - type: textarea 26 | attributes: 27 | label: Your current environment 28 | description: | 29 | Please run the following and paste the output below. 
30 | ```sh 31 | wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py 32 | # For security purposes, please feel free to check the contents of collect_env.py before running it. 33 | python collect_env.py 34 | python -c "import jax; jax.print_environment_info()" 35 | ``` 36 | It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. 37 | value: | 38 | ```text 39 | Outputs of the commands above 40 | ``` 41 | validations: 42 | required: true 43 | - type: textarea 44 | attributes: 45 | label: How are you installing TPU inference? 46 | description: | 47 | Paste the full command you are trying to execute. 48 | value: | 49 | ```sh 50 | pip install -e . 51 | ``` 52 | - type: markdown 53 | attributes: 54 | value: > 55 | Thanks for contributing 🎉! 56 | - type: checkboxes 57 | id: askllm 58 | attributes: 59 | label: Before submitting a new issue... 60 | options: 61 | - label: Make sure you already searched for relevant issues and checked the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs). 62 | required: true 63 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Publish Package to PyPI 16 | 17 | on: 18 | push: 19 | tags: 20 | - 'v[0-9]+.[0-9]+.[0-9]+*' 21 | schedule: 22 | - cron: '0 8 * * *' 23 | 24 | jobs: 25 | pypi_publish: 26 | name: Build and Publish 27 | runs-on: ubuntu-latest 28 | 29 | permissions: 30 | id-token: write 31 | contents: read 32 | 33 | steps: 34 | - name: Checkout Code 35 | uses: actions/checkout@v4 36 | with: 37 | fetch-depth: 0 38 | 39 | - name: Check if tag is on main branch 40 | if: github.event_name == 'push' 41 | run: | 42 | echo "Checking if tag ${{ github.ref_name }} is on main branch..." 43 | if git branch -r --contains ${{ github.ref_name }} | grep -q "origin/main"; then 44 | echo "Tag is on origin/main. Proceeding with release." 45 | else 46 | echo "ERROR Tag ${{ github.ref_name }} is not on origin/main." 47 | echo "This release will be aborted to prevent publishing from a non-main branch." 
48 | exit 1 49 | fi 50 | 51 | - name: Set up Python 52 | uses: actions/setup-python@v5 53 | with: 54 | python-version: '3.x' 55 | 56 | - name: Determine Version 57 | id: vars 58 | env: 59 | GH_EVENT_NAME: ${{ github.event_name }} 60 | GH_REF_NAME: ${{ github.ref_name }} 61 | GH_REF: ${{ github.ref }} 62 | run: bash .github/scripts/determine_release_vars.sh 63 | 64 | - name: Install dependencies (build) 65 | run: 66 | python3 -m pip install build 67 | 68 | - name: Build package (sdist and wheel) 69 | env: 70 | VLLM_VERSION_OVERRIDE: ${{ steps.vars.outputs.VERSION }} 71 | run: python3 -m build 72 | 73 | - name: Publish package distributions to PyPI 74 | uses: pypa/gh-action-pypi-publish@release/v1 75 | 76 | - name: Publish completed message 77 | run: echo "---Build and publish completed successfully.---" 78 | -------------------------------------------------------------------------------- /.buildkite/scripts/commit_verified_commit_hashes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | # --- Configuration --- 19 | REPO_URL="https://github.com/vllm-project/tpu-inference.git" 20 | TARGET_BRANCH="main" 21 | 22 | COMMIT_MESSAGE="Update verified commit hashes" 23 | 24 | # Construct the repository URL with the access token for authentication. 
25 | AUTHENTICATED_REPO_URL="https://x-access-token:${GITHUB_PAT}@${REPO_URL#https://}" 26 | 27 | # Ensure the GITHUB_PAT is available before proceeding. 28 | if [ -z "${GITHUB_PAT:-}" ]; then 29 | echo "--- ERROR: GITHUB_PAT secret not found. Cannot proceed." 30 | exit 1 31 | fi 32 | 33 | echo "--- Configuring Git user details" 34 | git config user.name "Buildkite Bot" 35 | git config user.email "buildkite-bot@users.noreply.github.com" 36 | 37 | echo "--- Fetching and checking out the target branch" 38 | git fetch origin "${TARGET_BRANCH}" 39 | git checkout "${TARGET_BRANCH}" 40 | git reset --hard origin/"${TARGET_BRANCH}" 41 | 42 | VLLM_COMMIT_HASH=$(buildkite-agent meta-data get "VLLM_COMMIT_HASH" --default "") 43 | 44 | if [ -z "${VLLM_COMMIT_HASH}" ]; then 45 | echo "VLLM_COMMIT_HASH not found in buildkite meta-data" 46 | exit 1 47 | fi 48 | 49 | if [ -z "${BUILDKITE_COMMIT:-}" ]; then 50 | echo "BUILDKITE_COMMIT not found" 51 | exit 1 52 | fi 53 | 54 | if [ ! -f verified_commit_hashes.csv ]; then 55 | echo "timestamp,vllm_commit_hash,tpu_inference_commit_hash" > verified_commit_hashes.csv 56 | fi 57 | echo "$(date '+%Y-%m-%d %H:%M:%S'),${VLLM_COMMIT_HASH},${BUILDKITE_COMMIT}" >> verified_commit_hashes.csv 58 | 59 | git add verified_commit_hashes.csv 60 | 61 | # --- Check for changes before committing --- 62 | if git diff --quiet --cached; then 63 | echo "No changes to commit. Exiting successfully." 
64 | exit 0 65 | else 66 | echo "--- Committing changes" 67 | git commit -s -m "${COMMIT_MESSAGE}" 68 | 69 | echo "--- Pushing changes to '${TARGET_BRANCH}'" 70 | git push "${AUTHENTICATED_REPO_URL}" "HEAD:${TARGET_BRANCH}" 71 | fi 72 | -------------------------------------------------------------------------------- /.buildkite/features/DCN-Based_P-D_disaggregation.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # pipeline-name: DCN-based P/D disaggregation 16 | # pipeline-type: feature support matrix 17 | steps: 18 | - label: "Correctness tests for DCN-based P/D disaggregation" 19 | key: "DCN_based_P-D_disaggregation_CorrectnessTest" 20 | soft_fail: true 21 | agents: 22 | queue: tpu_v6e_queue 23 | commands: 24 | - | 25 | buildkite-agent meta-data set "DCN_based_P-D_disaggregation_CorrectnessTest" "unverified" 26 | - label: "Record correctness test result for DCN-based P/D disaggregation" 27 | key: "record_DCN_based_P-D_disaggregation_CorrectnessTest" 28 | depends_on: "DCN_based_P-D_disaggregation_CorrectnessTest" 29 | env: 30 | CI_TARGET: "DCN-based P/D disaggregation" 31 | CI_STAGE: "CorrectnessTest" 32 | CI_CATEGORY: "feature support matrix" 33 | agents: 34 | queue: cpu 35 | commands: 36 | - | 37 | .buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_CorrectnessTest 38 | 39 | - label: "Performance tests for DCN-based P/D disaggregation" 40 | key: "DCN_based_P-D_disaggregation_PerformanceTest" 41 | soft_fail: true 42 | env: 43 | MODEL: "Qwen/Qwen3-0.6B" 44 | INPUT_LEN: 1024 45 | OUTPUT_LEN: 1024 46 | NUM_PROMPTS: 20 47 | RANDOM_SEED: 10 48 | MAX_CONCURRENCY: 1 49 | agents: 50 | queue: tpu_v6e_8_queue 51 | commands: 52 | - | 53 | .buildkite/scripts/run_disagg.sh 54 | 55 | - label: "Record performance test result for DCN-based P/D disaggregation" 56 | key: "record_DCN_based_P-D_disaggregation_PerformanceTest" 57 | depends_on: "DCN_based_P-D_disaggregation_PerformanceTest" 58 | env: 59 | CI_TARGET: "DCN-based P/D disaggregation" 60 | CI_STAGE: "PerformanceTest" 61 | CI_CATEGORY: "feature support matrix" 62 | agents: 63 | queue: cpu 64 | commands: 65 | - | 66 | .buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_PerformanceTest 67 | -------------------------------------------------------------------------------- /tests/lora/test_lora_perf.py: -------------------------------------------------------------------------------- 1 | #
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time

import pytest
import vllm
from vllm.lora.request import LoRARequest

TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]

# Largest tolerated ratio of LoRA latency to base-model latency.
_MAX_LORA_SLOWDOWN = 8

_MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"


def _timed_generate(llm, prompt: str, lora_request=None) -> float:
    """Run one greedy 16-token generation and return the elapsed seconds.

    Args:
        llm: The ``vllm.LLM`` engine to query.
        prompt: The prompt text to complete.
        lora_request: Optional ``LoRARequest`` selecting an adapter.

    Returns:
        Wall-clock duration of the ``generate`` call, in seconds.

    Raises:
        IndexError: If the engine returns no completion for the prompt.
    """
    sampling_params = vllm.SamplingParams(max_tokens=16, temperature=0)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    outputs = llm.generate(prompt,
                           sampling_params=sampling_params,
                           lora_request=lora_request)
    elapsed = time.perf_counter() - start_time
    # Touch the generated text so a missing completion fails loudly here.
    _ = outputs[0].outputs[0].text
    return elapsed


@pytest.mark.parametrize("tp", TP)
def test_lora_performance(tp: int) -> None:
    """Check that serving a LoRA adapter is not excessively slower than base.

    The same prompt is generated once on an engine without LoRA and once on
    an engine with a single LoRA adapter loaded; the LoRA run must take less
    than ``_MAX_LORA_SLOWDOWN`` times as long as the base run.

    Args:
        tp: Tensor-parallel size used for both engines.
    """
    prompt = "What is 1+1? \n"
    # Both engines share these settings so the comparison only differs by LoRA.
    engine_kwargs = dict(
        model=_MODEL_NAME,
        max_model_len=256,
        max_num_batched_tokens=64,
        max_num_seqs=8,
        tensor_parallel_size=tp,
    )

    llm_without_lora = vllm.LLM(**engine_kwargs)
    base_time = _timed_generate(llm_without_lora, prompt)

    del llm_without_lora
    # Waiting for TPUs to be released
    time.sleep(10)

    llm_with_lora = vllm.LLM(**engine_kwargs,
                             enable_lora=True,
                             max_loras=1,
                             max_lora_rank=8)
    lora_request = LoRARequest(
        "lora_adapter_2", 2,
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_2_adapter")
    lora_time = _timed_generate(llm_with_lora,
                                prompt,
                                lora_request=lora_request)

    print(f"Base time: {base_time}, LoRA time: {lora_time}")
    # The slowdown is LoRA time relative to base time. The previous check
    # used base_time / lora_time, which passes more easily the slower LoRA
    # gets and therefore never guarded against a LoRA regression.
    slowdown = lora_time / base_time
    assert slowdown < _MAX_LORA_SLOWDOWN, (
        f"Base time: {base_time}, LoRA time: {lora_time}")

    del llm_with_lora

# File: .buildkite/features/Collective_Communication_Matmul.yml
#
# Two stages (correctness, performance), each followed by a "record" step
# that hands the stage's step key to record_step_result.sh.
#
# pipeline-name: Collective Communication Matmul
# pipeline-type: kernel support matrix
steps:
  # Stage 1: run the all-gather matmul kernel test with pytest through the
  # run_in_docker.sh wrapper on the TPU v6e queue. soft_fail lets the build
  # continue when this step fails.
  - label: "Correctness tests for Collective Communication Matmul"
    key: "Collective_Communication_Matmul_CorrectnessTest"
    soft_fail: true
    agents:
      queue: tpu_v6e_queue
    commands:
      - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/kernels/collectives/all_gather_matmul_kernel_test.py
  # Record the outcome of the correctness step. Runs on the cpu queue; the
  # CI_* variables describe which target/stage/category is being recorded.
  - label: "Record correctness test result for Collective Communication Matmul"
    key: "record_Collective_Communication_Matmul_CorrectnessTest"
    depends_on: "Collective_Communication_Matmul_CorrectnessTest"
    env:
      CI_TARGET: "Collective Communication Matmul"
      CI_STAGE: "CorrectnessTest"
      CI_CATEGORY: "kernel support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_CorrectnessTest

  # Stage 2: no benchmark is executed here. The step only sets the build
  # meta-data key for the performance stage to "unverified".
  - label: "Performance tests for Collective Communication Matmul"
    key: "Collective_Communication_Matmul_PerformanceTest"
    depends_on: "record_Collective_Communication_Matmul_CorrectnessTest"
    soft_fail: true
    agents:
      queue: tpu_v6e_queue
    commands:
      - |
        buildkite-agent meta-data set "Collective_Communication_Matmul_PerformanceTest" "unverified"
  # Record the outcome of the performance step, mirroring the correctness
  # record step above.
  - label: "Record performance test result for Collective Communication Matmul"
    key: "record_Collective_Communication_Matmul_PerformanceTest"
    depends_on: "Collective_Communication_Matmul_PerformanceTest"
    env:
      CI_TARGET: "Collective Communication Matmul"
      CI_STAGE: "PerformanceTest"
      CI_CATEGORY: "kernel support matrix"
    agents:
      queue: cpu
    commands:
      - |
        .buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_PerformanceTest
#!/bin/sh
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# .buildkite/scripts/commit_support_matrices.sh
#
# Downloads the build's "*.csv" artifacts into the support-matrix directory,
# then commits and pushes them to the target branch if anything changed.
#
# Environment:
#   GITHUB_PAT     - access token embedded in the push URL (required).
#   NIGHTLY        - "1" selects the nightly directory and commit message.
#   BUILDKITE_TAG  - tag named in the release commit message
#                    (falls back to "unknown-tag").

set -e

# --- Configuration ---
REPO_URL="https://github.com/vllm-project/tpu-inference.git"
TARGET_BRANCH="main"

# Nothing can be pushed without a token, so check for it up front.
if [ -z "${GITHUB_PAT:-}" ]; then
  echo "--- ERROR: GITHUB_PAT secret not found. Cannot proceed."
  exit 1
fi

# Push URL: the repository URL with the token spliced in after the scheme.
AUTHENTICATED_REPO_URL="https://x-access-token:${GITHUB_PAT}@${REPO_URL#https://}"

# Nightly builds and release-tag builds write to different directories and
# use different commit messages.
case "${NIGHTLY}" in
  1)
    ARTIFACT_DOWNLOAD_PATH="support_matrices/nightly"
    COMMIT_MESSAGE="[skip ci] Update nightly support matrices"
    ;;
  *)
    COMMIT_TAG="${BUILDKITE_TAG:-unknown-tag}"
    ARTIFACT_DOWNLOAD_PATH="support_matrices"
    COMMIT_MESSAGE="[skip ci] Update support matrices for ${COMMIT_TAG}"
    ;;
esac

echo "--- Configuring Git user details"
git config user.name "Buildkite Bot"
git config user.email "buildkite-bot@users.noreply.github.com"

echo "--- Fetching and checking out the target branch"
git fetch origin "${TARGET_BRANCH}"
git checkout "${TARGET_BRANCH}"
# Discard any local state so the commit is built on the remote branch tip.
git reset --hard origin/"${TARGET_BRANCH}"

echo "--- Downloading CSV artifacts"
mkdir -p "${ARTIFACT_DOWNLOAD_PATH}"
buildkite-agent artifact download "*.csv" "${ARTIFACT_DOWNLOAD_PATH}/" --flat

echo "--- Staging downloaded artifacts"
git add "${ARTIFACT_DOWNLOAD_PATH}"/*.csv

# --- Check for changes before committing ---
# An empty staged diff means the matrices are already up to date.
if git diff --quiet --cached; then
  echo "No changes to commit. Exiting successfully."
  exit 0
fi

echo "--- Committing changes"
git commit -s -m "${COMMIT_MESSAGE}"

echo "--- Pushing changes to '${TARGET_BRANCH}'"
git push "${AUTHENTICATED_REPO_URL}" "HEAD:${TARGET_BRANCH}"
# waitForServerReady: Polls the server log until the server reports ready.
#
# Globals read:
#   READY_MESSAGE    - fixed string whose presence in the log means "ready".
#   LOG_FILE         - path of the server log to watch.
#   TIMEOUT_SECONDS  - maximum number of seconds to wait.
# Returns:
#   0 as soon as READY_MESSAGE is found in LOG_FILE.
# Exits:
#   Terminates the calling shell with status 1 when the log contains one of
#   the fatal error patterns or when TIMEOUT_SECONDS elapses. Cleanup must be
#   handled by the calling script's trap.
waitForServerReady() {
  # shellcheck disable=SC2155
  local start_time=$(date +%s)
  # Keep the loop bookkeeping local so sourcing scripts' globals are untouched.
  local current_time elapsed_time
  echo "Waiting for server ready message: '$READY_MESSAGE'"

  local fatal_error_patterns=(
    "RuntimeError:"
    "ValueError:"
    "FileNotFoundError:"
    "TypeError:"
    "ImportError:"
    "NotImplementedError:"
    "AssertionError:"
    "TimeoutError:"
    "OSError:"
    "AttributeError:"
    "NVMLError:"
  )

  # Join the patterns with '|' into a single extended regex.
  local error_regex
  error_regex=$(IFS=\|; echo "${fatal_error_patterns[*]}")

  while true; do
    # -s keeps grep quiet while the log file has not been created yet.
    if grep -Eqs "$error_regex" "$LOG_FILE"; then
      echo "FATAL ERROR DETECTED: The server log contains a fatal error pattern."
      # Call cleanup and exit (cleanup must be handled by the calling script's trap)
      exit 1
    fi

    if grep -Fqs "$READY_MESSAGE" "$LOG_FILE"; then
      echo "Server is ready."
      return 0
    fi

    # The timeout is evaluated only after the log was inspected, so a server
    # that became ready during the last sleep is not reported as a timeout.
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [[ "$elapsed_time" -ge "$TIMEOUT_SECONDS" ]]; then
      echo "TIMEOUT: Waited $elapsed_time seconds (limit was $TIMEOUT_SECONDS). The string '$READY_MESSAGE' was NOT found."
      # Call cleanup and exit (cleanup must be handled by the calling script's trap)
      exit 1
    fi

    sleep 5
  done
}

# cleanUp: Stops the vLLM server process and deletes log files.
# Usage: cleanUp <MODEL_NAME>
#
# Globals read:
#   LOG_FILE, BENCHMARK_LOG_FILE - log files to delete.
cleanUp() {
  echo "Stopping the vLLM server and cleaning up log files..."
  # $1 is the MODEL_NAME passed as argument.
  # pkill returns non-zero when nothing matched; that must not abort the
  # rest of the cleanup when the caller runs under `set -e`.
  pkill -f "vllm serve $1" || true
  # Kill all processes related to vllm. Best effort: a process may already
  # be gone by the time kill runs.
  # NOTE(review): `pgrep -f -i vllm` matches every command line containing
  # "vllm", which can include the calling script itself if its path does.
  pgrep -f -i vllm | xargs -r kill -9 || true

  # Clean up log files. Use -f to avoid errors if files don't exist.
  rm -f "$LOG_FILE"
  rm -f "$BENCHMARK_LOG_FILE"
  echo "Cleanup complete."
}