├── .coveragerc
├── docs
    ├── assets
    │   ├── torchax.png
    │   ├── get_model.png
    │   ├── model-fn.png
    │   ├── sandwich.png
    │   ├── wrap_model.png
    │   ├── tpu_inference-light.png
    │   ├── tpu_inference_dark.png
    │   ├── tpu_inference_light.png
    │   ├── tpu-inference-banner.png
    │   ├── tpu_inference_dark_2.png
    │   ├── tpu_inference_dark_mode.png
    │   ├── tpu_inference_light_mode.png
    │   ├── tpu_header_new_preview_v1.png
    │   ├── tpu_header_new_preview_v2.png
    │   ├── tpu_inference_dark_20px_space.png
    │   ├── tpu_inference_dark_mode_short.png
    │   ├── tpu_inference_dark_more_space.png
    │   └── tpu_inference_light_mode_short.png
    ├── requirements.txt
    ├── recommended_models_features.md
    └── getting_started
    │   ├── installation.md
    │   └── quickstart.md
├── MANIFEST.in
├── support_matrices
    ├── multimodal_model_support_matrix.csv
    ├── parallelism_support_matrix.csv
    ├── nightly
    │   ├── parallelism_support_matrix.csv
    │   ├── multimodal_model_support_matrix.csv
    │   ├── kernel_support_matrix.csv
    │   ├── quantization_support_matrix.csv
    │   ├── text_only_model_support_matrix.csv
    │   └── feature_support_matrix.csv
    ├── text_only_model_support_matrix.csv
    ├── kernel_support_matrix.csv
    ├── quantization_support_matrix.csv
    └── feature_support_matrix.csv
├── requirements_benchmarking.txt
├── .buildkite
    ├── features
    │   ├── default_features.txt
    │   ├── sampling_params.yml
    │   ├── Single-Host-P-D-disaggregation.yml
    │   ├── Structured_Decoding.yml
    │   ├── Hybrid_kvcache.yml
    │   ├── runai_model_streamer_loader.yml
    │   ├── MLA.yml
    │   ├── MoE.yml
    │   ├── Quantized_Matmul.yml
    │   ├── Multimodal_Inputs.yml
    │   ├── Quantized_KV_Cache.yml
    │   ├── Quantized_Attention.yml
    │   ├── data_parallelism.yml
    │   ├── async_scheduler.yml
    │   ├── LoRA_Torch.yml
    │   ├── KV_Cache_Host_Offloading.yml
    │   ├── DCN-Based_P-D_disaggregation.yml
    │   └── Collective_Communication_Matmul.yml
    ├── pipeline_generation
    │   ├── constant.py
    │   ├── feature_template.yml
    │   └── vllm_native_model_template.yml
    ├── scripts
    │   ├── notify_test_results.sh
    │   ├── check_results.sh
    │   ├── run_disagg.sh
    │   ├── record_step_result.sh
    │   ├── commit_verified_commit_hashes.sh
    │   └── commit_support_matrices.sh
    ├── parallelism
    │   ├── CP.yml
    │   ├── EP.yml
    │   ├── SP.yml
    │   ├── TP.yml
    │   └── DP.yml
    └── quantization
    │   ├── AWQ_INT4.yml
    │   ├── FP8_W8A8.yml
    │   ├── FP4_W4A16.yml
    │   ├── FP8_W8A16.yml
    │   ├── INT8_W8A8.yml
    │   └── INT4_W4A16.yml
├── requirements.txt
├── tpu_inference
    ├── logger.py
    ├── env_override.py
    ├── core
    │   ├── __init__.py
    │   ├── sched
    │   │   └── __init__.py
    │   └── disagg_utils.py
    ├── kernels
    │   ├── __init__.py
    │   ├── mla
    │   │   ├── __init__.py
    │   │   └── v1
    │   │   │   └── __init__.py
    │   ├── fused_moe
    │   │   ├── __init__.py
    │   │   └── v1
    │   │   │   └── __init__.py
    │   ├── megablox
    │   │   ├── __init__.py
    │   │   └── common.py
    │   ├── collectives
    │   │   ├── __init__.py
    │   │   ├── all_gather_matmul_tuned_block_sizes.py
    │   │   └── util.py
    │   ├── flash_attention
    │   │   └── __init__.py
    │   ├── quantized_matmul
    │   │   ├── __init__.py
    │   │   └── util.py
    │   └── ragged_paged_attention
    │   │   ├── __init__.py
    │   │   ├── v2
    │   │       └── __init__.py
    │   │   └── v3
    │   │       ├── __init__.py
    │   │       └── util.py
    ├── layers
    │   ├── __init__.py
    │   ├── jax
    │   │   ├── __init__.py
    │   │   ├── moe
    │   │   │   └── __init__.py
    │   │   ├── sample
    │   │   │   └── __init__.py
    │   │   ├── attention
    │   │   │   └── __init__.py
    │   │   ├── misc.py
    │   │   ├── glossary.md
    │   │   └── pp_utils.py
    │   ├── vllm
    │   │   ├── __init__.py
    │   │   └── quantization
    │   │   │   └── compressed_tensors
    │   │   │       ├── __init__.py
    │   │   │       └── schemes
    │   │   │           └── __init__.py
    │   └── common
    │   │   ├── __init__.py
    │   │   ├── quant_methods.py
    │   │   └── attention_metadata.py
    ├── lora
    │   └── __init__.py
    ├── models
    │   ├── __init__.py
    │   ├── jax
    │   │   ├── __init__.py
    │   │   └── utils
    │   │   │   ├── __init__.py
    │   │   │   └── qwix
    │   │   │       ├── __init__.py
    │   │   │       └── configs
    │   │   │           ├── int8_all_modules_w_only.yaml
    │   │   │           ├── fp8_all_modules_w_only.yaml
    │   │   │           ├── int8_default.yaml
    │   │   │           └── fp8_default.yaml
    │   ├── vllm
    │   │   ├── __init__.py
    │   │   └── vllm_model_wrapper_context.py
    │   └── common
    │   │   └── __init__.py
    ├── runner
    │   └── __init__.py
    ├── worker
    │   └── __init__.py
    ├── distributed
    │   └── __init__.py
    ├── executors
    │   └── __init__.py
    ├── experimental
    │   └── __init__.py
    ├── spec_decode
    │   ├── __init__.py
    │   └── jax
    │   │   └── __init__.py
    └── platforms
    │   └── __init__.py
├── requirements_v7x.txt
├── .dockerignore
├── pyproject.toml
├── tests
    ├── __init__.py
    ├── core
    │   ├── __init__.py
    │   ├── test_disagg_executor.py
    │   └── test_init.py
    ├── e2e
    │   ├── __init__.py
    │   ├── test_structured_decoding.py
    │   └── benchmarking
    │   │   └── bench_utils.sh
    ├── lora
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_bgmv.py
    │   └── test_lora_perf.py
    ├── executors
    │   └── __init__.py
    ├── kernels
    │   ├── __init__.py
    │   └── collectives
    │   │   └── __init__.py
    ├── layers
    │   ├── __init__.py
    │   ├── jax
    │   │   ├── __init__.py
    │   │   ├── moe
    │   │   │   └── __init__.py
    │   │   ├── attention
    │   │   │   └── __init__.py
    │   │   └── sample
    │   │   │   └── __init__.py
    │   ├── common
    │   │   └── __init__.py
    │   └── vllm
    │   │   ├── __init__.py
    │   │   └── test_fp8.py
    ├── models
    │   ├── __init__.py
    │   ├── jax
    │   │   ├── __init__.py
    │   │   └── utils
    │   │   │   └── __init__.py
    │   └── common
    │   │   └── __init__.py
    ├── platforms
    │   ├── __init__.py
    │   └── test_tpu_platform.py
    ├── runner
    │   └── __init__.py
    ├── worker
    │   └── __init__.py
    ├── distributed
    │   └── __init__.py
    ├── experimental
    │   └── __init__.py
    ├── spec_decode
    │   └── __init__.py
    └── scripts
    │   └── run_rpa_v3_tests.sh
├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    ├── workflows
    │   ├── pre-commit.yml
    │   ├── check_ready_label.yml
    │   └── release.yml
    ├── PULL_REQUEST_TEMPLATE
    │   └── MODELING_CODE_PR.md
    ├── ISSUE_TEMPLATE
    │   ├── 100-documentation.yml
    │   ├── 500-feature-request.yml
    │   └── 200-installation.yml
    ├── scripts
    │   └── determine_release_vars.sh
    └── CODEOWNERS
├── .readthedocs.yaml
├── DCO
├── scripts
    └── vllm
    │   └── integration
    │       └── conftest.py
└── setup.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit =
3 |     tpu_inference/kernels/*
4 | 


--------------------------------------------------------------------------------
/docs/assets/torchax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/torchax.png


--------------------------------------------------------------------------------
/docs/assets/get_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/get_model.png


--------------------------------------------------------------------------------
/docs/assets/model-fn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/model-fn.png


--------------------------------------------------------------------------------
/docs/assets/sandwich.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/sandwich.png


--------------------------------------------------------------------------------
/docs/assets/wrap_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/wrap_model.png


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include README.md
3 | include tpu_inference/models/jax/utils/quantization/configs/*.yaml
4 | 


--------------------------------------------------------------------------------
/docs/assets/tpu_inference-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference-light.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light.png


--------------------------------------------------------------------------------
/docs/assets/tpu-inference-banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu-inference-banner.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_2.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_mode.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_light_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light_mode.png


--------------------------------------------------------------------------------
/docs/assets/tpu_header_new_preview_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_header_new_preview_v1.png


--------------------------------------------------------------------------------
/docs/assets/tpu_header_new_preview_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_header_new_preview_v2.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark_20px_space.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_20px_space.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark_mode_short.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_mode_short.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_dark_more_space.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_dark_more_space.png


--------------------------------------------------------------------------------
/docs/assets/tpu_inference_light_mode_short.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/tpu-inference/HEAD/docs/assets/tpu_inference_light_mode_short.png


--------------------------------------------------------------------------------
/support_matrices/multimodal_model_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Model,UnitTest,IntegrationTest,Benchmark
2 | "meta-llama/Llama-4-Maverick-17B-128E-Instruct",unverified,unverified,unverified
3 | "Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅
4 | 


--------------------------------------------------------------------------------
/requirements_benchmarking.txt:
--------------------------------------------------------------------------------
1 | # These packages are required to run any of the benchmarks.
2 | # For more information, see scripts/vllm/benchmarking/README.md
3 | nltk
4 | evaluate
5 | datasets
6 | rouge-score
7 | scikit-learn
8 | pandas
9 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | mkdocs
 2 | mkdocs-api-autonav
 3 | mkdocs-material
 4 | mkdocs-awesome-nav
 5 | mkdocs-glightbox
 6 | python-markdown-math
 7 | mkdocs-same-dir
 8 | mkdocs-open-in-new-tab
 9 | mkdocs-table-reader-plugin
10 | 


--------------------------------------------------------------------------------
/.buildkite/features/default_features.txt:
--------------------------------------------------------------------------------
1 | Chunked Prefill (feature support matrix)
2 | Prefix Caching (feature support matrix)
3 | Ragged Paged Attention V3 (kernel support matrix)
4 | Single Program Multi Data (feature support matrix)
5 | 


--------------------------------------------------------------------------------
/support_matrices/parallelism_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,CorrectnessTest,PerformanceTest
2 | "CP",unverified,unverified
3 | "DP",❌,N/A
4 | "EP",unverified,unverified
5 | "PP",✅,✅
6 | "SP",unverified,unverified
7 | "TP",unverified,unverified
8 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/parallelism_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,CorrectnessTest,PerformanceTest
2 | "CP",unverified,unverified
3 | "DP",✅,unverified
4 | "EP",unverified,unverified
5 | "PP",✅,✅
6 | "SP",unverified,unverified
7 | "TP",unverified,unverified
8 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/multimodal_model_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Model,UnitTest,IntegrationTest,Benchmark
2 | "meta-llama/Llama-4-Maverick-17B-128E-Instruct",unverified,unverified,unverified
3 | "Qwen/Qwen2.5-VL-7B-Instruct",✅,✅,✅
4 | "Qwen/Qwen3-Omni-30B-A3B-Instruct",unverified,unverified,unverified
5 | 


--------------------------------------------------------------------------------
/support_matrices/text_only_model_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Model,UnitTest,IntegrationTest,Benchmark
2 | "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅
3 | "Qwen/Qwen3-4B",✅,✅,✅
4 | "google/gemma-3-27b-it",✅,✅,✅
5 | "Qwen/Qwen3-32B",✅,✅,✅
6 | "meta-llama/Llama-Guard-4-12B",✅,✅,✅
7 | "meta-llama/Llama-3.1-8B-Instruct",✅,✅,✅
8 | "Qwen/Qwen3-30B-A3B",✅,✅,✅
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | tpu-info==0.7.1
 2 | yapf==0.43.0
 3 | pytest
 4 | pytest-mock
 5 | absl-py
 6 | numpy
 7 | google-cloud-storage
 8 | jax[tpu]==0.8.0
 9 | jaxlib==0.8.0
10 | jaxtyping
11 | flax==0.11.1
12 | torchax==0.0.10
13 | qwix==0.1.1
14 | torchvision==0.24.0
15 | pathwaysutils
16 | parameterized
17 | numba==0.62.1
18 | runai-model-streamer[s3,gcs]==0.15.0
19 | 


--------------------------------------------------------------------------------
/support_matrices/kernel_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,CorrectnessTest,PerformanceTest
2 | "Collective Communication Matmul",✅,unverified
3 | "MLA",unverified,unverified
4 | "MoE",unverified,unverified
5 | "Quantized Attention",unverified,unverified
6 | "Quantized KV Cache",unverified,unverified
7 | "Quantized Matmul",unverified,unverified
8 | "Ragged Paged Attention V3",✅,✅
9 | 


--------------------------------------------------------------------------------
/support_matrices/quantization_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,Recommended TPU Generations,CorrectnessTest,PerformanceTest
2 | "AWQ INT4","v5, v6",unverified,unverified
3 | "FP4 W4A16",v7,unverified,unverified
4 | "FP8 W8A8",v7,unverified,unverified
5 | "FP8 W8A16",v7,unverified,unverified
6 | "INT4 W4A16","v5, v6",unverified,unverified
7 | "INT8 W8A8","v5, v6",unverified,unverified
8 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/kernel_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,CorrectnessTest,PerformanceTest
2 | "Collective Communication Matmul",✅,unverified
3 | "MLA",unverified,unverified
4 | "MoE",unverified,unverified
5 | "Quantized Attention",unverified,unverified
6 | "Quantized KV Cache",unverified,unverified
7 | "Quantized Matmul",unverified,unverified
8 | "Ragged Paged Attention V3",✅,✅
9 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/quantization_support_matrix.csv:
--------------------------------------------------------------------------------
1 | Feature,Recommended TPU Generations,CorrectnessTest,PerformanceTest
2 | "AWQ INT4","v5, v6",unverified,unverified
3 | "FP4 W4A16",v7,unverified,unverified
4 | "FP8 W8A8",v7,unverified,unverified
5 | "FP8 W8A16",v7,unverified,unverified
6 | "INT4 W4A16","v5, v6",unverified,unverified
7 | "INT8 W8A8","v5, v6",unverified,unverified
8 | 


--------------------------------------------------------------------------------
/tpu_inference/logger.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | from vllm.logger import _VllmLogger
 4 | from vllm.logger import init_logger as init_vllm_logger
 5 | 
 6 | 
 7 | def init_logger(name: str) -> _VllmLogger:
 8 |     # Prefix the module path with "vllm." (e.g. "a.b" -> "vllm.a.b") so the logger falls under vllm's configured logger.
 9 |     patched_name = "vllm." + name
10 |     return init_vllm_logger(patched_name)
11 | 


--------------------------------------------------------------------------------
/requirements_v7x.txt:
--------------------------------------------------------------------------------
 1 | # This file contains additional dependencies needed for TPU v7x support.
 2 | # It is expected to be used in conjunction with the main requirements.txt file.
 3 | --pre
 4 | -i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/
 5 | -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
 6 | jax==0.8.1
 7 | jaxlib==0.8.1
 8 | jaxtyping==0.3.2
 9 | libtpu==0.0.31
10 | 


--------------------------------------------------------------------------------
/tpu_inference/env_override.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | # SPDX-FileCopyrightText: Copyright contributors to the tpu-inference project
 3 | 
 4 | import os
 5 | 
 6 | # Disable the CUDA-specific shared-experts stream when running on TPU.
 7 | # This prevents errors caused by attempting to create CUDA streams on TPU hardware.
 8 | # The issue was introduced by vllm-project/vllm#26440; the variable is set as soon as this module is imported.
 9 | os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
10 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | /.venv
 2 | /build
 3 | dist
 4 | vllm/*.so
 5 | 
 6 | # Byte-compiled / optimized / DLL files
 7 | __pycache__/
 8 | *.py[cod]
 9 | *$py.class
10 | 
11 | .mypy_cache
12 | 
13 | # Distribution / packaging
14 | .Python
15 | /build/
16 | cmake-build-*/
17 | CMakeUserPresets.json
18 | develop-eggs/
19 | /dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 


--------------------------------------------------------------------------------
/support_matrices/feature_support_matrix.csv:
--------------------------------------------------------------------------------
 1 | Feature,CorrectnessTest,PerformanceTest
 2 | "Chunked Prefill",✅,✅
 3 | "DCN-based P/D disaggregation",unverified,unverified
 4 | "KV cache host offloading",unverified,unverified
 5 | "LoRA_Torch",✅,unverified
 6 | "Multimodal Inputs",✅,✅
 7 | "Out-of-tree model support",✅,✅
 8 | "Prefix Caching",✅,✅
 9 | "Single Program Multi Data",✅,✅
10 | "Speculative Decoding: Eagle3",✅,✅
11 | "Speculative Decoding: Ngram",✅,✅
12 | "async scheduler",✅,✅
13 | "runai_model_streamer_loader",✅,N/A
14 | "sampling_params",✅,N/A
15 | "structured_decoding",✅,N/A
16 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/text_only_model_support_matrix.csv:
--------------------------------------------------------------------------------
 1 | Model,UnitTest,IntegrationTest,Benchmark
 2 | "moonshotai/Kimi-K2-Thinking",unverified,unverified,unverified
 3 | "Qwen/Qwen3-Coder-480B-A35B-Instruct",unverified,unverified,unverified
 4 | "meta-llama/Llama-3.3-70B-Instruct",✅,✅,✅
 5 | "Qwen/Qwen3-4B",✅,✅,✅
 6 | "google/gemma-3-27b-it",✅,✅,✅
 7 | "Qwen/Qwen3-32B",✅,✅,✅
 8 | "deepseek-ai/DeepSeek-V3.1",unverified,unverified,unverified
 9 | "meta-llama/Llama-Guard-4-12B",✅,✅,✅
10 | "openai/gpt-oss-120b",unverified,unverified,unverified
11 | "meta-llama/Llama-3.1-8B-Instruct",✅,✅,✅
12 | "Qwen/Qwen3-30B-A3B",✅,✅,✅
13 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/support_matrices/nightly/feature_support_matrix.csv:
--------------------------------------------------------------------------------
 1 | Feature,CorrectnessTest,PerformanceTest
 2 | "Chunked Prefill",✅,✅
 3 | "DCN-based P/D disaggregation",unverified,✅
 4 | "KV cache host offloading",unverified,unverified
 5 | "LoRA_Torch",✅,✅
 6 | "Multimodal Inputs",✅,✅
 7 | "Out-of-tree model support",✅,✅
 8 | "Prefix Caching",✅,✅
 9 | "Single Program Multi Data",✅,✅
10 | "Single-Host-P-D-disaggregation",N/A,N/A
11 | "Speculative Decoding: Eagle3",✅,✅
12 | "Speculative Decoding: Ngram",✅,✅
13 | "async scheduler",✅,✅
14 | "data_parallelism",✅,❌
15 | "runai_model_streamer_loader",✅,N/A
16 | "sampling_params",✅,N/A
17 | "structured_decoding",✅,N/A
18 | 


--------------------------------------------------------------------------------
/tests/core/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/e2e/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/lora/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/executors/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/kernels/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/jax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/models/jax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/platforms/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/runner/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/worker/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/distributed/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/experimental/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/jax/moe/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/models/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/spec_decode/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/core/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/lora/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/runner/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/worker/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/kernels/collectives/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/jax/attention/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/layers/jax/sample/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tests/models/jax/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/core/sched/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/distributed/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/executors/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/experimental/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/mla/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/vllm/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/spec_decode/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/fused_moe/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/megablox/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/mla/v1/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/moe/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/sample/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/common/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/spec_decode/jax/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/collectives/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/fused_moe/v1/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/attention/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/qwix/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/flash_attention/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/quantized_matmul/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/ragged_paged_attention/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/ragged_paged_attention/v2/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/ragged_paged_attention/v3/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/tpu_inference/platforms/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # ruff: noqa
16 | from tpu_inference.platforms.tpu_platform import TpuPlatform
17 | 


--------------------------------------------------------------------------------
/tests/layers/vllm/test_fp8.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import pytest
16 | 
17 | pytest.skip("FP8 implementation not complete yet", allow_module_level=True)
18 | 


--------------------------------------------------------------------------------
/.buildkite/pipeline_generation/constant.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | QUEUE_TO_TENSOR_PARALLEL_SIZE_MAP = {  # queue name -> tensor parallel size
16 |     "tpu_v6e_queue": 1,
17 |     "tpu_v6e_8_queue": 8,
18 | }
19 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/qwix/configs/int8_all_modules_w_only.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | qwix:
16 |   rules:
17 |     # NOTE: each entry corresponds to a qwix.QuantizationRule
18 |     - module_path: '.*'
19 |       weight_qtype: 'int8'
20 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/qwix/configs/fp8_all_modules_w_only.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | qwix:
16 |   rules:
17 |     # NOTE: each entry corresponds to a qwix.QuantizationRule
18 |     - module_path: '.*'
19 |       weight_qtype: 'float8_e4m3fn'
20 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/qwix/configs/int8_default.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | qwix:
16 |   rules:
17 |     # NOTE: each entry corresponds to a qwix.QuantizationRule
18 |     - module_path: '.*'
19 |       weight_qtype: 'int8'
20 |       act_qtype: 'int8'
21 | 


--------------------------------------------------------------------------------
/tpu_inference/models/jax/utils/qwix/configs/fp8_default.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | qwix:
16 |   rules:
17 |     # NOTE: each entry corresponds to a qwix.QuantizationRule
18 |     - module_path: '.*'
19 |       weight_qtype: 'float8_e4m3fn'
20 |       act_qtype: 'float8_e4m3fn'
21 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/common/quant_methods.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | UNQUANTIZED = "unquantized"
16 | MXFP4 = "mxfp4"
17 | AWQ = "awq"
18 | COMPRESSED_TENSORS = "compressed-tensors"
19 | FP8 = "fp8"
20 | 
21 | 
22 | def get_tpu_quant_method(quant_method: str) -> str:
23 |     return "tpu-" + quant_method  # e.g. "fp8" -> "tpu-fp8"
24 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/notify_test_results.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -e
17 | # Read the CI_TESTS_FAILED flag from the build's meta-data.
18 | ANY_FAILED=$(buildkite-agent meta-data get "CI_TESTS_FAILED")
19 | FAILURE_LABEL="Not all models and/or features passed"
20 | 
21 | echo "--- Checking test outcomes"
22 | # Fail this step only when the flag is exactly "true"; any other value passes.
23 | if [ "${ANY_FAILED}" = "true" ] ; then
24 |   echo "${FAILURE_LABEL}"
25 |   exit 1
26 | else
27 |   echo "All models & features passed."
28 | fi
29 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | # Description
 2 | 
 3 | Start with a short description of what the PR does and how this is a change from
 4 | the past.
 5 | 
 6 | The rest of the description should include relevant details and context, for example:
 7 | - why is this change being made,
 8 | - the problem being solved and any relevant context,
 9 | - why this is a good solution,
10 | - some information about the specific implementation,
11 | - shortcomings of the solution and possible future improvements.
12 | 
13 | If the change fixes a GitHub issue, please include a link, e.g.:
14 | FIXES: #123456
15 | 
16 | # Tests
17 | 
18 | Please describe how you tested this change, and include any instructions and/or
19 | commands to reproduce.
20 | 
21 | # Checklist
22 | 
23 | Before submitting this PR, please make sure:
24 | - I have performed a self-review of my code.
25 | - I have necessary comments in my code, particularly in hard-to-understand areas.
26 | - I have made or will make corresponding changes to any relevant documentation.
27 | 


--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: pre-commit
16 | 
17 | on:
18 |   pull_request:
19 |   push:
20 |     branches: [main]
21 | 
22 | jobs:
23 |   pre-commit:
24 |     runs-on: ubuntu-latest
25 |     steps:
26 |       - uses: actions/checkout@v4
27 |       - name: Set up Python
28 |         uses: actions/setup-python@v5
29 |         with:
30 |           python-version: '3.12'
31 |       - name: Install pre-commit
32 |         run: pip install pre-commit
33 |       - name: Run pre-commit
34 |         uses: pre-commit/action@v3.0.1
35 |         with:
36 |           extra_args: --all-files
37 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/misc.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import math
16 | from typing import Tuple
17 | 
18 | import jax
19 | from jax.sharding import NamedSharding
20 | from jax.sharding import PartitionSpec as P
21 | 
22 | 
23 | # TODO(xiang): move this to weight_utils.py
24 | def shard_put(x: jax.Array, sharding_names: Tuple[str, ...] | P,
25 |               mesh: jax.sharding.Mesh) -> jax.Array:
26 |     # A mesh whose axis sizes multiply to 1 holds a single device: put x on
27 |     # it directly. This special case avoids the recursive jit error.
28 |     if math.prod(mesh.axis_sizes) == 1:
29 |         return jax.device_put(x, mesh.devices.flatten()[0])
30 |     return jax.device_put(x, NamedSharding(mesh, P(*sharding_names)))
31 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Read the Docs configuration file
16 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
17 | 
18 | # Required
19 | version: 2
20 | 
21 | # Set the OS, Python version, and other tools you might need
22 | build:
23 |   os: ubuntu-24.04
24 |   tools:
25 |     python: "3.13"
26 | 
27 | # Build documentation with MkDocs
28 | mkdocs:
29 |    configuration: mkdocs.yml
30 | 
31 | # Optionally, but recommended,
32 | # declare the Python requirements required to build your documentation
33 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
34 | python:
35 |    install:
36 |    - requirements: docs/requirements.txt
37 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/MODELING_CODE_PR.md:
--------------------------------------------------------------------------------
 1 | # Description
 2 | 
 3 | Start with a short description of what the PR does and how this is a change from
 4 | the past.
 5 | 
 6 | The rest of the description should include relevant details and context, for example:
 7 | - why is this change being made,
 8 | - the problem being solved and any relevant context,
 9 | - why this is a good solution,
10 | - some information about the specific implementation,
11 | - shortcomings of the solution and possible future improvements.
12 | 
13 | If the change fixes a GitHub issue, please include a link, e.g.:
14 | FIXES: #123456
15 | 
16 | # Tests
17 | 
18 | Please describe how you tested this change, and include any instructions and/or
19 | commands to reproduce.
20 | 
21 | # Checklist
22 | 
23 | Before submitting this PR, please make sure (put X in square brackets):
24 | - [ ] I have performed a self-review of my code.
25 | - [ ] I have necessary comments in my code, particularly in hard-to-understand areas.
26 | - [ ] I have made or will make corresponding changes to any relevant documentation.
27 | - [ ] I have reviewed the uLLM modeling code checklist: https://docs.google.com/document/d/1DGQBVvr2bh4G8tBUO1YH8pO7Dd_myw5rfEMfVDymEk8/edit?resourcekey=0-V7MGHu3aQjJH6YrI3-y8Hg&tab=t.t91cyovog2mr#heading=h.cqdzv8mlszca
28 | - [ ] I have received at least 1 readability approval and 1 correctness approval.
29 | 


--------------------------------------------------------------------------------
/tests/scripts/run_rpa_v3_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -euo pipefail  # fail fast: do not run tests if the install step fails
17 | # Install dependencies
18 | pip install -U --pre jax jaxlib libtpu requests -i https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
19 | 
20 | TPU_INFERENCE_DIR="/workspace/tpu_inference/"
21 | 
22 | # RPA v3 test files - add new tests here
23 | RPA_V3_TESTS=(
24 |     "tests/kernels/ragged_paged_attention_kernel_v3_test.py"
25 |     "tests/layers/attention/test_deepseek_v3_attention.py"
26 | )
27 | 
28 | # Prefix each test with the repo dir (one array element per path, no "//")
29 | FULL_PATHS=()
30 | for test in "${RPA_V3_TESTS[@]}"; do
31 |     FULL_PATHS+=("${TPU_INFERENCE_DIR%/}/$test")
32 | done
33 | 
34 | # Run all tests in a single pytest command
35 | pytest "${FULL_PATHS[@]}"
36 | 


--------------------------------------------------------------------------------
/DCO:
--------------------------------------------------------------------------------
 1 | Developer Certificate of Origin
 2 | Version 1.1
 3 | 
 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
 5 | 
 6 | Everyone is permitted to copy and distribute verbatim copies of this
 7 | license document, but changing it is not allowed.
 8 | 
 9 | 
10 | Developer's Certificate of Origin 1.1
11 | 
12 | By making a contribution to this project, I certify that:
13 | 
14 | (a) The contribution was created in whole or in part by me and I
15 |     have the right to submit it under the open source license
16 |     indicated in the file; or
17 | 
18 | (b) The contribution is based upon previous work that, to the best
19 |     of my knowledge, is covered under an appropriate open source
20 |     license and I have the right under that license to submit that
21 |     work with modifications, whether created in whole or in part
22 |     by me, under the same open source license (unless I am
23 |     permitted to submit under a different license), as indicated
24 |     in the file; or
25 | 
26 | (c) The contribution was provided directly to me by some other
27 |     person who certified (a), (b) or (c) and I have not modified
28 |     it.
29 | 
30 | (d) I understand and agree that this project and the contribution
31 |     are public and that a record of the contribution (including all
32 |     personal information I submit with it, including my sign-off) is
33 |     maintained indefinitely and may be redistributed consistent with
34 |     this project or the open source license(s) involved.
35 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/check_results.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
set -e

# Usage: check_results.sh <failure_label> <step_key_1> [<step_key_2> ...]
# Reads the outcome of every given step; if any outcome is something other
# than "passed" or "skipped", uploads a one-step pipeline labelled
# <failure_label> and exits 1.
if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <failure_label> <step_key_1> <step_key_2> ..."
    exit 1
fi

FAILURE_LABEL="$1"
shift
ANY_FAILED=false

echo "--- Checking Test Outcomes"

for KEY in "$@"; do
    # A failing lookup is reported as "skipped" rather than aborting the script.
    OUTCOME=$(buildkite-agent step get "outcome" --step "${KEY}" || echo "skipped")
    echo "Step ${KEY} outcome: ${OUTCOME}"

    case "${OUTCOME}" in
        passed | skipped) ;;
        *) ANY_FAILED=true ;;
    esac
done

if [ "${ANY_FAILED}" != "true" ]; then
    echo "All relevant TPU tests passed (or were skipped)."
    exit 0
fi

# At least one step had a bad outcome: surface it as a dedicated step, then fail.
cat <<- YAML | buildkite-agent pipeline upload
steps:
   - label: "${FAILURE_LABEL}"
     agents:
       queue: cpu
     command: echo "${FAILURE_LABEL}"
YAML
exit 1
50 | 


--------------------------------------------------------------------------------
/.github/workflows/check_ready_label.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
name: Enforce Ready Label

on:
  pull_request:
    types: [opened, synchronize, reopened, labeled, unlabeled]

jobs:
  check-ready-label:
    name: Check for Ready Label
    runs-on: ubuntu-latest
    steps:
      - name: Verify label existence
        shell: bash
        env:
          # Pass the label list through the environment instead of pasting the
          # expression into the script text: label names and descriptions are
          # free-form, and a single quote in one of them would otherwise
          # terminate the shell string (breaking the check or injecting code).
          LABELS_JSON: ${{ toJSON(github.event.pull_request.labels) }}
        run: |
          echo "Checking for 'ready' label..."
          echo "Current labels: $LABELS_JSON"

          # jq -e exits 0 only when the filter's result is true, i.e. when at
          # least one label object has the exact name "ready".
          if jq -e 'any(.[]; .name == "ready")' <<<"$LABELS_JSON" >/dev/null; then
            echo "'ready' label found. Check passed."
            exit 0
          else
            echo "'ready' label NOT found."
            echo "Blocking merge until 'ready' label is applied."
            exit 1
          fi
42 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/run_disagg.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 
set -ex

IMAGE_NAME='vllm-tpu'
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# Source the environment setup script
# shellcheck disable=SC1091
source "$SCRIPT_DIR/setup_docker_env.sh"
setup_environment "$IMAGE_NAME"

SCRIPT_DIR=$SCRIPT_DIR/../../examples/disagg

# call the /examples/disagg/run_disagg_multi_host.sh script
CONTAINER_PREFIX="disagg-node"

# Record the run's exit status instead of letting `set -e` abort here, so the
# container cleanup below also happens when the run fails.
RUN_STATUS=0
CONTAINER_PREFIX=${CONTAINER_PREFIX} \
RUN_IN_BUILDKITE=true \
MODEL=${MODEL:="Qwen/Qwen3-0.6B"} \
DOCKER_IMAGE=${IMAGE_NAME}:${BUILDKITE_COMMIT} \
"$SCRIPT_DIR/run_disagg_multi_host.sh" "$@" || RUN_STATUS=$?

# clear existing containers
CONTAINERS=$(docker ps -a --filter "name=${CONTAINER_PREFIX}*" -q)
if [ -n "$CONTAINERS" ]; then
  # A failed graceful stop must not prevent the forced removal that follows.
  # shellcheck disable=SC2086
  docker stop $CONTAINERS || true
  # shellcheck disable=SC2086
  docker rm -f $CONTAINERS
fi

# Report the outcome of the disagg run itself.
exit "$RUN_STATUS"
45 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/glossary.md:
--------------------------------------------------------------------------------
 1 | ### Variable Glossary
 2 | 
 3 | | Variable | Full Name | Description |
 4 | | :--- | :--- | :--- |
 5 | | **B** | Batch Size | The number of samples processed at once. |
 6 | | **T** | Query Sequence Length | The number of tokens in the Query sequence. |
 7 | | **S** | Key/Value Sequence Length | The number of tokens in the Key/Value sequence. |
 8 | | **D** | $d_{model}$ | The embedding dimension of the model. |
 9 | | **F** | $d_{ff}$ | The hidden dimension of the feed-forward MLP layers. |
10 | | **V** | Vocab Size | The size of the vocabulary. |
11 | | **H** | Head Dimension | The dimension of each attention head, typically $D/N$. |
12 | | **N** | Number of Query Heads | The total number of query heads in multi-head attention. |
13 | | **Q** | Number of Query Heads | Synonymous with **N**. |
14 | | **K** | Number of Key/Value Heads | The total number of key/value heads. |
15 | | **C** | Expert Capacity | The maximum number of tokens an expert can process in an MoE layer. |
16 | | **X** | Activated Experts | The number of activated experts per token in MoE. |
17 | | **G** | Number of Groups | The number of groups for grouped-query attention. |
18 | | **E** | Total Experts | The total number of experts in the MoE layer. |
19 | | **M** | Experts per Group | The number of experts within each group, where $M = E/G$. |
20 | | **A** | Q Lora Rank | Used for DeepSeek models. |
21 | | **L** | Product of QK NoPE Head Dim and V Head Dim | Used for DeepSeek models. |
22 | | **P** | Product of Total (NoPE + RoPE) QK Head Dim and V Head Dim | Used for DeepSeek models. |
23 | | **R** | Product of Number of Attention Heads and V Head Dim | Used for DeepSeek models. |
24 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/common/attention_metadata.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import functools
16 | from dataclasses import dataclass, field
17 | from typing import Any
18 | 
19 | import jax
20 | 
21 | 
@functools.partial(
    jax.tree_util.register_dataclass,
    data_fields=[
        "input_positions",
        "block_tables",
        "seq_lens",
        "query_start_loc",
        "request_distribution",
    ],
    meta_fields=[],
    drop_fields=["query_start_loc_cpu", "seq_lens_cpu"],
)
@dataclass
class AttentionMetadata(object):
    """Attention inputs bundled as a dataclass registered with JAX.

    The class is passed through ``jax.tree_util.register_dataclass``: the five
    array fields are listed as ``data_fields``, there are no ``meta_fields``,
    and the two ``*_cpu`` fields are listed as ``drop_fields``.
    NOTE(review): presumably the dropped fields are left out when the object
    is flattened as a pytree -- confirm against the installed JAX version.

    The shape noted above each field comes from the original author's
    comments; nothing in this class enforces it.
    """
    # (padded_total_num_scheduled_tokens,)
    input_positions: jax.Array
    # (max_num_seqs * max_num_blocks_per_req,)
    block_tables: jax.Array = None
    # (max_num_seqs,)
    seq_lens: jax.Array = None
    # (max_num_seqs + 1,)
    query_start_loc: jax.Array = None
    # (3,)
    request_distribution: jax.Array = None

    # Not constructor arguments (init=False) and declared without a default,
    # so these attributes do not exist on a new instance until something
    # assigns them explicitly.
    query_start_loc_cpu: Any = field(init=False)
    seq_lens_cpu: Any = field(init=False)
49 | 


--------------------------------------------------------------------------------
/.buildkite/features/sampling_params.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: sampling_params
16 | # pipeline-type: feature support matrix
17 | # Sampling parameters control how the model selects tokens during generation.
18 | # These tests verify that temperature, top_p, top_k, and logprobs work correctly.
19 | steps:
20 |   - label: "Correctness tests for sampling_params"
21 |     key: "sampling_params_CorrectnessTest"
22 |     soft_fail: true
23 |     agents:
24 |       queue: tpu_v6e_queue
25 |     commands:
26 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_sampling_params.py
27 |   - label: "Record correctness test result for sampling_params"
28 |     key: "record_sampling_params_CorrectnessTest"
29 |     depends_on: "sampling_params_CorrectnessTest"
30 |     env:
31 |       CI_TARGET: "sampling_params"
32 |       CI_STAGE: "CorrectnessTest"
33 |       CI_CATEGORY: "feature support matrix"
34 |     agents:
35 |       queue: cpu
36 |     commands:
37 |       - |
38 |         .buildkite/scripts/record_step_result.sh sampling_params_CorrectnessTest
39 | 


--------------------------------------------------------------------------------
/tpu_inference/core/disagg_utils.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
from typing import Tuple, Union
 4 | 
 5 | from tpu_inference import envs
 6 | 
 7 | 
def is_disagg_enabled() -> bool:
    """Return True when ``envs.PREFILL_SLICES`` holds a truthy value."""
    # We trigger our code path as long as prefill slices are set. This
    # allows us to test interleave mode effectively with the code path
    # for comparison purposes.
    return bool(envs.PREFILL_SLICES)
13 | 
14 | 
15 | def _parse_slices(slices_str: str) -> Tuple[int, ...]:
16 |     """Parse slices environment variable and return the a list of integers, each the size of a slice.
17 | 
18 |     For example, if slices_str is set to `2x2,2x1,2x4`, we should return `(4, 2, 8)`.
19 | 
20 |     Throws exception if the slice str is malformed.
21 |     """
22 |     if not slices_str:
23 |         return ()
24 | 
25 |     try:
26 |         slice_sizes = []
27 |         for s in slices_str.split(','):
28 |             dims = s.split('x')
29 |             if len(dims) == 1:
30 |                 slice_sizes.append(int(dims[0]))
31 |             elif len(dims) == 2:
32 |                 slice_sizes.append((int(dims[0]), int(dims[1])))
33 |             else:
34 |                 raise ValueError("Each slice must be in 'N' or 'NxM' format.")
35 |         return tuple(slice_sizes)
36 |     except ValueError as e:
37 |         raise ValueError(f"Malformed slice string: '{slices_str}'") from e
38 | 
39 | 
def get_prefill_slices() -> Tuple[Union[int, Tuple[int, int]], ...]:
    """Return the slices parsed from ``envs.PREFILL_SLICES``.

    Returns:
        ``()`` when the setting is unset or empty; otherwise the result of
        ``_parse_slices``, whose elements are ints (for ``N`` entries) or
        ``(N, M)`` pairs (for ``NxM`` entries).

    Raises:
        ValueError: Propagated from ``_parse_slices`` for a malformed value.
    """
    if not envs.PREFILL_SLICES:
        return ()
    return _parse_slices(envs.PREFILL_SLICES)
44 | 
45 | 
def get_decode_slices() -> Tuple[Union[int, Tuple[int, int]], ...]:
    """Return the slices parsed from ``envs.DECODE_SLICES``.

    Returns:
        ``()`` when the setting is unset or empty; otherwise the result of
        ``_parse_slices``, whose elements are ints (for ``N`` entries) or
        ``(N, M)`` pairs (for ``NxM`` entries).

    Raises:
        ValueError: Propagated from ``_parse_slices`` for a malformed value.
    """
    if not envs.DECODE_SLICES:
        return ()
    return _parse_slices(envs.DECODE_SLICES)
50 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/100-documentation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: 📚 Documentation
16 | description: Report an issue related to https://github.com/vllm-project/tpu-inference/tree/main/docs
17 | title: "[Doc]: "
18 | labels: ["documentation"]
19 | 
20 | body:
21 | - type: textarea
22 |   attributes:
23 |     label: 📚 The doc issue
24 |     description: >
25 |       A clear and concise description of what content in https://github.com/vllm-project/tpu-inference/tree/main/docs is an issue.
26 |   validations:
27 |     required: true
28 | - type: textarea
29 |   attributes:
30 |     label: Suggest a potential alternative/fix
31 |     description: >
32 |       Tell us how we could improve the documentation in this regard.
33 | - type: markdown
34 |   attributes:
35 |     value: >
36 |       Thanks for contributing 🎉!
37 | - type: checkboxes
38 |   id: askllm
39 |   attributes:
40 |     label: Before submitting a new issue...
41 |     options:
42 |       - label: Make sure you already searched for relevant issues and checked the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs).
43 |         required: true
44 | 


--------------------------------------------------------------------------------
/tests/lora/conftest.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import tempfile
16 | 
17 | import pytest
18 | from vllm.config import set_current_vllm_config
19 | from vllm.distributed import cleanup_dist_env_and_memory
20 | from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
21 |                                              init_distributed_environment)
22 | from vllm.engine.arg_utils import EngineArgs
23 | 
24 | 
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment for a test.

    Builds an engine config for a small model, makes it the current vLLM
    config, initializes the distributed environment with the ``gloo`` backend
    and a file-based init method, and yields the config to the test. After
    the test, ``cleanup_dist_env_and_memory`` is called.

    Yields:
        The config object returned by ``EngineArgs.create_engine_config()``.
    """
    engine_args = EngineArgs(
        model="Qwen/Qwen2-1.5B-Instruct",
        max_model_len=64,
        max_num_batched_tokens=64,
        max_num_seqs=4,
    )

    vllm_config = engine_args.create_engine_config()

    with set_current_vllm_config(vllm_config):
        # Only a unique path is needed for the file:// init method.
        # tempfile.mkstemp() would also hand back an open OS-level descriptor
        # that leaks if it is ignored; here the handle is closed as soon as
        # the with-block ends while the file itself stays on disk
        # (delete=False).
        with tempfile.NamedTemporaryFile(delete=False) as init_file:
            temp_file = init_file.name
        # NOTE(review): the positional 1 and 0 are presumably world size and
        # rank -- confirm against init_distributed_environment's signature.
        init_distributed_environment(
            1,
            0,
            local_rank=0,
            distributed_init_method=f"file://{temp_file}",
            backend="gloo")
        ensure_model_parallel_initialized(1, 1)
        yield vllm_config
    cleanup_dist_env_and_memory(shutdown_ray=True)
47 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/record_step_result.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
set -e

# Usage: record_step_result.sh <step_key>
# Looks up the outcome of <step_key>, stores a display symbol for it in build
# meta-data (keys are built from CI_TARGET, CI_STAGE and CI_CATEGORY), and
# exits 1 unless the outcome is "passed", "skipped" or "unverified".
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <step_key>"
    exit 1
fi

STEP_KEY="$1"

echo "--- Checking ${STEP_KEY} Outcome"

# A status stored in meta-data under the step key takes precedence; fall back
# to the step's own outcome ("skipped" if that lookup fails).
CUSTOM_STATUS=$(buildkite-agent meta-data get "${STEP_KEY}" --default "")
OUTCOME="${CUSTOM_STATUS}"
if [ -z "${OUTCOME}" ]; then
    OUTCOME=$(buildkite-agent step get "outcome" --step "${STEP_KEY}" || echo "skipped")
fi

echo "Step ${STEP_KEY} outcome: ${OUTCOME}"

# Map the outcome to the recorded symbol and to this script's exit status.
message=""
exit_status=0
case "${OUTCOME}" in
  passed)
    message="✅"
    ;;
  skipped)
    message="N/A"
    ;;
  unverified)
    message="unverified"
    ;;
  *)
    message="❌"
    exit_status=1
    ;;
esac

buildkite-agent meta-data set "${CI_TARGET}_category" "${CI_CATEGORY}"
buildkite-agent meta-data set "${CI_TARGET}:${CI_STAGE}" "${message}"

exit "${exit_status}"
60 | 


--------------------------------------------------------------------------------
/scripts/vllm/integration/conftest.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
def pytest_addoption(parser):
    """Register this suite's custom command-line options with pytest.

    Args:
        parser: The pytest option parser; ``addoption`` is called on it once
            per option, in the order listed below.
    """
    # (flag, value type, default, help text)
    option_specs = (
        ("--tensor-parallel-size", int, 1,
         "The tensor parallel size to use for the test."),
        ("--expected-value", float, None,
         "This value will be used to compare the measure value and determine if the test passes or fails."
         ),
        ("--model-name", str, None, "Model name to test (e.g., 'model1')"),
        ("--fp8-kv-model-name", str, None,
         "Model name to test fp8-kv (e.g., 'model1')"),
        ("--dataset-path", str, None,
         "Path to the dataset file used for accuracy evaluation (CSV or PKL)."
         ),
    )
    for flag, value_type, default_value, help_text in option_specs:
        parser.addoption(flag,
                         type=value_type,
                         default=default_value,
                         help=help_text)
43 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | """All-gather matmul kernel's tuned block sizes."""
 3 | 
 4 | import re
 5 | 
 6 | import jax
 7 | 
# Lookup table of tuned block sizes. get_key() in this module builds keys in
# exactly the field order listed here, and get_tuned_block_sizes() falls back
# to (None, None) when a key is absent.
# key:
#    - tpu_version
#    - m
#    - n
#    - k
#    - dtype
#    - tp_size
# value:
#    - bn
#    - bk
# NOTE(review): bn / bk are presumably the block sizes along the n and k
# dimensions -- confirm against the kernel that consumes them.
TUNED_BLOCK_SIZES = {
    # go/keep-sorted start
    (6, 1024, 51200, 5120, 'bfloat16', 8): (6400, 2560),
    (6, 1024, 57344, 8192, 'bfloat16', 8): (7168, 8192),
    (6, 2048, 51200, 5120, 'bfloat16', 8): (1280, 5120),
    (6, 2048, 57344, 8192, 'bfloat16', 8): (1024, 8192),
    (6, 4096, 51200, 5120, 'bfloat16', 8): (3200, 5120),
    (6, 8192, 51200, 5120, 'bfloat16', 8): (1280, 5120),
    # go/keep-sorted end
}
28 | 
29 | 
def get_tpu_version() -> int:
    """Return the TPU version number of the first JAX device.

    The version is the first run of digits in the device kind, after an
    optional trailing ``' lite'`` has been removed (``"TPU v6 lite"`` -> 6,
    ``"TPU7x"`` -> 7).

    Returns:
        The version as an int, or -1 when the device kind does not contain
        ``'TPU'``.
    """
    device_kind = jax.devices()[0].device_kind
    if 'TPU' in device_kind:
        lite_suffix = ' lite'
        if device_kind.endswith(lite_suffix):
            device_kind = device_kind[:len(device_kind) - len(lite_suffix)]

        # v6: "TPU v6"
        # v7: "TPU7x"
        assert device_kind.startswith('TPU'), device_kind
        first_digits = re.search(r'\d+', device_kind)
        return int(first_digits.group())
    return -1
42 | 
43 | 
def get_key(m, n, k, dtype, tp_size):
    """Build the ``TUNED_BLOCK_SIZES`` lookup key for the given parameters.

    Returns:
        The tuple ``(tpu_version, m, n, k, dtype, tp_size)``, where
        ``tpu_version`` comes from ``get_tpu_version()``.
    """
    tpu_version = get_tpu_version()
    return (tpu_version, m, n, k, dtype, tp_size)
60 | 
61 | 
def get_tuned_block_sizes(m, n, k, dtype_name, tp_size):
    """Look up the tuned block sizes for the given parameters.

    Returns:
        The ``TUNED_BLOCK_SIZES`` entry for the key built by ``get_key``, or
        ``(None, None)`` when the table has no entry for it.
    """
    lookup_key = get_key(m, n, k, dtype_name, tp_size)
    if lookup_key in TUNED_BLOCK_SIZES:
        return TUNED_BLOCK_SIZES[lookup_key]
    return (None, None)
66 | 


--------------------------------------------------------------------------------
/.buildkite/features/Single-Host-P-D-disaggregation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Single-Host-P-D-disaggregation
16 | # pipeline-type: features support matrix
17 | steps:
18 |   - label: "Correctness tests for Single-Host P-D disaggregation"
19 |     key: "SingleHostPDDisaggregation_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_8_queue
23 |     commands:
24 |       - |
25 |         .buildkite/scripts/run_in_docker.sh \
26 |           python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_local_disagg.py::test_disaggregated_serving \
27 |           /workspace/tpu_inference/tests/e2e/test_local_disagg.py::test_disaggregated_serving_correctness
28 |   - label: "Record correctness test result for Single-Host P-D disaggregation"
29 |     key: "record_SingleHostPDDisaggregation_CorrectnessTest"
30 |     depends_on: "SingleHostPDDisaggregation_CorrectnessTest"
31 |     env:
32 |       CI_TARGET: "SingleHostPDDisaggregation"
33 |       CI_STAGE: "CorrectnessTest"
34 |       CI_CATEGORY: "features support matrix"
35 |     agents:
36 |       queue: cpu
37 |     commands:
38 |       - |
39 |         .buildkite/scripts/record_step_result.sh SingleHostPDDisaggregation_CorrectnessTest
40 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/megablox/common.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Common utilities for GMM kernels."""
15 | 
16 | import re
17 | 
18 | import jax
19 | import jax.numpy as jnp
20 | 
21 | 
def is_tpu() -> bool:
    """Report whether the first JAX device's kind string contains "TPU"."""
    first_device = jax.devices()[0]
    return "TPU" in first_device.device_kind
24 | 
25 | 
def tpu_kind() -> str:
    """Return the ``device_kind`` string of the first JAX device."""
    devices = jax.devices()
    return devices[0].device_kind
29 | 
30 | 
# Device kinds usually look like "TPU v{version}{variant}" (e.g. "TPU v5p"),
# while TPU v7 reports "TPU7x"; the optional " v" accepts both spellings and
# group 2 captures the generation digits.
_TPU_KIND_PATTERN = re.compile(r"TPU( v)?(\d+)")


def tpu_generation() -> int:
    """Return the generation number parsed from ``tpu_kind()``.

    Raises:
        NotImplementedError: If the device kind does not start with a
            recognized TPU prefix followed by digits.
    """
    matched = _TPU_KIND_PATTERN.match(tpu_kind())
    if matched is None:
        raise NotImplementedError("only TPU devices are supported")
    return int(matched.group(2))
41 | 
42 | 
43 | def assert_is_supported_dtype(dtype: jnp.dtype) -> None:
44 |     if dtype not in [
45 |             jnp.bfloat16,
46 |             jnp.float32,
47 |             jnp.float8_e4m3fn,
48 |             jnp.float8_e5m2,
49 |             jnp.int8,
50 |             jnp.int4,
51 |             jnp.float4_e2m1fn,
52 |             jnp.uint4,
53 |     ]:
54 |         raise ValueError(f"No support for {dtype=}.")
55 | 


--------------------------------------------------------------------------------
/.buildkite/features/Structured_Decoding.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: structured_decoding
16 | # pipeline-type: feature support matrix
17 | # Structured decoding allows constraining the model's output to follow a
18 | # specific format, such as choosing from a predefined set of options or
19 | # following a JSON schema. This is useful for classification tasks,
20 | # structured data extraction, and ensuring outputs conform to expected formats.
21 | steps:
22 |   - label: "Correctness tests for structured_decoding"
23 |     key: "structured_decoding_CorrectnessTest"
24 |     soft_fail: true
25 |     agents:
26 |       queue: tpu_v6e_queue
27 |     commands:
28 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_structured_decoding.py::test_structured_decoding
29 |   - label: "Record correctness test result for structured_decoding"
30 |     key: "record_structured_decoding_CorrectnessTest"
31 |     depends_on: "structured_decoding_CorrectnessTest"
32 |     env:
33 |       CI_TARGET: "structured_decoding"
34 |       CI_STAGE: "CorrectnessTest"
35 |       CI_CATEGORY: "feature support matrix"
36 |     agents:
37 |       queue: cpu
38 |     commands:
39 |       - |
40 |         .buildkite/scripts/record_step_result.sh structured_decoding_CorrectnessTest
41 | 


--------------------------------------------------------------------------------
/docs/recommended_models_features.md:
--------------------------------------------------------------------------------
 1 | # Recommended Model and Feature Matrices
 2 | 
 3 | Although vLLM TPU’s new unified backend makes out-of-the-box, high-performance serving possible with any model supported in vLLM, the reality is that we're still in the process of implementing a few core components.
 4 | For this reason, until we land more capabilities, we recommend starting from the stress-tested models and features listed below.
 5 | 
 6 | We are still landing components in tpu-inference that will improve performance for larger scale, higher complexity models (XL MoE, +vision encoders, MLA, etc.).
 7 | 
 8 | If you’d like us to prioritize something specific, please submit a GitHub feature request [here](https://github.com/vllm-project/tpu-inference/issues/new/choose).
 9 | 
10 | ## Recommended Models
11 | 
12 | These tables show the models currently tested for accuracy and performance.
13 | 
14 | ### Text-Only Models
15 | 
16 | {{ read_csv('../support_matrices/text_only_model_support_matrix.csv', keep_default_na=False) }}
17 | 
18 | ### Multimodal Models
19 | 
20 | {{ read_csv('../support_matrices/multimodal_model_support_matrix.csv', keep_default_na=False) }}
21 | 
22 | ## Recommended Features
23 | 
24 | This table shows the features currently tested for accuracy and performance.
25 | 
26 | {{ read_csv('../support_matrices/feature_support_matrix.csv', keep_default_na=False) }}
27 | 
28 | ## Kernel Support
29 | 
30 | This table shows the current kernel support status.
31 | 
32 | {{ read_csv('../support_matrices/kernel_support_matrix.csv', keep_default_na=False) }}
33 | 
34 | ## Parallelism Support
35 | 
36 | This table shows the current parallelism support status.
37 | 
38 | {{ read_csv('../support_matrices/parallelism_support_matrix.csv', keep_default_na=False) }}
39 | 
40 | ## Quantization Support
41 | 
42 | This table shows the current quantization support status.
43 | 
44 | {{ read_csv('../support_matrices/quantization_support_matrix.csv', keep_default_na=False) }}
45 | 


--------------------------------------------------------------------------------
/tpu_inference/layers/jax/pp_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import List, Protocol
16 | 
17 | from flax import nnx
18 | from vllm.distributed import get_pp_group
19 | from vllm.distributed.utils import get_pp_indices
20 | 
21 | 
22 | class PPMissingLayer(nnx.Module):
23 |     """
24 |     A placeholder layer for missing layers in a pipeline parallel model.
25 | 
26 |     Calling it returns its first input unchanged; `make_layers` uses it for
27 |     every layer index outside the local rank's range.
28 |     """
29 | 
30 |     def __init__(self, *args, **kwargs):
31 |         # Accepts and ignores any constructor arguments.
32 |         pass
33 | 
34 |     def __call__(self, *args, **kwargs):
35 |         """Return the first arg from args or the first value from kwargs.
36 | 
37 |         Raises:
38 |             StopIteration: If called with no positional and no keyword
39 |                 arguments.
40 |         """
41 |         return args[0] if args else next(iter(kwargs.values()))
42 | 
43 | 
44 | class LayerFn(Protocol):
45 |     """Zero-argument callable that returns an `nnx.Module`."""
46 | 
47 |     def __call__(self) -> nnx.Module:
48 |         ...
49 | 
50 | 
51 | def make_layers(
52 |     num_hidden_layers: int,
53 |     layer_fn: LayerFn,
54 | ) -> tuple[int, int, List[nnx.Module]]:
55 |     """Build the layer list for the current pipeline parallel rank.
56 | 
57 |     `get_pp_indices` is given `num_hidden_layers`, this rank's index in the
58 |     pipeline parallel group and the group's world size, and returns the
59 |     `(start_layer, end_layer)` pair used below. Indices in
60 |     `[start_layer, end_layer)` get a layer built by `layer_fn`; all other
61 |     indices in `[0, num_hidden_layers)` get a `PPMissingLayer`.
62 | 
63 |     Args:
64 |         num_hidden_layers: Total layer count; the index range is
65 |             `[0, num_hidden_layers)`.
66 |         layer_fn: Called with no arguments once per index in
67 |             `[start_layer, end_layer)`.
68 | 
69 |     Returns:
70 |         A `(start_layer, end_layer, layers)` tuple.
71 |     """
72 |     start_layer, end_layer = get_pp_indices(num_hidden_layers,
73 |                                             get_pp_group().rank_in_group,
74 |                                             get_pp_group().world_size)
75 | 
76 |     layers = [PPMissingLayer() for _ in range(start_layer)] \
77 |         + [layer_fn() for _ in range(start_layer, end_layer)] \
78 |         + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]
79 | 
80 |     return start_layer, end_layer, layers
81 | 


--------------------------------------------------------------------------------
/.github/scripts/determine_release_vars.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -eu pipefail
17 | 
18 | # --- SCHEDULE TRIGGER ---
19 | if [[ "$GH_EVENT_NAME"  == "schedule" ]]; then
20 |     echo "Trigger: Schedule - Generating nightly build"
21 | 
22 |     # --- Get Base Version from Tag ---
23 |     echo "Fetching latest tags..."
24 |     git fetch --tags --force
25 |     echo "Finding the latest stable version tag (vX.Y.Z)..."
26 |     LATEST_STABLE_TAG=$(git tag --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n 1)
27 |     if [[ -z "$LATEST_STABLE_TAG" ]]; then
28 |         echo "Warning: No stable tag found."
29 |         exit 1
30 |     else
31 |         BASE_VERSION=${LATEST_STABLE_TAG#v}
32 |     fi
33 |     echo "Using BASE_VERSION=${BASE_VERSION}"
34 | 
35 |     # --- Generate Nightly Version ---
36 |     DATETIME_STR=$(date -u +%Y%m%d)
37 |     VERSION="${BASE_VERSION}.dev${DATETIME_STR}"
38 | 
39 | # --- PUSH TAG TRIGGER ---
40 | elif [[ "$GH_EVENT_NAME" == "push" && "$GH_REF" == refs/tags/* ]]; then
41 |     echo "Trigger: Push Tag - Generating stable build"
42 |     TAG_NAME="$GH_REF_NAME"
43 |     VERSION=${TAG_NAME#v}
44 | 
45 | else
46 |     echo "Error: Unknown or unsupported trigger."
47 |     exit 1
48 | fi
49 | 
50 | # --- output ---
51 | echo "Final determined values: VERSION=${VERSION}"
52 | echo "VERSION=${VERSION}" >> "$GITHUB_OUTPUT"
53 | 


--------------------------------------------------------------------------------
/.buildkite/features/Hybrid_kvcache.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # hybrid kv cache
16 | # Hybrid kv cache allows the kv cache mgr to allocate a different number of
17 | # blocks for different attention types. This is useful for models with more
18 | # than 1 attention type (e.g. gpt-oss 120b, gemma-27b
19 | # with full + sliding window attn) to save HBM memory for kv cache and be able
20 | # to accommodate more requests.
21 | steps:
22 |   - label: "Correctness tests for hybrid kv cache allocation"
23 |     key: "hybrid_kvcache_CorrectnessTest"
24 |     soft_fail: true
25 |     agents:
26 |       queue: tpu_v6e_8_queue
27 |     commands:
28 |       - |
29 |         .buildkite/scripts/run_in_docker.sh \
30 |           python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_hybrid_kvcache.py::test_hybrid_kv_cache \
31 |           /workspace/tpu_inference/tests/e2e/test_hybrid_kvcache.py::test_hybrid_kv_cache_correctness
32 |   - label: "Record correctness test result for hybrid kv cache allocation"
33 |     key: "record_hybrid_kvcache_CorrectnessTest"
34 |     depends_on: "hybrid_kvcache_CorrectnessTest"
35 |     env:
36 |       CI_TARGET: "hybrid_kvcache"
37 |       CI_STAGE: "CorrectnessTest"
38 |       CI_CATEGORY: "feature support matrix"
39 |     agents:
40 |       queue: cpu
41 |     commands:
42 |       - |
43 |         .buildkite/scripts/record_step_result.sh hybrid_kvcache_CorrectnessTest
44 | 


--------------------------------------------------------------------------------
/.buildkite/features/runai_model_streamer_loader.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: runai_model_streamer_loader
16 | # pipeline-type: feature support matrix
17 | # The RunAI Model Streamer is a high-performance model loader that serves as an
18 | # alternative to the default Hugging Face loader. Instead of downloading a model
19 | # to local disk, it streams the weights from object storage (like GCS) into
20 | # GPU memory. This streaming process is significantly faster than the traditional
21 | # disk-based loading method.
22 | steps:
23 |   - label: "Correctness tests for runai_model_streamer_loader"
24 |     key: "runai_model_streamer_loader_CorrectnessTest"
25 |     soft_fail: true
26 |     agents:
27 |       queue: tpu_v6e_queue
28 |     commands:
29 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_runai_model_streamer_loader.py::test_correctness
30 |   - label: "Record correctness test result for runai_model_streamer_loader"
31 |     key: "record_runai_model_streamer_loader_CorrectnessTest"
32 |     depends_on: "runai_model_streamer_loader_CorrectnessTest"
33 |     env:
34 |       CI_TARGET: "runai_model_streamer_loader"
35 |       CI_STAGE: "CorrectnessTest"
36 |       CI_CATEGORY: "feature support matrix"
37 |     agents:
38 |       queue: cpu
39 |     commands:
40 |       - |
41 |         .buildkite/scripts/record_step_result.sh runai_model_streamer_loader_CorrectnessTest
42 | 


--------------------------------------------------------------------------------
/tpu_inference/models/vllm/vllm_model_wrapper_context.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from contextlib import contextmanager
16 | from dataclasses import dataclass
17 | from typing import Dict, List, Optional
18 | 
19 | import jax
20 | from jax.sharding import Mesh
21 | 
22 | 
23 | @dataclass
24 | class VllmModelWrapperContext:
25 |     """Values made available through `get_vllm_model_wrapper_context`."""
26 | 
27 |     kv_caches: List[jax.Array]
28 |     mesh: Mesh
29 |     # None when `set_vllm_model_wrapper_context` is called without a mapping.
30 |     layer_name_to_kvcache_index: Optional[Dict[str, int]]
31 | 
32 | 
33 | # Currently active context; None outside `set_vllm_model_wrapper_context`.
34 | _vllm_model_wrapper_context: Optional[VllmModelWrapperContext] = None
35 | 
36 | 
37 | def get_vllm_model_wrapper_context() -> VllmModelWrapperContext:
38 |     """Return the active `VllmModelWrapperContext`.
39 | 
40 |     Raises:
41 |         AssertionError: If no context is currently set.
42 |     """
43 |     assert _vllm_model_wrapper_context is not None, (
44 |         "VllmModelWrapperContext is not set. "
45 |         "Please use `set_vllm_model_wrapper_context` to set the VllmModelWrapperContext."
46 |     )
47 |     return _vllm_model_wrapper_context
48 | 
49 | 
50 | @contextmanager
51 | def set_vllm_model_wrapper_context(
52 |     *,
53 |     kv_caches: List[jax.Array],
54 |     mesh: Mesh,
55 |     layer_name_to_kvcache_index: Optional[Dict[str, int]] = None,
56 | ):
57 |     """Install a `VllmModelWrapperContext` for the duration of a `with` block.
58 | 
59 |     The previously active context (possibly None) is saved on entry and
60 |     restored on exit, including when the body raises, so calls can be nested.
61 | 
62 |     Args:
63 |         kv_caches: Stored as `kv_caches` on the context.
64 |         mesh: Stored as `mesh` on the context.
65 |         layer_name_to_kvcache_index: Stored on the context as-is; defaults to
66 |             None.
67 |     """
68 |     global _vllm_model_wrapper_context
69 |     prev_context = _vllm_model_wrapper_context
70 |     _vllm_model_wrapper_context = VllmModelWrapperContext(
71 |         kv_caches=kv_caches,
72 |         mesh=mesh,
73 |         layer_name_to_kvcache_index=layer_name_to_kvcache_index,
74 |     )
75 | 
76 |     try:
77 |         yield
78 |     finally:
79 |         # Always restore, even if the body of the `with` block raised.
80 |         _vllm_model_wrapper_context = prev_context
81 | 


--------------------------------------------------------------------------------
/tests/e2e/test_structured_decoding.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains end-to-end tests for structured decoding.
16 | #
17 | # Structured decoding allows constraining the model's output to follow a
18 | # specific format, such as choosing from a predefined set of options or
19 | # following a JSON schema. This is useful for classification tasks,
20 | # structured data extraction, and ensuring outputs conform to expected formats.
21 | 
22 | # The tests in this file verify that:
23 | # 1. Choice-based structured decoding correctly constrains output to valid options
24 | # 2. The model produces deterministic results when given structured constraints
25 | 
26 | from __future__ import annotations
27 | 
28 | from vllm import LLM, SamplingParams
29 | from vllm.sampling_params import StructuredOutputsParams
30 | 
31 | 
32 | def test_structured_decoding():
33 |     """Choice-constrained generation must return one of the given choices."""
34 |     llm = LLM(model='meta-llama/Llama-3.2-1B-Instruct',
35 |               max_model_len=1024,
36 |               max_num_seqs=1,
37 |               enable_prefix_caching=False)
38 | 
39 |     choices = ['Positive', 'Negative']
40 |     structured_outputs_params = StructuredOutputsParams(choice=choices)
41 |     sampling_params = SamplingParams(
42 |         structured_outputs=structured_outputs_params)
43 |     outputs = llm.generate(
44 |         prompts="Classify this sentiment: tpu-inference is wonderful!",
45 |         sampling_params=sampling_params,
46 |     )
47 |     # The generated text must be exactly one of the allowed choice strings.
48 |     assert outputs[0].outputs[0].text in choices
49 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/collectives/util.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | """utilities for collective kernels."""
 3 | 
 4 | import functools
 5 | 
 6 | from jax.experimental import pallas as pl
 7 | from jax.experimental.pallas import tpu as pltpu
 8 | 
 9 | 
10 | def local_barrier(left_neighbor, right_neighbor, double_barrier=True):
11 |     """Performs a barrier with neighbors on the global barrier semaphore.
12 | 
13 |     Optionally performs a second barrier, which prevents a potential race
14 |     when reusing the same collective_id across kernel invocations.
15 | 
16 |     Args:
17 |         left_neighbor: Left neighbor device id.
18 |         right_neighbor: Right neighbor device id.
19 |         double_barrier: Whether to perform a second barrier.
20 |     """
21 |     barrier_sem = pltpu.get_barrier_semaphore()
22 |     for neighbor in [left_neighbor, right_neighbor]:
23 |         pltpu.semaphore_signal(
24 |             barrier_sem,
25 |             inc=1,
26 |             device_id=(neighbor, ),
27 |             device_id_type=pltpu.DeviceIdType.MESH,
28 |         )
29 |     pltpu.semaphore_wait(barrier_sem, 2)  # one signal expected per neighbor
30 |     if double_barrier:
31 |         # The double-barrier prevents a race condition where one neighbor can
32 |         # re-enter the kernel again on a subsequent call and increment the
33 |         # barrier semaphore a second time. This would unblock the current device
34 |         # even if the other neighbor is not ready yet.
35 |         # To implement a double-barrier, we stack-allocate a second REGULAR
36 |         # semaphore using run_scoped.
37 |         @functools.partial(pl.run_scoped,
38 |                            second_barrier=pltpu.SemaphoreType.REGULAR)
39 |         def _(second_barrier):
40 |             for neighbor in [left_neighbor, right_neighbor]:
41 |                 pltpu.semaphore_signal(
42 |                     second_barrier,
43 |                     inc=1,
44 |                     device_id=(neighbor, ),
45 |                     device_id_type=pltpu.DeviceIdType.MESH,
46 |                 )
47 |             pltpu.semaphore_wait(second_barrier, 2)
48 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/ragged_paged_attention/v3/util.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utility functions for ragged paged attention."""
15 | import jax
16 | from jax._src import dtypes
17 | 
18 | 
19 | def cdiv(a, b):
20 |     """Return a divided by b, rounded up (for integers with b > 0)."""
21 |     assert b != 0
22 |     return (a + b - 1) // b
23 | 
24 | 
25 | def align_to(x, a):
26 |     """Round x up to the nearest multiple of a."""
27 |     return cdiv(x, a) * a
28 | 
29 | 
30 | def get_dtype_bitwidth(dtype):
31 |     """Return the width of dtype in bits.
32 | 
33 |     Uses `dtypes.bit_width` when that attribute exists and falls back to
34 |     `dtypes.itemsize_bits` otherwise.
35 |     """
36 |     return (dtypes.bit_width(dtype)
37 |             if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
38 | 
39 | 
40 | def get_dtype_packing(dtype):
41 |     """Return how many values of dtype fit in 32 bits (32 // bit width)."""
42 |     bits = get_dtype_bitwidth(dtype)
43 |     return 32 // bits
44 | 
45 | 
46 | def next_power_of_2(x: int):
47 |     """Finds the smallest power of 2 >= x using bit manipulation.
48 | 
49 |     Args:
50 |         x: The input number (should be an integer); must be > 0.
51 | 
52 |     Returns:
53 |         The smallest integer power of 2 that is >= x.
54 |     """
55 |     assert x > 0
56 |     if x == 1:
57 |         return 1
58 |     return 1 << (x - 1).bit_length()
59 | 
60 | 
61 | def get_tpu_version() -> int:
62 |     """Returns the numeric version of the TPU, or -1 if not on TPU.
63 | 
64 |     Only the first device returned by `jax.devices()` is inspected.
65 | 
66 |     Raises:
67 |         AssertionError: If the device kind, after suffix stripping, is neither
68 |             'TPU7x' nor 'TPU v' followed by exactly one character.
69 |     """
70 |     kind = jax.devices()[0].device_kind
71 |     if 'TPU' not in kind:
72 |         return -1
73 |     # Drop a trailing ' lite', then a trailing 'p' or 'e'.
74 |     if kind.endswith(' lite'):
75 |         kind = kind[:-len(' lite')]
76 |     if kind.endswith('p') or kind.endswith('e'):
77 |         kind = kind[:-1]
78 |     if kind == 'TPU7x':
79 |         return 7
80 |     # What remains must be 'TPU v' plus a single version character.
81 |     assert kind[:-1] == 'TPU v', kind
82 |     return int(kind[-1])
83 | 


--------------------------------------------------------------------------------
/.buildkite/features/MLA.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: MLA
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for MLA"
19 |     key: "MLA_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "MLA_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for MLA"
27 |     key: "record_MLA_CorrectnessTest"
28 |     depends_on: "MLA_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "MLA"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "kernel support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh MLA_CorrectnessTest
38 | 
39 |   - label: "Performance tests for MLA"
40 |     key: "MLA_PerformanceTest"
41 |     depends_on: "record_MLA_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "MLA_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for MLA"
49 |     key: "record_MLA_PerformanceTest"
50 |     depends_on: "MLA_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "MLA"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "kernel support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh MLA_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/features/MoE.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: MoE
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for MoE"
19 |     key: "MoE_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "MoE_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for MoE"
27 |     key: "record_MoE_CorrectnessTest"
28 |     depends_on: "MoE_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "MoE"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "kernel support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh MoE_CorrectnessTest
38 | 
39 |   - label: "Performance tests for MoE"
40 |     key: "MoE_PerformanceTest"
41 |     depends_on: "record_MoE_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "MoE_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for MoE"
49 |     key: "record_MoE_PerformanceTest"
50 |     depends_on: "MoE_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "MoE"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "kernel support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh MoE_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/parallelism/CP.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: CP
16 | # pipeline-type: parallelism support matrix
17 | steps:
18 |   - label: "Correctness tests for CP"
19 |     key: "CP_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "CP_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for CP"
27 |     key: "record_CP_CorrectnessTest"
28 |     depends_on: "CP_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "CP"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "parallelism support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh CP_CorrectnessTest
38 | 
39 |   - label: "Performance tests for CP"
40 |     key: "CP_PerformanceTest"
41 |     depends_on: "record_CP_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "CP_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for CP"
49 |     key: "record_CP_PerformanceTest"
50 |     depends_on: "CP_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "CP"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "parallelism support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh CP_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/parallelism/EP.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: EP
16 | # pipeline-type: parallelism support matrix
17 | steps:
18 |   - label: "Correctness tests for EP"
19 |     key: "EP_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "EP_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for EP"
27 |     key: "record_EP_CorrectnessTest"
28 |     depends_on: "EP_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "EP"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "parallelism support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh EP_CorrectnessTest
38 | 
39 |   - label: "Performance tests for EP"
40 |     key: "EP_PerformanceTest"
41 |     depends_on: "record_EP_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "EP_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for EP"
49 |     key: "record_EP_PerformanceTest"
50 |     depends_on: "EP_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "EP"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "parallelism support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh EP_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/parallelism/SP.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: SP
16 | # pipeline-type: parallelism support matrix
17 | steps:
18 |   - label: "Correctness tests for SP"
19 |     key: "SP_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "SP_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for SP"
27 |     key: "record_SP_CorrectnessTest"
28 |     depends_on: "SP_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "SP"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "parallelism support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh SP_CorrectnessTest
38 | 
39 |   - label: "Performance tests for SP"
40 |     key: "SP_PerformanceTest"
41 |     depends_on: "record_SP_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "SP_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for SP"
49 |     key: "record_SP_PerformanceTest"
50 |     depends_on: "SP_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "SP"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "parallelism support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh SP_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/parallelism/TP.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: TP
16 | # pipeline-type: parallelism support matrix
17 | steps:
18 |   - label: "Correctness tests for TP"
19 |     key: "TP_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "TP_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for TP"
27 |     key: "record_TP_CorrectnessTest"
28 |     depends_on: "TP_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "TP"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "parallelism support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh TP_CorrectnessTest
38 | 
39 |   - label: "Performance tests for TP"
40 |     key: "TP_PerformanceTest"
41 |     depends_on: "record_TP_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "TP_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for TP"
49 |     key: "record_TP_PerformanceTest"
50 |     depends_on: "TP_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "TP"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "parallelism support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh TP_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/tests/lora/test_bgmv.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import jax
16 | import torch
17 | import torchax
18 | 
19 | from tpu_inference.lora.torch_lora_ops import bgmv_torch
20 | 
21 | 
def test_bgmv_torch():
    """Compare ``bgmv_torch`` with the loop-based reference on random data.

    Inputs are created on the 'jax' device inside a torchax environment with
    the first TPU selected as the default JAX device.
    """
    token_count, hidden_dim = 16, 128
    lora_slots, lora_rank = 9, 8

    with torchax.default_env():
        with jax.default_device(jax.devices("tpu")[0]):
            # One activation row per token.
            activations = torch.rand(token_count, hidden_dim, device='jax')
            # LoRA weights carry a singleton axis at position 1.
            lora_weights = torch.rand(lora_slots,
                                      1,
                                      lora_rank,
                                      hidden_dim,
                                      device='jax')
            # One LoRA slot id in [0, lora_slots) per token.
            lora_ids = torch.randint(0,
                                     lora_slots, (token_count, ),
                                     device='jax')

            actual = bgmv_torch(activations, lora_weights, lora_ids)
            expected = _ref_bgmv_torch(activations, lora_weights, lora_ids)
            torch.testing.assert_close(actual,
                                       expected,
                                       atol=3e-2,
                                       rtol=1e-3)
40 | 
41 | 
42 | def _ref_bgmv_torch(inputs, loras, idxs):
43 |     if len(loras.shape) == 4:
44 |         loras = loras.squeeze(axis=1)
45 | 
46 |     # Another equivalent ref impl is as the 2 lines below.
47 |     # selected_loras = loras[idxs]
48 |     # return torch.einsum('td,tld->tl', inputs, selected_loras)
49 |     num_tokens, _ = inputs.shape
50 |     outputs = []
51 |     for i in range(num_tokens):
52 |         input = inputs[i]  # [hidden_size]
53 |         lora = loras[idxs[i]]  # [max_lora_rank, hidden_size]
54 |         out = torch.matmul(lora, input)
55 |         outputs.append(out)
56 | 
57 |     return torch.stack(outputs, axis=0)
58 | 


--------------------------------------------------------------------------------
/tpu_inference/kernels/quantized_matmul/util.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | """Utility functions for quantized matmul kernel."""
 3 | from typing import Any, Callable
 4 | 
 5 | import jax
 6 | import jax.numpy as jnp
 7 | 
 8 | from tpu_inference.kernels.quantized_matmul.tuned_block_sizes import TunedValue
 9 | 
10 | 
11 | def unfold_args(
12 |     conditions: tuple[jax.Array | bool, ...],
13 |     fn_conditions: tuple[bool, ...],
14 |     fn: Callable[..., Any],
15 | ):
16 |     """Minimize run-time branching of fn by converting jnp.bool to python bool."""
17 |     if conditions:
18 |         arg = conditions[0]
19 |         if isinstance(arg, bool):
20 |             unfold_args(conditions[1:], fn_conditions + (arg, ), fn)
21 |         else:
22 |             assert arg.dtype == jnp.bool and arg.size == 1
23 |             jax.lax.cond(
24 |                 arg,
25 |                 lambda: unfold_args(conditions[1:], fn_conditions +
26 |                                     (True, ), fn),
27 |                 lambda: unfold_args(conditions[1:], fn_conditions +
28 |                                     (False, ), fn),
29 |             )
30 |     else:
31 |         fn(*fn_conditions)
32 | 
33 | 
def quantize_tensor(x: jax.Array, dtype: jnp.dtype, dim: int = -1):
    """Quantize ``x`` to ``dtype`` using one abs-max scale per slice along ``dim``.

    The scale of a slice is ``max(|x|) / dtype_max`` (reduced over ``dim`` with
    ``keepdims=True``). ``x`` is divided by that scale, clipped to the
    representable range of ``dtype`` and cast.

    Args:
        x: Array to quantize.
        dtype: Target integer or floating-point dtype.
        dim: Axis over which the abs-max is reduced. Defaults to the last axis.

    Returns:
        ``(x_q, scale)`` where ``x_q`` has dtype ``dtype`` and ``scale`` is
        float32 with size 1 along ``dim``. An all-zero slice yields zeros in
        ``x_q`` and a scale of 0.
    """
    if jnp.issubdtype(dtype, jnp.integer):
        dtype_info = jnp.iinfo(dtype)
        max_val = int(dtype_info.max)
        min_val = int(dtype_info.min)
    else:
        dtype_info = jnp.finfo(dtype)
        max_val = float(dtype_info.max)
        min_val = float(dtype_info.min)

    x_abs_max = jnp.max(jnp.abs(x), axis=dim, keepdims=True)
    scale = x_abs_max / max_val
    # An all-zero slice has scale == 0; dividing by it would produce 0/0 = NaN
    # in the quantized values. Divide by 1 there instead (the values are all
    # zero anyway) while still returning the true scale.
    safe_scale = jnp.where(scale == 0, jnp.ones_like(scale), scale)
    x_q = jnp.clip(x / safe_scale, min_val, max_val).astype(dtype)
    return x_q, scale.astype(jnp.float32)
48 | 
49 | 
def next_multiple(x, multiple):
    """Return ``((x + multiple - 1) // multiple) * multiple``.

    For positive integers this rounds ``x`` up to the nearest multiple of
    ``multiple``; a value that is already a multiple is returned unchanged.
    """
    block_count = (x + multiple - 1) // multiple
    return block_count * multiple
52 | 
53 | 
def get_kernel_name(tuned_value: TunedValue):
    """Build the kernel name string from the block sizes in ``tuned_value``.

    The result is ``quantized_matmul_kernel_<batch>_<out>_<in>``, using the
    ``batch_block_size``, ``out_block_size`` and ``in_block_size`` attributes.
    """
    return 'quantized_matmul_kernel_{}_{}_{}'.format(
        tuned_value.batch_block_size,
        tuned_value.out_block_size,
        tuned_value.in_block_size,
    )
59 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/AWQ_INT4.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: AWQ INT4
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for AWQ INT4"
19 |     key: "AWQ_INT4_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "AWQ_INT4_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for AWQ INT4"
27 |     key: "record_AWQ_INT4_CorrectnessTest"
28 |     depends_on: "AWQ_INT4_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "AWQ INT4"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh AWQ_INT4_CorrectnessTest
38 | 
39 |   - label: "Performance tests for AWQ INT4"
40 |     key: "AWQ_INT4_PerformanceTest"
41 |     depends_on: "record_AWQ_INT4_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "AWQ_INT4_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for AWQ INT4"
49 |     key: "record_AWQ_INT4_PerformanceTest"
50 |     depends_on: "AWQ_INT4_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "AWQ INT4"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh AWQ_INT4_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/FP8_W8A8.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: FP8 W8A8
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for FP8 W8A8"
19 |     key: "FP8_W8A8_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "FP8_W8A8_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for FP8 W8A8"
27 |     key: "record_FP8_W8A8_CorrectnessTest"
28 |     depends_on: "FP8_W8A8_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "FP8 W8A8"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh FP8_W8A8_CorrectnessTest
38 | 
39 |   - label: "Performance tests for FP8 W8A8"
40 |     key: "FP8_W8A8_PerformanceTest"
41 |     depends_on: "record_FP8_W8A8_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "FP8_W8A8_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for FP8 W8A8"
49 |     key: "record_FP8_W8A8_PerformanceTest"
50 |     depends_on: "FP8_W8A8_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "FP8 W8A8"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh FP8_W8A8_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/parallelism/DP.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: DP
16 | # pipeline-type: parallelism support matrix
17 | steps:
18 |   - label: "Correctness tests for DP"
19 |     key: "DP_CorrectnessTest"
20 |     soft_fail: true
21 |     env:
22 |       NEW_MODEL_DESIGN: "1"
23 |     agents:
24 |       queue: tpu_v6e_8_queue
25 |     commands:
26 |       - |
27 |         .buildkite/scripts/run_in_docker.sh \
28 |           bash -c 'python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_data_parallel.py'
29 |   - label: "Record correctness test result for DP"
30 |     key: "record_DP_CorrectnessTest"
31 |     depends_on: "DP_CorrectnessTest"
32 |     env:
33 |       CI_TARGET: "DP"
34 |       CI_STAGE: "CorrectnessTest"
35 |       CI_CATEGORY: "parallelism support matrix"
36 |     agents:
37 |       queue: cpu
38 |     commands:
39 |       - |
40 |         .buildkite/scripts/record_step_result.sh DP_CorrectnessTest
41 | 
42 |   - label: "Performance tests for DP"
43 |     key: "DP_PerformanceTest"
44 |     depends_on: "record_DP_CorrectnessTest"
45 |     soft_fail: true
46 |     agents:
47 |       queue: tpu_v6e_queue
48 |     commands:
49 |       - |
50 |         buildkite-agent meta-data set "DP_PerformanceTest" "unverified"
51 |   - label: "Record performance test result for DP"
52 |     key: "record_DP_PerformanceTest"
53 |     depends_on: "DP_PerformanceTest"
54 |     env:
55 |       CI_TARGET: "DP"
56 |       CI_STAGE: "PerformanceTest"
57 |       CI_CATEGORY: "parallelism support matrix"
58 |     agents:
59 |       queue: cpu
60 |     commands:
61 |       - |
62 |         .buildkite/scripts/record_step_result.sh DP_PerformanceTest
63 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/FP4_W4A16.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: FP4 W4A16
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for FP4 W4A16"
19 |     key: "FP4_W4A16_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "FP4_W4A16_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for FP4 W4A16"
27 |     key: "record_FP4_W4A16_CorrectnessTest"
28 |     depends_on: "FP4_W4A16_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "FP4 W4A16"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh FP4_W4A16_CorrectnessTest
38 | 
39 |   - label: "Performance tests for FP4 W4A16"
40 |     key: "FP4_W4A16_PerformanceTest"
41 |     depends_on: "record_FP4_W4A16_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "FP4_W4A16_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for FP4 W4A16"
49 |     key: "record_FP4_W4A16_PerformanceTest"
50 |     depends_on: "FP4_W4A16_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "FP4 W4A16"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh FP4_W4A16_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/FP8_W8A16.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: FP8 W8A16
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for FP8 W8A16"
19 |     key: "FP8_W8A16_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "FP8_W8A16_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for FP8 W8A16"
27 |     key: "record_FP8_W8A16_CorrectnessTest"
28 |     depends_on: "FP8_W8A16_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "FP8 W8A16"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh FP8_W8A16_CorrectnessTest
38 | 
39 |   - label: "Performance tests for FP8 W8A16"
40 |     key: "FP8_W8A16_PerformanceTest"
41 |     depends_on: "record_FP8_W8A16_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "FP8_W8A16_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for FP8 W8A16"
49 |     key: "record_FP8_W8A16_PerformanceTest"
50 |     depends_on: "FP8_W8A16_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "FP8 W8A16"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh FP8_W8A16_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/INT8_W8A8.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: INT8 W8A8
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for INT8 W8A8"
19 |     key: "INT8_W8A8_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "INT8_W8A8_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for INT8 W8A8"
27 |     key: "record_INT8_W8A8_CorrectnessTest"
28 |     depends_on: "INT8_W8A8_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "INT8 W8A8"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh INT8_W8A8_CorrectnessTest
38 | 
39 |   - label: "Performance tests for INT8 W8A8"
40 |     key: "INT8_W8A8_PerformanceTest"
41 |     depends_on: "record_INT8_W8A8_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "INT8_W8A8_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for INT8 W8A8"
49 |     key: "record_INT8_W8A8_PerformanceTest"
50 |     depends_on: "INT8_W8A8_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "INT8 W8A8"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh INT8_W8A8_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/quantization/INT4_W4A16.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: INT4 W4A16
16 | # pipeline-type: quantization support matrix
17 | steps:
18 |   - label: "Correctness tests for INT4 W4A16"
19 |     key: "INT4_W4A16_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "INT4_W4A16_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for INT4 W4A16"
27 |     key: "record_INT4_W4A16_CorrectnessTest"
28 |     depends_on: "INT4_W4A16_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "INT4 W4A16"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "quantization support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh INT4_W4A16_CorrectnessTest
38 | 
39 |   - label: "Performance tests for INT4 W4A16"
40 |     key: "INT4_W4A16_PerformanceTest"
41 |     depends_on: "record_INT4_W4A16_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "INT4_W4A16_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for INT4 W4A16"
49 |     key: "record_INT4_W4A16_PerformanceTest"
50 |     depends_on: "INT4_W4A16_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "INT4 W4A16"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "quantization support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh INT4_W4A16_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/pipeline_generation/feature_template.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: {FEATURE_NAME}
16 | # pipeline-type: {CATEGORY}
17 | steps:
18 |   - label: "Correctness tests for {FEATURE_NAME}"
19 |     key: "{SANITIZED_FEATURE_NAME}_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: {QUEUE}
23 |     commands:
24 |       - echo "placeholder"  # TODO : replace with your correctness test command
25 |   - label: "Record correctness test result for {FEATURE_NAME}"
26 |     key: "record_{SANITIZED_FEATURE_NAME}_CorrectnessTest"
27 |     depends_on: "{SANITIZED_FEATURE_NAME}_CorrectnessTest"
28 |     env:
29 |       CI_TARGET: "{FEATURE_NAME}"
30 |       CI_STAGE: "CorrectnessTest"
31 |       CI_CATEGORY: "{CATEGORY}"
32 |     agents:
33 |       queue: cpu
34 |     commands:
35 |       - |
36 |         .buildkite/scripts/record_step_result.sh {SANITIZED_FEATURE_NAME}_CorrectnessTest
37 | 
38 |   - label: "Performance tests for {FEATURE_NAME}"
39 |     key: "{SANITIZED_FEATURE_NAME}_PerformanceTest"
40 |     depends_on: "record_{SANITIZED_FEATURE_NAME}_CorrectnessTest"
41 |     soft_fail: true
42 |     agents:
43 |       queue: {QUEUE}
44 |     commands:
45 |       - echo "placeholder"  # TODO : replace with your performance test command
46 |   - label: "Record performance test result for {FEATURE_NAME}"
47 |     key: "record_{SANITIZED_FEATURE_NAME}_PerformanceTest"
48 |     depends_on: "{SANITIZED_FEATURE_NAME}_PerformanceTest"
49 |     env:
50 |       CI_TARGET: "{FEATURE_NAME}"
51 |       CI_STAGE: "PerformanceTest"
52 |       CI_CATEGORY: "{CATEGORY}"
53 |     agents:
54 |       queue: cpu
55 |     commands:
56 |       - |
57 |         .buildkite/scripts/record_step_result.sh {SANITIZED_FEATURE_NAME}_PerformanceTest
58 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | import os
 4 | from typing import List
 5 | 
 6 | from setuptools import find_packages, setup
 7 | 
 8 | ROOT_DIR = os.path.dirname(__file__)
 9 | 
10 | 
def get_path(*filepath) -> str:
    """Join the given path components onto ``ROOT_DIR``."""
    segments = (ROOT_DIR, ) + filepath
    return os.path.join(*segments)
13 | 
14 | 
def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt.

    ``-r <file>`` lines are followed recursively (paths are resolved through
    ``get_path``), other ``--option`` lines are dropped, and blank lines and
    ``#`` comment lines are skipped.

    Returns:
        The flattened list of requirement strings, or an empty list when
        reading fails with ``ValueError``.
    """

    def _read_requirements(filename: str) -> List[str]:
        with open(get_path(filename)) as f:
            lines = f.read().splitlines()
        resolved_requirements: List[str] = []
        for raw_line in lines:
            line = raw_line.strip()
            # Blank lines and comments are not requirement specifiers.
            if not line or line.startswith("#"):
                continue
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
            elif line.startswith("--"):
                continue
            else:
                resolved_requirements.append(line)
        return resolved_requirements

    try:
        return _read_requirements("requirements.txt")
    except ValueError:
        print("Failed to read requirements.txt in vllm_tpu.")
        # Return an explicit empty list; previously this path fell through to
        # an unbound local and raised UnboundLocalError.
        return []
36 | 
37 | 
def get_version():
    """Return ``VLLM_VERSION_OVERRIDE`` when set and non-empty, else "0.0.0"."""
    return os.getenv("VLLM_VERSION_OVERRIDE") or "0.0.0"
42 | 
43 | 
def _read_long_description() -> str:
    """Return the contents of README.md next to this file, or "" if absent.

    The file is opened once, as UTF-8, and closed before returning.
    """
    try:
        with open(get_path("README.md"), encoding="utf-8") as readme:
            return readme.read()
    except FileNotFoundError:
        return ""


# Package metadata; dependencies come from requirements.txt and the version
# from get_version().
setup(
    name="tpu_inference",
    version=get_version(),
    description="",
    long_description=_read_long_description(),
    long_description_content_type="text/markdown",
    author="tpu_inference Contributors",
    packages=find_packages(),
    python_requires=">=3.10",
    install_requires=get_requirements(),
    include_package_data=True,
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)
68 | 


--------------------------------------------------------------------------------
/.buildkite/features/Quantized_Matmul.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Quantized Matmul
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for Quantized Matmul"
19 |     key: "Quantized_Matmul_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "Quantized_Matmul_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for Quantized Matmul"
27 |     key: "record_Quantized_Matmul_CorrectnessTest"
28 |     depends_on: "Quantized_Matmul_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "Quantized Matmul"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "kernel support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh Quantized_Matmul_CorrectnessTest
38 | 
39 |   - label: "Performance tests for Quantized Matmul"
40 |     key: "Quantized_Matmul_PerformanceTest"
41 |     depends_on: "record_Quantized_Matmul_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "Quantized_Matmul_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for Quantized Matmul"
49 |     key: "record_Quantized_Matmul_PerformanceTest"
50 |     depends_on: "Quantized_Matmul_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "Quantized Matmul"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "kernel support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh Quantized_Matmul_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | # CODEOWNERS file for tpu-inference
 2 | # This file defines code ownership for different parts of the repository.
 3 | # Each line is a file pattern followed by one or more owners.
 4 | # Owners are notified when PRs modify code in their areas.
 5 | #
 6 | # Order matters - the last matching pattern takes precedence.
 7 | # Analysis includes full history from tpu_commons and tpu_inference paths.
 8 | 
 9 | # Default owners for everything in the repo (fallback)
10 | * @vipannalla
11 | 
12 | # CI/CD and Build Configuration
13 | /.buildkite/ @jcyang43 @QiliangCui
14 | /.github/ @jcyang43 @QiliangCui
15 | 
16 | # Documentation
17 | /docs/ @bvrockwell
18 | /README.md @bvrockwell
19 | /CONTRIBUTING.md @jrplatin @bvrockwell
20 | 
21 | # Distributed Computing
22 | /tpu_inference/distributed/ @mrjunwan-lang @sixiang-google
23 | 
24 | # Kernel Implementations (Performance-critical)
25 | /tpu_inference/kernels/ @kyuyeunk @yaochengji @bythew3i
26 | 
27 | # Model Layers (JAX and vLLM)
28 | /tpu_inference/layers/jax/ @py4 @bzgoogle @jrplatin @gpolovets1
29 | /tpu_inference/layers/vllm/ @kyuyeunk @hfan @vanbasten23
30 | 
31 | # Model Implementations (JAX and vLLM)
32 | /tpu_inference/models/jax/qwen2_5_vl.py @hfan @kwang3939
33 | /tpu_inference/models/jax/gpt_oss.py @bzgoogle
34 | /tpu_inference/models/jax/deepseek_v3.py @bzgoogle @gpolovets1 @jrplatin
35 | /tpu_inference/models/vllm/ @kyuyeunk @hfan @vanbasten23
36 | 
37 | # Runner and Execution
38 | /tpu_inference/runner/ @kyuyeunk @py4 @wenxindongwork @sixiang-google  @mrjunwan-lang
39 | /tpu_inference/runner/tpu_runner.py @py4 @kyuyeunk @wenxindongwork @sixiang-google
40 | /tpu_inference/runner/persistent_batch_manager.py @py4 @wenxindongwork
41 | /tpu_inference/runner/speculative_decoding_manager.py @py4 @Lumosis
42 | /tpu_inference/executors/ @sixiang-google @mrjunwan-lang
43 | /tpu_inference/core/ @sixiang-google @mrjunwan-lang @wenxindongwork
44 | 
45 | # Worker Management
46 | /tpu_inference/worker/ @sixiang-google @mrjunwan-lang @py4 @vanbasten23 @wenxindongwork
47 | 
48 | # Speculative Decoding
49 | /tpu_inference/spec_decode/ @py4 @Lumosis
50 | 
51 | # Platform Support
52 | /tpu_inference/platforms/ @sixiang-google @mrjunwan-lang
53 | 
54 | # LoRA and Adapters
55 | /tpu_inference/lora/ @vanbasten23
56 | /tpu_inference/runner/lora_utils.py @vanbasten23
57 | 
58 | # Docker Configuration
59 | /docker/ @jrplatin @QiliangCui
60 | 


--------------------------------------------------------------------------------
/docs/getting_started/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | This guide provides instructions for installing and running `tpu-inference`.
 4 | 
 5 | There are three ways to install `tpu-inference`:
 6 | 
 7 | 1. **[Install with pip](#install-using-pip)**
 8 | 2. **[Run with Docker](#run-with-docker)**
 9 | 3. **[Install from source](#install-from-source)**
10 | 
11 | ## Install using pip
12 | 
13 | 1. Create a working directory:
14 | 
15 |     ```shell
16 |     mkdir ~/work-dir
17 |     cd ~/work-dir
18 |     ```
19 | 
20 | 2. Set up a Python virtual environment:
21 | 
22 |     ```shell
23 |     python3.12 -m venv vllm_env --symlinks
24 |     source vllm_env/bin/activate
25 |     ```
26 | 
27 | 3. Install `vllm-tpu` using `pip`:
28 | 
29 |     ```shell
30 |     pip install vllm-tpu
31 |     ```
32 | 
33 | ## Run with Docker
34 | 
35 | Include the `--privileged`, `--net=host`, and `--shm-size=150gb` options to enable TPU interaction and shared memory.
36 | 
37 | ```shell
38 | export DOCKER_URI=vllm/vllm-tpu:latest
39 | sudo docker run -it --rm --name $USER-vllm --privileged --net=host \
40 |     -v /dev/shm:/dev/shm \
41 |     --shm-size 150gb \
42 |     -p 8000:8000 \
43 |     --entrypoint /bin/bash ${DOCKER_URI}
44 | ```
45 | 
46 | ## Install from source
47 | 
48 | For debugging or development purposes, you can install `tpu-inference` from source. `tpu-inference` is a plugin for `vllm`, so you need to install both from source.
49 | 
50 | 1. Install system dependencies:
51 | 
52 |     ```shell
53 |     sudo apt-get update && sudo apt-get install -y libopenblas-base libopenmpi-dev libomp-dev
54 |     ```
55 | 
56 | 1. Clone the `vllm` and `tpu-inference` repositories:
57 | 
58 |     ```shell
59 |     git clone https://github.com/vllm-project/vllm.git
60 |     git clone https://github.com/vllm-project/tpu-inference.git
61 |     ```
62 | 
63 | 1. Set up a Python virtual environment:
64 | 
65 |     ```shell
66 |     python3.12 -m venv vllm_env --symlinks
67 |     source vllm_env/bin/activate
68 |     ```
69 | 
70 | 1. Install `vllm` from source, targeting the TPU device:
71 | 
72 |     ```shell
73 |     cd vllm
74 |     pip install -r requirements/tpu.txt
75 |     VLLM_TARGET_DEVICE="tpu" pip install -e .
76 |     cd ..
77 |     ```
78 | 
79 | 1. Install `tpu-inference` from source:
80 | 
81 |     ```shell
82 |     cd tpu-inference
83 |     pip install -e .
84 |     cd ..
85 |     ```
86 | 


--------------------------------------------------------------------------------
/tests/platforms/test_tpu_platform.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from unittest.mock import MagicMock, patch
16 | 
17 | import pytest
18 | import torch
19 | from vllm.config import CacheConfig, VllmConfig
20 | 
21 | from tpu_inference.platforms.tpu_platform import TpuPlatform
22 | 
23 | 
class TestTpuPlatform:
    """Tests for ``TpuPlatform`` that run against mocked chip information."""

    @pytest.fixture
    def vllm_config(self):
        """Return a ``VllmConfig``-spec'd mock carrying an fp8 cache config.

        The cache config is a real ``CacheConfig``; the remaining sub-configs
        are plain mocks. No test visible in this class requests the fixture.
        """
        kv_cache_cfg = CacheConfig(
            block_size=16,
            gpu_memory_utilization=0.9,
            swap_space=4,
            cache_dtype="fp8",
        )

        config = MagicMock(spec=VllmConfig)
        config.cache_config = kv_cache_cfg
        config.model_config = MagicMock(dtype='bfloat16')
        config.scheduler_config = MagicMock(is_multimodal_model=False)
        config.parallel_config = MagicMock()
        config.compilation_config = MagicMock(
            mode="dynamo_trace_once",
            backend="openxla",
        )
        config.kv_transfer_config = None
        return config

    @pytest.mark.parametrize(
        "chip_name,expected_dtype",
        [
            ("v6e", torch.float8_e5m2),
            ("v5e", torch.float8_e4m3fn),
        ],
    )
    def test_fp8_dtype(self, chip_name, expected_dtype):
        """``fp8_dtype()`` must match the dtype expected for the mocked chip name."""
        # ``name`` is a reserved MagicMock constructor argument, so it has to
        # be assigned after the mock is created.
        chip = MagicMock()
        chip.name = chip_name

        with patch('tpu_inference.platforms.tpu_platform.init_logger'):
            with patch(
                    'tpu_inference.platforms.tpu_platform.device.get_local_chips',
                    return_value=(chip, None)):
                with patch('vllm.envs.VLLM_TPU_USING_PATHWAYS', False):
                    assert TpuPlatform.fp8_dtype() == expected_dtype
55 | 


--------------------------------------------------------------------------------
/.buildkite/features/Multimodal_Inputs.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Multimodal Inputs
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for Multimodal Inputs"
19 |     key: "Multimodal_Inputs_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py
25 |   - label: "Record correctness test result for Multimodal Inputs"
26 |     key: "record_Multimodal_Inputs_CorrectnessTest"
27 |     depends_on: "Multimodal_Inputs_CorrectnessTest"
28 |     env:
29 |       CI_TARGET: Multimodal Inputs
30 |       CI_STAGE: "CorrectnessTest"
31 |       CI_CATEGORY: "feature support matrix"
32 |     agents:
33 |       queue: cpu
34 |     commands:
35 |       - |
36 |         .buildkite/scripts/record_step_result.sh Multimodal_Inputs_CorrectnessTest
37 | 
38 |   - label: "Performance tests for Multimodal Inputs"
39 |     key: "Multimodal_Inputs_PerformanceTest"
40 |     depends_on: "record_Multimodal_Inputs_CorrectnessTest"
41 |     soft_fail: true
42 |     agents:
43 |       queue: tpu_v6e_queue
44 |     commands:
45 |       - .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/mm_bench.sh
46 |   - label: "Record performance test result for Multimodal Inputs"
47 |     key: "record_Multimodal_Inputs_PerformanceTest"
48 |     depends_on: "Multimodal_Inputs_PerformanceTest"
49 |     env:
50 |       CI_TARGET: Multimodal Inputs
51 |       CI_STAGE: "PerformanceTest"
52 |       CI_CATEGORY: "feature support matrix"
53 |     agents:
54 |       queue: cpu
55 |     commands:
56 |       - |
57 |         .buildkite/scripts/record_step_result.sh Multimodal_Inputs_PerformanceTest
58 | 


--------------------------------------------------------------------------------
/.buildkite/features/Quantized_KV_Cache.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Quantized KV Cache
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for Quantized KV Cache"
19 |     key: "Quantized_KV_Cache_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "Quantized_KV_Cache_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for Quantized KV Cache"
27 |     key: "record_Quantized_KV_Cache_CorrectnessTest"
28 |     depends_on: "Quantized_KV_Cache_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "Quantized KV Cache"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "kernel support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh Quantized_KV_Cache_CorrectnessTest
38 | 
39 |   - label: "Performance tests for Quantized KV Cache"
40 |     key: "Quantized_KV_Cache_PerformanceTest"
41 |     depends_on: "record_Quantized_KV_Cache_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "Quantized_KV_Cache_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for Quantized KV Cache"
49 |     key: "record_Quantized_KV_Cache_PerformanceTest"
50 |     depends_on: "Quantized_KV_Cache_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "Quantized KV Cache"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "kernel support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh Quantized_KV_Cache_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/features/Quantized_Attention.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Quantized Attention
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for Quantized Attention"
19 |     key: "Quantized_Attention_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "Quantized_Attention_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for Quantized Attention"
27 |     key: "record_Quantized_Attention_CorrectnessTest"
28 |     depends_on: "Quantized_Attention_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "Quantized Attention"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "kernel support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh Quantized_Attention_CorrectnessTest
38 | 
39 |   - label: "Performance tests for Quantized Attention"
40 |     key: "Quantized_Attention_PerformanceTest"
41 |     depends_on: "record_Quantized_Attention_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "Quantized_Attention_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for Quantized Attention"
49 |     key: "record_Quantized_Attention_PerformanceTest"
50 |     depends_on: "Quantized_Attention_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "Quantized Attention"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "kernel support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh Quantized_Attention_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/.buildkite/features/data_parallelism.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: data_parallelism
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for data_parallelism"
19 |     key: "data_parallelism_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_8_queue
23 |     commands:
24 |       - |
25 |         .buildkite/scripts/run_in_docker.sh \
26 |           bash -c 'NEW_MODEL_DESIGN=1 python3 -m pytest -s -v -x /workspace/tpu_inference/tests/e2e/test_data_parallel.py'
27 |   - label: "Record correctness test result for data_parallelism"
28 |     key: "record_data_parallelism_CorrectnessTest"
29 |     depends_on: "data_parallelism_CorrectnessTest"
30 |     env:
31 |       CI_TARGET: "data_parallelism"
32 |       CI_STAGE: "CorrectnessTest"
33 |       CI_CATEGORY: "feature support matrix"
34 |     agents:
35 |       queue: cpu
36 |     commands:
37 |       - |
38 |         .buildkite/scripts/record_step_result.sh data_parallelism_CorrectnessTest
39 | 
40 |   - label: "Performance tests for data_parallelism"
41 |     key: "data_parallelism_PerformanceTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_8_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "data_parallelism_PerformanceTest" "run together with correctness test"
48 | 
49 |   - label: "Record performance test result for data_parallelism"
50 |     key: "record_data_parallelism_PerformanceTest"
51 |     depends_on: "data_parallelism_PerformanceTest"
52 |     env:
53 |       CI_TARGET: "data_parallelism"
54 |       CI_STAGE: "PerformanceTest"
55 |       CI_CATEGORY: "feature support matrix"
56 |     agents:
57 |       queue: cpu
58 |     commands:
59 |       - |
60 |         .buildkite/scripts/record_step_result.sh data_parallelism_PerformanceTest
61 | 


--------------------------------------------------------------------------------
/.buildkite/features/async_scheduler.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: async scheduler
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for async scheduler"
19 |     key: "async_scheduler_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_async_scheduler.py::test_async_correctness
25 |   - label: "Record correctness test result for async scheduler"
26 |     key: "record_async_scheduler_CorrectnessTest"
27 |     depends_on: "async_scheduler_CorrectnessTest"
28 |     env:
29 |       CI_TARGET: "async scheduler"
30 |       CI_STAGE: "CorrectnessTest"
31 |       CI_CATEGORY: "feature support matrix"
32 |     agents:
33 |       queue: cpu
34 |     commands:
35 |       - |
36 |         .buildkite/scripts/record_step_result.sh async_scheduler_CorrectnessTest
37 | 
38 |   - label: "Performance tests for async scheduler"
39 |     key: "async_scheduler_PerformanceTest"
40 |     depends_on: "record_async_scheduler_CorrectnessTest"
41 |     soft_fail: true
42 |     agents:
43 |       queue: tpu_v6e_queue
44 |     commands:
45 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/e2e/test_async_scheduler.py::test_performance
46 |   - label: "Record performance test result for async scheduler"
47 |     key: "record_async_scheduler_PerformanceTest"
48 |     depends_on: "async_scheduler_PerformanceTest"
49 |     env:
50 |       CI_TARGET: "async scheduler"
51 |       CI_STAGE: "PerformanceTest"
52 |       CI_CATEGORY: "feature support matrix"
53 |     agents:
54 |       queue: cpu
55 |     commands:
56 |       - |
57 |         .buildkite/scripts/record_step_result.sh async_scheduler_PerformanceTest
58 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/500-feature-request.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: 🚀 Feature request
16 | description: Submit a proposal/request for a new TPU Inference feature
17 | title: "[Feature]: "
18 | labels: ["feature request"]
19 | 
20 | body:
21 | - type: markdown
22 |   attributes:
23 |     value: >
24 |       #### Before submitting an issue, please make sure it hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/tpu-inference/issues?q=is%3Aissue+sort%3Acreated-desc+).
25 | - type: textarea
26 |   attributes:
27 |     label: 🚀 The feature, motivation and pitch
28 |     description: >
29 |       A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
30 |   validations:
31 |     required: true
32 | - type: textarea
33 |   attributes:
34 |     label: Alternatives
35 |     description: >
36 |       A description of any alternative solutions or features you've considered, if any.
37 | - type: textarea
38 |   attributes:
39 |     label: Additional context
40 |     description: >
41 |       Add any other context or screenshots about the feature request.
42 | - type: markdown
43 |   attributes:
44 |     value: >
45 |       Thanks for contributing 🎉!
46 | - type: checkboxes
47 |   id: askllm
48 |   attributes:
49 |     label: Before submitting a new issue...
50 |     options:
51 |       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs), which can answer lots of frequently asked questions.
52 |         required: true
53 | 


--------------------------------------------------------------------------------
/.buildkite/features/LoRA_Torch.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: LoRA_Torch
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for LoRA_Torch"
19 |     key: "LoRA_Torch_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         .buildkite/scripts/run_in_docker.sh \
26 |           bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora.py'
27 |   - label: "Record correctness test result for LoRA_Torch"
28 |     key: "record_LoRA_Torch_CorrectnessTest"
29 |     depends_on: "LoRA_Torch_CorrectnessTest"
30 |     env:
31 |       CI_TARGET: "LoRA_Torch"
32 |       CI_STAGE: "CorrectnessTest"
33 |       CI_CATEGORY: "feature support matrix"
34 |     agents:
35 |       queue: cpu
36 |     commands:
37 |       - |
38 |         .buildkite/scripts/record_step_result.sh LoRA_Torch_CorrectnessTest
39 | 
40 |   - label: "Performance tests for LoRA_Torch"
41 |     key: "LoRA_Torch_PerformanceTest"
42 |     depends_on: "record_LoRA_Torch_CorrectnessTest"
43 |     soft_fail: true
44 |     agents:
45 |       queue: tpu_v6e_queue
46 |     commands:
47 |       - |
48 |         .buildkite/scripts/run_in_docker.sh \
49 |           bash -c 'MODEL_IMPL_TYPE=vllm TPU_BACKEND_TYPE=jax python3 -m pytest -s -v -x /workspace/tpu_inference/tests/lora/test_lora_perf.py'
50 |   - label: "Record performance test result for LoRA_Torch"
51 |     key: "record_LoRA_Torch_PerformanceTest"
52 |     depends_on: "LoRA_Torch_PerformanceTest"
53 |     env:
54 |       CI_TARGET: "LoRA_Torch"
55 |       CI_STAGE: "PerformanceTest"
56 |       CI_CATEGORY: "feature support matrix"
57 |     agents:
58 |       queue: cpu
59 |     commands:
60 |       - |
61 |         .buildkite/scripts/record_step_result.sh LoRA_Torch_PerformanceTest
62 | 


--------------------------------------------------------------------------------
/.buildkite/pipeline_generation/vllm_native_model_template.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: {MODEL_NAME}
16 | # pipeline-type: {CATEGORY}
17 | steps:
18 |   - label: "Unit tests for {MODEL_NAME}"
19 |     key: "{SANITIZED_MODEL_NAME}_UnitTest"
20 |     agents:
21 |       queue: {QUEUE}
22 |     soft_fail: true
23 |     commands:
24 |       - echo "placeholder"  # TODO: replace with your unit test command
25 |   - label: "Record unit test result for {MODEL_NAME}"
26 |     key: "record_{SANITIZED_MODEL_NAME}_UnitTest"
27 |     depends_on: "{SANITIZED_MODEL_NAME}_UnitTest"
28 |     env:
29 |       CI_TARGET: {MODEL_NAME}
30 |       CI_STAGE: "UnitTest"
31 |       CI_CATEGORY: "{CATEGORY}"
32 |     agents:
33 |       queue: cpu
34 |     commands:
35 |       - |
36 |         .buildkite/scripts/record_step_result.sh {SANITIZED_MODEL_NAME}_UnitTest
37 | 
38 |   - label: "Integration tests for {MODEL_NAME}"
39 |     key: "{SANITIZED_MODEL_NAME}_IntegrationTest"
40 |     depends_on: "record_{SANITIZED_MODEL_NAME}_UnitTest"
41 |     agents:
42 |       queue: {QUEUE}
43 |     soft_fail: true
44 |     env:
45 |       TEST_MODEL: {MODEL_NAME}
46 |       TENSOR_PARALLEL_SIZE: {TENSOR_PARALLEL_SIZE}
47 |       MINIMUM_ACCURACY_THRESHOLD: 0  # TODO : replace 0 with your accuracy threshold
48 |     commands:
49 |       - |
50 |         .buildkite/scripts/run_in_docker.sh bash /workspace/tpu_inference/tests/e2e/benchmarking/test_accuracy.sh
51 |   - label: "Record integration test result for {MODEL_NAME}"
52 |     key: "record_{SANITIZED_MODEL_NAME}_IntegrationTest"
53 |     depends_on: "{SANITIZED_MODEL_NAME}_IntegrationTest"
54 |     env:
55 |       CI_TARGET: {MODEL_NAME}
56 |       CI_STAGE: "IntegrationTest"
57 |       CI_CATEGORY: "{CATEGORY}"
58 |     agents:
59 |       queue: cpu
60 |     commands:
61 |       - |
62 |         .buildkite/scripts/record_step_result.sh {SANITIZED_MODEL_NAME}_IntegrationTest
63 | 


--------------------------------------------------------------------------------
/.buildkite/features/KV_Cache_Host_Offloading.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: KV cache host offloading
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for KV cache host offloading"
19 |     key: "KV_Cache_Host_Offloading_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "KV_Cache_Host_Offloading_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for KV cache host offloading"
27 |     key: "record_KV_Cache_Host_Offloading_CorrectnessTest"
28 |     depends_on: "KV_Cache_Host_Offloading_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "KV cache host offloading"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "feature support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_CorrectnessTest
38 | 
39 |   - label: "Performance tests for KV cache host offloading"
40 |     key: "KV_Cache_Host_Offloading_PerformanceTest"
41 |     depends_on: "record_KV_Cache_Host_Offloading_CorrectnessTest"
42 |     soft_fail: true
43 |     agents:
44 |       queue: tpu_v6e_queue
45 |     commands:
46 |       - |
47 |         buildkite-agent meta-data set "KV_Cache_Host_Offloading_PerformanceTest" "unverified"
48 |   - label: "Record performance test result for KV cache host offloading"
49 |     key: "record_KV_Cache_Host_Offloading_PerformanceTest"
50 |     depends_on: "KV_Cache_Host_Offloading_PerformanceTest"
51 |     env:
52 |       CI_TARGET: "KV cache host offloading"
53 |       CI_STAGE: "PerformanceTest"
54 |       CI_CATEGORY: "feature support matrix"
55 |     agents:
56 |       queue: cpu
57 |     commands:
58 |       - |
59 |         .buildkite/scripts/record_step_result.sh KV_Cache_Host_Offloading_PerformanceTest
60 | 


--------------------------------------------------------------------------------
/docs/getting_started/quickstart.md:
--------------------------------------------------------------------------------
 1 | # Get started with vLLM TPU
 2 | 
 3 | Google Cloud TPUs (Tensor Processing Units) accelerate machine learning workloads. vLLM supports TPU v6e and v5e. For architecture, supported topologies, and more, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture) and specific TPU version pages ([v5e](https://cloud.google.com/tpu/docs/v5e) and [v6e](https://cloud.google.com/tpu/docs/v6e)).
 4 | 
 5 | ---
 6 | 
 7 | ## Requirements
 8 | 
 9 | * **Google Cloud TPU VM:** Access to a TPU VM. For setup instructions, see the [Cloud TPU Setup guide](tpu_setup.md).
10 | * **TPU versions:** v6e, v5e
11 | * **Python:** 3.11 or newer (3.12 used in examples).
12 | 
13 | ---
14 | 
15 | ## Installation
16 | 
17 | For detailed steps on installing `vllm-tpu` with `pip` or running it as a Docker image, please see the [**Installation Guide**](installation.md).
18 | 
19 | ## Run the vLLM Server
20 | 
21 | After installing `vllm-tpu`, you can start the API server.
22 | 
23 | 1. **Log in to Hugging Face:**
24 |    You'll need a Hugging Face token to download models.
25 | 
26 |    ```shell
27 |    export TOKEN=YOUR_TOKEN
28 |    git config --global credential.helper store
29 |    huggingface-cli login --token $TOKEN
30 |    ```
31 | 
32 | 2. **Launch the Server:**
33 |    The following command starts the server with the Llama-3.1-8B model.
34 | 
35 |    ```shell
36 |    vllm serve "meta-llama/Llama-3.1-8B" \
37 |        --download_dir /tmp \
38 |        --disable-log-requests \
39 |        --tensor_parallel_size=1 \
40 |        --max-model-len=2048
41 |    ```
42 | 
43 | 3. **Send a Request:**
44 | 
45 |    Once the server is running, you can send it a request using `curl`:
46 | 
47 |    ```shell
48 |    curl http://localhost:8000/v1/completions \
49 |        -H "Content-Type: application/json" \
50 |        -d '{
51 |            "model": "meta-llama/Llama-3.1-8B",
52 |            "prompt": "Hello, my name is",
53 |            "max_tokens": 20,
54 |            "temperature": 0.7
55 |        }'
56 |    ```
57 | 
58 | ## Next steps
59 | 
60 | Check out complete, end-to-end example recipes in the [tpu-recipes repository](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM).
61 | 
62 | ## For further reading
63 | 
64 | * [Examples](https://github.com/vllm-project/tpu-inference/tree/main/examples)
65 | * [Recipes](https://github.com/AI-Hypercomputer/tpu-recipes/tree/main/inference/trillium/vLLM)
66 | * [GKE serving with vLLM TPU](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-vllm-tpu)
67 | 


--------------------------------------------------------------------------------
/tests/core/test_disagg_executor.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | import unittest
 3 | from unittest.mock import MagicMock, patch
 4 | 
 5 | from vllm.config import ModelConfig, VllmConfig
 6 | 
 7 | from tpu_inference.core.disagg_executor import DisaggExecutor
 8 | 
 9 | 
10 | class DisaggExecutorTest(unittest.TestCase):
11 | 
12 |     def setUp(self):
13 |         """Build a DisaggExecutor from a MagicMock VllmConfig with collective_rpc patched out."""
14 |         # Mocked VllmConfig; only model_config is a real ModelConfig instance
15 |         self.mock_vllm_config = MagicMock(spec=VllmConfig)
16 |         self.mock_vllm_config.model_config = ModelConfig(
17 |             tokenizer_mode="auto",
18 |             trust_remote_code=False,
19 |             seed=0,
20 |             dtype='bfloat16')
21 |         self.mock_vllm_config.cache_config = MagicMock()
22 |         self.mock_vllm_config.scheduler_config = MagicMock()
23 |         self.mock_vllm_config.load_config = MagicMock()
24 |         self.mock_vllm_config.lora_config = None
25 |         self.mock_vllm_config.parallel_config = MagicMock()
26 |         self.mock_vllm_config.device_config = MagicMock()
27 |         self.mock_vllm_config.speculative_config = None
28 |         self.mock_vllm_config.prompt_adapter_config = None
29 |         self.mock_vllm_config.observability_config = MagicMock()
30 | 
31 |         # Patch collective_rpc before constructing the executor so no real RPC is made
32 |         self.patcher = patch(
33 |             "tpu_inference.core.disagg_executor.DisaggExecutor.collective_rpc")
34 |         self.mock_collective_rpc = self.patcher.start()
35 |         self.addCleanup(self.patcher.stop)
36 | 
37 |         # Create a DisaggExecutor instance with the mock config
38 |         self.executor = DisaggExecutor(vllm_config=self.mock_vllm_config)
39 | 
40 |     def test_init_with_devices(self):
41 |         """_init_executor should issue init_worker, init_device and load_model RPCs in order."""
42 |         self.executor._init_executor()
43 | 
44 |         # Check that collective_rpc was called with the expected arguments
45 |         self.mock_collective_rpc.assert_called()
46 |         calls = self.mock_collective_rpc.call_args_list
47 | 
48 |         # First positional argument of the first three recorded RPCs, in call order
49 |         self.assertEqual(calls[0][0][0], "init_worker")
50 |         self.assertEqual(calls[1][0][0], "init_device")
51 |         self.assertEqual(calls[2][0][0], "load_model")
52 | 
53 |     def test_check_health(self):
54 |         """check_health should return without raising."""
55 |         # No assertion: the test passes as long as check_health() does not raise
56 |         self.executor.check_health()
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     unittest.main()
61 | 


--------------------------------------------------------------------------------
/tests/core/test_init.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import importlib
16 | import unittest
17 | from unittest.mock import patch
18 | 
19 | 
20 | class TestPathwaysInit(unittest.TestCase):
21 | 
22 |     @patch.dict("os.environ", {"JAX_PLATFORMS": "proxy,cpu"})
23 |     def test_VLLM_TPU_USING_PATHWAYS_enabled(self):
24 |         """Test when JAX_PLATFORMS contains 'proxy'."""
25 | 
26 |         # Import vllm.envs to test the VLLM_TPU_USING_PATHWAYS logic
27 |         import vllm.envs as envs
28 | 
29 |         # Reload while the patched JAX_PLATFORMS value is in effect
30 |         importlib.reload(envs)
31 | 
32 |         # Check that VLLM_TPU_USING_PATHWAYS is True when JAX_PLATFORMS contains "proxy"
33 |         self.assertTrue(envs.VLLM_TPU_USING_PATHWAYS)
34 | 
35 |     @patch.dict("os.environ", {"JAX_PLATFORMS": "cpu"})
36 |     def test_VLLM_TPU_USING_PATHWAYS_not_enabled(self):
37 |         """Test when JAX_PLATFORMS does not contain 'proxy'."""
38 | 
39 |         # Import vllm.envs to test the VLLM_TPU_USING_PATHWAYS logic
40 |         import vllm.envs as envs
41 | 
42 |         # Reload while the patched JAX_PLATFORMS value is in effect
43 |         importlib.reload(envs)
44 | 
45 |         # Check that VLLM_TPU_USING_PATHWAYS is False when JAX_PLATFORMS doesn't contain "proxy"
46 |         self.assertFalse(envs.VLLM_TPU_USING_PATHWAYS)
47 | 
48 |     @patch.dict("os.environ", {"JAX_PLATFORMS": "PROXY,CPU"})
49 |     def test_VLLM_TPU_USING_PATHWAYS_case_insensitive(self):
50 |         """Test that JAX_PLATFORMS check is case insensitive."""
51 | 
52 |         # Import vllm.envs to test the VLLM_TPU_USING_PATHWAYS logic
53 |         import vllm.envs as envs
54 | 
55 |         # Reload while the patched JAX_PLATFORMS value is in effect
56 |         importlib.reload(envs)
57 | 
58 |         # Check that VLLM_TPU_USING_PATHWAYS is True even with uppercase "PROXY"
59 |         self.assertTrue(envs.VLLM_TPU_USING_PATHWAYS)
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     unittest.main()
64 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/200-installation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: 🛠️ Installation
16 | description: Report an issue here when you hit errors during installation.
17 | title: "[Installation]: "
18 | labels: ["installation"]
19 | 
20 | body:
21 | - type: markdown
22 |   attributes:
23 |     value: >
24 |       #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/tpu-inference/issues?q=is%3Aissue+sort%3Acreated-desc+).
25 | - type: textarea
26 |   attributes:
27 |     label: Your current environment
28 |     description: |
29 |       Please run the following and paste the output below.
30 |       ```sh
31 |       wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
32 |       # For security purposes, please feel free to check the contents of collect_env.py before running it.
33 |       python collect_env.py
34 |       python -c "import jax; jax.print_environment_info()"
35 |       ```
36 |       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
37 |     value: |
38 |       ```text
39 |       Outputs of the commands above
40 |       ```
41 |   validations:
42 |     required: true
43 | - type: textarea
44 |   attributes:
45 |     label: How are you installing TPU inference?
46 |     description: |
47 |       Paste the full command you are trying to execute.
48 |     value: |
49 |       ```sh
50 |       pip install -e .
51 |       ```
52 | - type: markdown
53 |   attributes:
54 |     value: >
55 |       Thanks for contributing 🎉!
56 | - type: checkboxes
57 |   id: askllm
58 |   attributes:
59 |     label: Before submitting a new issue...
60 |     options:
61 |       - label: Make sure you already searched for relevant issues and checked the [documentation page](https://github.com/vllm-project/tpu-inference/tree/main/docs).
62 |         required: true
63 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: Publish Package to PyPI
16 | 
17 | on:
18 |   push:
19 |     tags:
20 |       - 'v[0-9]+.[0-9]+.[0-9]+*'
21 |   schedule:
22 |       - cron: '0 8 * * *'
23 | 
24 | jobs:
25 |   pypi_publish:
26 |     name: Build and Publish
27 |     runs-on: ubuntu-latest
28 | 
29 |     permissions:
30 |       id-token: write
31 |       contents: read
32 | 
33 |     steps:
34 |     - name: Checkout Code
35 |       uses: actions/checkout@v4
36 |       with:
37 |         fetch-depth: 0
38 | 
39 |     - name: Check if tag is on main branch
40 |       if: github.event_name == 'push'
41 |       env:  # the tag name reaches the script as data, never as spliced-in script text
42 |         TAG_NAME: ${{ github.ref_name }}
43 |       run: |
44 |         echo "Checking if tag ${TAG_NAME} is on main branch..."
45 |         if ! git branch -r --contains "${TAG_NAME}" | grep -q "origin/main"; then
46 |           printf '%s\n' "ERROR Tag ${TAG_NAME} is not on origin/main." "This release will be aborted to prevent publishing from a non-main branch."
47 |           exit 1
48 |         fi
49 |         echo "Tag is on origin/main. Proceeding with release."
50 | 
51 |     - name: Set up Python
52 |       uses: actions/setup-python@v5
53 |       with:
54 |         python-version: '3.x'
55 | 
56 |     - name: Determine Version
57 |       id: vars
58 |       env:
59 |           GH_EVENT_NAME: ${{ github.event_name }}
60 |           GH_REF_NAME: ${{ github.ref_name }}
61 |           GH_REF: ${{ github.ref }}
62 |       run: bash .github/scripts/determine_release_vars.sh
63 | 
64 |     - name: Install dependencies (build)
65 |       run:
66 |         python3 -m pip install build
67 | 
68 |     - name: Build package (sdist and wheel)
69 |       env:
70 |           VLLM_VERSION_OVERRIDE: ${{ steps.vars.outputs.VERSION }}
71 |       run: python3 -m build
72 | 
73 |     - name: Publish package distributions to PyPI
74 |       uses: pypa/gh-action-pypi-publish@release/v1
75 | 
76 |     - name: Publish completed message
77 |       run: echo "---Build and publish completed successfully.---"
78 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/commit_verified_commit_hashes.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -e
17 | 
18 | # --- Configuration ---
19 | REPO_URL="https://github.com/vllm-project/tpu-inference.git"
20 | TARGET_BRANCH="main"
21 | 
22 | COMMIT_MESSAGE="Update verified commit hashes"
23 | 
24 | # Refuse to continue without the GITHUB_PAT secret used to authenticate the push.
25 | if [ -z "${GITHUB_PAT:-}" ]; then
26 |   echo "--- ERROR: GITHUB_PAT secret not found. Cannot proceed."
27 |   exit 1
28 | fi
29 | 
30 | # Push URL: REPO_URL with the token embedded as x-access-token credentials.
31 | AUTHENTICATED_REPO_URL="https://x-access-token:${GITHUB_PAT}@${REPO_URL#https://}"
32 | 
33 | echo "--- Configuring Git user details"
34 | git config user.name "Buildkite Bot"
35 | git config user.email "buildkite-bot@users.noreply.github.com"
36 | 
37 | echo "--- Fetching and checking out the target branch"
38 | git fetch origin "${TARGET_BRANCH}"
39 | git checkout "${TARGET_BRANCH}"
40 | git reset --hard origin/"${TARGET_BRANCH}"
41 | 
42 | VLLM_COMMIT_HASH=$(buildkite-agent meta-data get "VLLM_COMMIT_HASH" --default "")
43 | 
44 | if [ -z "${VLLM_COMMIT_HASH}" ]; then
45 |     echo "VLLM_COMMIT_HASH not found in buildkite meta-data"
46 |     exit 1
47 | fi
48 | 
49 | if [ -z "${BUILDKITE_COMMIT:-}" ]; then
50 |     echo "BUILDKITE_COMMIT not found"
51 |     exit 1
52 | fi
53 | 
54 | # Write the CSV header only when the file does not exist yet, then append this run's row.
55 | [ -f verified_commit_hashes.csv ] ||
56 |     echo "timestamp,vllm_commit_hash,tpu_inference_commit_hash" > verified_commit_hashes.csv
57 | echo "$(date '+%Y-%m-%d %H:%M:%S'),${VLLM_COMMIT_HASH},${BUILDKITE_COMMIT}" >> verified_commit_hashes.csv
58 | 
59 | git add verified_commit_hashes.csv
60 | 
61 | # --- Nothing staged means nothing to publish ---
62 | if git diff --quiet --cached; then
63 |   echo "No changes to commit. Exiting successfully."
64 |   exit 0
65 | fi
66 | 
67 | echo "--- Committing changes"
68 | git commit -s -m "${COMMIT_MESSAGE}"
69 | 
70 | echo "--- Pushing changes to '${TARGET_BRANCH}'"
71 | git push "${AUTHENTICATED_REPO_URL}" "HEAD:${TARGET_BRANCH}"
72 | 


--------------------------------------------------------------------------------
/.buildkite/features/DCN-Based_P-D_disaggregation.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: DCN-based P/D disaggregation
16 | # pipeline-type: feature support matrix
17 | steps:
18 |   - label: "Correctness tests for DCN-based P/D disaggregation"
19 |     key: "DCN_based_P-D_disaggregation_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - |
25 |         buildkite-agent meta-data set "DCN_based_P-D_disaggregation_CorrectnessTest" "unverified"
26 |   - label: "Record correctness test result for DCN-based P/D disaggregation"
27 |     key: "record_DCN_based_P-D_disaggregation_CorrectnessTest"
28 |     depends_on: "DCN_based_P-D_disaggregation_CorrectnessTest"
29 |     env:
30 |       CI_TARGET: "DCN-based P/D disaggregation"
31 |       CI_STAGE: "CorrectnessTest"
32 |       CI_CATEGORY: "feature support matrix"
33 |     agents:
34 |       queue: cpu
35 |     commands:
36 |       - |
37 |         .buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_CorrectnessTest
38 | 
39 |   - label: "Performance tests for DCN-based P/D disaggregation"
40 |     key: "DCN_based_P-D_disaggregation_PerformanceTest"
41 |     soft_fail: true
42 |     env:  # each entry needs a space after the colon to be parsed as a key/value pair
43 |       MODEL: "Qwen/Qwen3-0.6B"
44 |       INPUT_LEN: 1024
45 |       OUTPUT_LEN: 1024
46 |       NUM_PROMPTS: 20
47 |       RANDOM_SEED: 10
48 |       MAX_CONCURRENCY: 1
49 |     agents:
50 |       queue: tpu_v6e_8_queue
51 |     commands:
52 |       - |
53 |         .buildkite/scripts/run_disagg.sh
54 | 
55 |   - label: "Record performance test result for DCN-based P/D disaggregation"
56 |     key: "record_DCN_based_P-D_disaggregation_PerformanceTest"
57 |     depends_on: "DCN_based_P-D_disaggregation_PerformanceTest"
58 |     env:
59 |       CI_TARGET: "DCN-based P/D disaggregation"
60 |       CI_STAGE: "PerformanceTest"
61 |       CI_CATEGORY: "feature support matrix"
62 |     agents:
63 |       queue: cpu
64 |     commands:
65 |       - |
66 |         .buildkite/scripts/record_step_result.sh DCN_based_P-D_disaggregation_PerformanceTest
67 | 


--------------------------------------------------------------------------------
/tests/lora/test_lora_perf.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import os
16 | import time
17 | 
18 | import pytest
19 | import vllm
20 | from vllm.lora.request import LoRARequest
21 | 
22 | TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]
23 | 
24 | 
25 | @pytest.mark.parametrize("tp", TP)
26 | def test_lora_performance(tp):
27 |     """Time one greedy generation with and without a LoRA adapter and compare.
28 | 
29 |     Loads Qwen/Qwen2.5-3B-Instruct twice (plain, then with enable_lora=True),
30 |     times a single 16-token generate() call on each, and asserts on the ratio
31 |     of the two wall-clock times.
32 | 
33 |     Args:
34 |         tp: tensor_parallel_size passed to both vllm.LLM instances.
35 |     """
36 |     prompt = "What is 1+1? \n"
37 |     llm_without_lora = vllm.LLM(
38 |         model="Qwen/Qwen2.5-3B-Instruct",
39 |         max_model_len=256,
40 |         max_num_batched_tokens=64,
41 |         max_num_seqs=8,
42 |         tensor_parallel_size=tp,
43 |     )
44 |     # Only the generate() call is timed; the generated text itself is not used.
45 |     start_time = time.time()
46 |     llm_without_lora.generate(
47 |         prompt,
48 |         sampling_params=vllm.SamplingParams(max_tokens=16, temperature=0),
49 |     )[0].outputs[0].text
50 |     base_time = time.time() - start_time
51 | 
52 |     del llm_without_lora
53 |     # Waiting for TPUs to be released
54 |     time.sleep(10)
55 | 
56 |     llm_with_lora = vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct",
57 |                              max_model_len=256,
58 |                              max_num_batched_tokens=64,
59 |                              max_num_seqs=8,
60 |                              tensor_parallel_size=tp,
61 |                              enable_lora=True,
62 |                              max_loras=1,
63 |                              max_lora_rank=8)
64 |     lora_request = LoRARequest(
65 |         "lora_adapter_2", 2,
66 |         "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_2_adapter")
67 |     start_time = time.time()
68 |     llm_with_lora.generate(prompt,
69 |                            sampling_params=vllm.SamplingParams(max_tokens=16,
70 |                                                                temperature=0),
71 |                            lora_request=lora_request)[0].outputs[0].text
72 |     lora_time = time.time() - start_time
73 |     print(f"Base time: {base_time}, LoRA time: {lora_time}")
74 |     # NOTE(review): this bounds base_time / lora_time, so it only fails when the
75 |     # LoRA run is at least 8x *faster* than the base run. If the intent is to cap
76 |     # LoRA slowdown, the ratio should presumably be lora_time / base_time - confirm.
77 |     assert (base_time /
78 |             lora_time) < 8, f"Base time: {base_time}, LoRA time: {lora_time}"
79 | 
80 |     del llm_with_lora
81 | 


--------------------------------------------------------------------------------
/.buildkite/features/Collective_Communication_Matmul.yml:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # pipeline-name: Collective Communication Matmul
16 | # pipeline-type: kernel support matrix
17 | steps:
18 |   - label: "Correctness tests for Collective Communication Matmul"
19 |     key: "Collective_Communication_Matmul_CorrectnessTest"
20 |     soft_fail: true
21 |     agents:
22 |       queue: tpu_v6e_queue
23 |     commands:
24 |       - .buildkite/scripts/run_in_docker.sh python3 -m pytest -s -v /workspace/tpu_inference/tests/kernels/collectives/all_gather_matmul_kernel_test.py
25 |   - label: "Record correctness test result for Collective Communication Matmul"
26 |     key: "record_Collective_Communication_Matmul_CorrectnessTest"
27 |     depends_on: "Collective_Communication_Matmul_CorrectnessTest"
28 |     env:
29 |       CI_TARGET: "Collective Communication Matmul"
30 |       CI_STAGE: "CorrectnessTest"
31 |       CI_CATEGORY: "kernel support matrix"
32 |     agents:
33 |       queue: cpu
34 |     commands:
35 |       - |
36 |         .buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_CorrectnessTest
37 | 
38 |   - label: "Performance tests for Collective Communication Matmul"
39 |     key: "Collective_Communication_Matmul_PerformanceTest"
40 |     depends_on: "record_Collective_Communication_Matmul_CorrectnessTest"
41 |     soft_fail: true
42 |     agents:
43 |       queue: tpu_v6e_queue
44 |     commands:
45 |       - |
46 |         buildkite-agent meta-data set "Collective_Communication_Matmul_PerformanceTest" "unverified"
47 |   - label: "Record performance test result for Collective Communication Matmul"
48 |     key: "record_Collective_Communication_Matmul_PerformanceTest"
49 |     depends_on: "Collective_Communication_Matmul_PerformanceTest"
50 |     env:
51 |       CI_TARGET: "Collective Communication Matmul"
52 |       CI_STAGE: "PerformanceTest"
53 |       CI_CATEGORY: "kernel support matrix"
54 |     agents:
55 |       queue: cpu
56 |     commands:
57 |       - |
58 |         .buildkite/scripts/record_step_result.sh Collective_Communication_Matmul_PerformanceTest
59 | 


--------------------------------------------------------------------------------
/.buildkite/scripts/commit_support_matrices.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Copyright 2025 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -e
17 | 
18 | # --- Configuration ---
19 | REPO_URL="https://github.com/vllm-project/tpu-inference.git"
20 | TARGET_BRANCH="main"
21 | 
22 | # Conditional Configuration for Release vs. Nightly
23 | if [ "${NIGHTLY}" = "1" ]; then
24 |   # Set path and commit message for nightly builds.
25 |   ARTIFACT_DOWNLOAD_PATH="support_matrices/nightly"
26 |   COMMIT_MESSAGE="[skip ci] Update nightly support matrices"
27 | else
28 |   # Set path and commit message for release tag builds.
29 |   COMMIT_TAG="${BUILDKITE_TAG:-unknown-tag}"
30 |   ARTIFACT_DOWNLOAD_PATH="support_matrices"
31 |   COMMIT_MESSAGE="[skip ci] Update support matrices for ${COMMIT_TAG}"
32 | fi
33 | # Construct the repository URL with the access token for authentication.
34 | AUTHENTICATED_REPO_URL="https://x-access-token:${GITHUB_PAT}@${REPO_URL#https://}"
35 | 
36 | # Ensure the GITHUB_PAT is available before proceeding.
37 | if [ -z "${GITHUB_PAT:-}" ]; then
38 |   echo "--- ERROR: GITHUB_PAT secret not found. Cannot proceed."
39 |   exit 1
40 | fi
41 | 
42 | echo "--- Configuring Git user details"
43 | git config user.name "Buildkite Bot"
44 | git config user.email "buildkite-bot@users.noreply.github.com"
45 | 
46 | echo "--- Fetching and checking out the target branch"
47 | git fetch origin "${TARGET_BRANCH}"
48 | git checkout "${TARGET_BRANCH}"
49 | git reset --hard origin/"${TARGET_BRANCH}"
50 | 
51 | echo "--- Downloading CSV artifacts"
52 | mkdir -p "${ARTIFACT_DOWNLOAD_PATH}"
53 | buildkite-agent artifact download "*.csv" "${ARTIFACT_DOWNLOAD_PATH}/" --flat
54 | 
55 | echo "--- Staging downloaded artifacts"
56 | git add "${ARTIFACT_DOWNLOAD_PATH}"/*.csv
57 | 
58 | # --- Check for changes before committing ---
59 | if git diff --quiet --cached; then
60 |   echo "No changes to commit. Exiting successfully."
61 |   exit 0
62 | else
63 |   echo "--- Committing changes"
64 |   git commit -s -m "${COMMIT_MESSAGE}"
65 | 
66 |   echo "--- Pushing changes to '${TARGET_BRANCH}'"
67 |   git push "${AUTHENTICATED_REPO_URL}" "HEAD:${TARGET_BRANCH}"
68 | fi
69 | 


--------------------------------------------------------------------------------
/tests/e2e/benchmarking/bench_utils.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | 
 4 | # -----------------------------------------------------------------------------
 5 | # BENCHMARK UTILITY FUNCTIONS
 6 | # This file is sourced by various performance scripts (e.g., mlperf.sh,
 7 | # llama_guard_perf_recipe.sh) to share common functions.
 8 | # -----------------------------------------------------------------------------
 9 | 
10 | # waitForServerReady: Polls LOG_FILE every 5 seconds until READY_MESSAGE appears
11 | # (returns 0), a fatal error pattern appears, or TIMEOUT_SECONDS elapse (exit 1).
12 | waitForServerReady() {
13 |     # Globals read: READY_MESSAGE, LOG_FILE, TIMEOUT_SECONDS. Working variables stay local.
14 |     local start_time current_time elapsed_time error_regex
15 |     start_time=$(date +%s)
16 |     echo "Waiting for server ready message: '$READY_MESSAGE'"
17 |     local fatal_error_patterns=(
18 |         "RuntimeError:"
19 |         "ValueError:"
20 |         "FileNotFoundError:"
21 |         "TypeError:"
22 |         "ImportError:"
23 |         "NotImplementedError:"
24 |         "AssertionError:"
25 |         "TimeoutError:"
26 |         "OSError:"
27 |         "AttributeError:"
28 |         "NVMLError:"
29 |     )
30 | 
31 |     # Join the patterns with '|' so one extended-regex grep covers all of them.
32 |     error_regex=$(IFS=\|; echo "${fatal_error_patterns[*]}")
33 | 
34 |     while true; do
35 |         current_time=$(date +%s)
36 |         elapsed_time=$((current_time - start_time))
37 | 
38 |         sleep 5
39 | 
40 |         if [[ "$elapsed_time" -ge "$TIMEOUT_SECONDS" ]]; then
41 |             echo "TIMEOUT: Waited $elapsed_time seconds (limit was $TIMEOUT_SECONDS). The string '$READY_MESSAGE' was NOT found."
42 |             # Call cleanup and exit (cleanup must be handled by the calling script's trap)
43 |             exit 1
44 |         fi
45 | 
46 |         if grep -Eq "$error_regex" "$LOG_FILE"; then
47 |             echo "FATAL ERROR DETECTED: The server log contains a fatal error pattern."
48 |             # Call cleanup and exit (cleanup must be handled by the calling script's trap)
49 |             exit 1
50 |         fi
51 | 
52 |         if grep -Fq "$READY_MESSAGE" "$LOG_FILE" ; then
53 |             echo "Server is ready."
54 |             return 0
55 |         fi
56 |     done
57 | }
58 | 
59 | # cleanUp: Stops the vLLM server process and deletes log files.
60 | # Usage: cleanUp <MODEL_NAME>   (reads globals LOG_FILE and BENCHMARK_LOG_FILE)
61 | cleanUp() {
62 |     echo "Stopping the vLLM server and cleaning up log files..."
63 |     # $1 is the MODEL_NAME. pkill exits non-zero when nothing matches; that must not abort cleanup under set -e.
64 |     pkill -f "vllm serve $1" || true
65 |     # Kill all remaining vllm processes, skipping this shell ($$) in case its own command line mentions vllm.
66 |     pgrep -f -i vllm | grep -vx "$$" | xargs -r kill -9
67 | 
68 |     # Clean up log files. Use -f to avoid errors if files don't exist.
69 |     rm -f "$LOG_FILE"
70 |     rm -f "$BENCHMARK_LOG_FILE"
71 |     echo "Cleanup complete."
72 | }
73 | 


--------------------------------------------------------------------------------