├── src ├── frameworks │ ├── a3ultra │ │ ├── trtllm-configs │ │ │ └── llama-3.1-405b.yaml │ │ ├── vllm-configs │ │ │ ├── gpt-oss-120b.yaml │ │ │ ├── llama4-scout.yaml │ │ │ ├── deepseek-r1-671b.yaml │ │ │ └── llama4-maverick.yaml │ │ ├── dynamo-configs │ │ │ ├── llama-3.3-70b-multi-node.yaml │ │ │ └── llama-3.3-70b-single-node.yaml │ │ ├── sglang-configs │ │ │ ├── deepseek-r1-671b.yaml │ │ │ ├── llama4-maverick.yaml │ │ │ └── llama4-scout.yaml │ │ └── maxtext-configs │ │ │ ├── llama3-1-405b-256gpus-a3u-fp8.yaml │ │ │ ├── llama3-1-405b-512gpus-a3u-fp8.yaml │ │ │ ├── llama3-1-70b-512gpus-a3u-fp8-gbs2048.yaml │ │ │ ├── llama3-1-70b-256gpus-a3u-bf16.yaml │ │ │ ├── llama3-1-70b-512gpus-a3u-bf16.yaml │ │ │ ├── llama3-1-405b-1024gpus-a3u-fp8.yaml │ │ │ ├── llama3-1-405b-768gpus-a3u-fp8.yaml │ │ │ ├── llama3-1-70b-1024gpus-a3u-fp8-gbs2048.yaml │ │ │ ├── llama3-1-70b-256gpus-a3u-fp8.yaml │ │ │ └── llama3-1-70b-512gpus-a3u-fp8.yaml │ ├── a4 │ │ ├── sglang-configs │ │ │ └── deepseek-r1-671b.yaml │ │ ├── vllm-configs │ │ │ └── deepseek-r1-671b.yaml │ │ └── maxtext-configs │ │ │ ├── llama3-1-405b-256gpus-a4-bf16.yaml │ │ │ ├── llama3-1-70b-256gpus-a4-bf16.yaml │ │ │ ├── llama3-1-405b-1024gpus-a4-bf16.yaml │ │ │ ├── llama3-1-405b-1024gpus-a4-fp8.yaml │ │ │ ├── llama3-1-405b-256gpus-a4-fp8.yaml │ │ │ ├── llama3-1-70b-1024gpus-a4-bf16.yaml │ │ │ ├── llama3-1-70b-256gpus-a4-fp8.yaml │ │ │ └── llama3-1-70b-1024gpus-a4-fp8.yaml │ ├── a3mega │ │ ├── nemo-configs │ │ │ └── README.md │ │ └── maxtext-configs │ │ │ ├── llama-2-7b-128gpus-a3mega-bf16.yaml │ │ │ └── llama-2-7b-256gpus-a3mega-bf16.yaml │ └── a4x │ │ └── trtllm-configs │ │ └── deepseek-r1-nvfp4.yaml ├── docker │ ├── trtllm │ │ ├── requirements.in │ │ └── cloudbuild.yml │ ├── sglang │ │ ├── requirements.in │ │ └── cloudbuild.yml │ ├── nemo-24.05 │ │ ├── requirements.in │ │ ├── README.md │ │ ├── requirements.txt │ │ └── cloudbuild.yml │ ├── nemo-24.07 │ │ ├── requirements.in │ │ ├── README.md │ │ ├── requirements.txt │ │ └── 
cloudbuild.yml │ ├── nemo-aotc-24.07 │ │ ├── requirements.in │ │ ├── README.md │ │ ├── requirements.txt │ │ └── cloudbuild.yaml │ ├── README.md │ ├── maxtext │ │ ├── README.md │ │ └── cloudbuild.yml │ └── vllm │ │ ├── requirements.in │ │ ├── cloudbuild.yml │ │ └── vllm.Dockerfile ├── README.md ├── utils │ ├── resiliency_metrics │ │ ├── requirements.txt │ │ ├── __init__.py │ │ └── constant.py │ ├── README.md │ └── checkpointing_metrics │ │ ├── README.md │ │ └── log_patterns.py ├── helm-charts │ ├── resiliency │ │ └── supervisor-chart │ │ │ └── Chart.yaml │ ├── storage │ │ ├── README.md │ │ ├── gcs-fuse │ │ │ ├── Chart.yaml │ │ │ ├── values.yaml │ │ │ └── templates │ │ │ │ └── pvc.yaml │ │ └── parallelstore │ │ │ ├── Chart.yaml │ │ │ ├── values.yaml │ │ │ └── templates │ │ │ └── pvc.yaml │ ├── a4 │ │ ├── job │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-svc.yaml │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── jobset │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-svc.yaml │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── nemo-training │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── nemo-configmap.yaml │ │ │ │ └── nemo-launcher-svc.yaml │ │ ├── maxtext-training │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── maxtext-launcher-svc.yaml │ │ └── inference-templates │ │ │ └── deployment │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ ├── serving-config-configmap.yaml │ │ │ ├── serving-svc.yaml │ │ │ └── serving-launcher-configmap.yaml │ ├── a3mega │ │ ├── job │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-svc.yaml │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── jobset │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── nemo-training-v2 │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── 
nemo-configmap.yaml │ │ │ │ └── nemo-launcher-svc.yaml │ │ ├── nccl-tests │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── nccl-test-configmap.yaml │ │ │ │ └── nccl-tests-svc.yaml │ │ ├── sglang-inference │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── lws-deployment-svc.yaml │ │ └── vllm-inference │ │ │ ├── multi-host │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── lws-deployment-svc.yaml │ │ │ └── single-host │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ └── model-serve-svc.yaml │ ├── a3ultra │ │ ├── job │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-svc.yaml │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── jobset │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ ├── workload-config-configmap.yaml │ │ │ │ └── workload-launcher-configmap.yaml │ │ ├── inference-templates │ │ │ ├── dynamo-deployment │ │ │ │ ├── Chart.yaml │ │ │ │ └── templates │ │ │ │ │ ├── dynamo-config-configmap.yaml │ │ │ │ │ └── dynamo-launcher-configmap.yaml │ │ │ └── deployment │ │ │ │ ├── Chart.yaml │ │ │ │ └── templates │ │ │ │ ├── serving-config-configmap.yaml │ │ │ │ ├── serving-svc.yaml │ │ │ │ └── serving-launcher-configmap.yaml │ │ └── trtllm-inference │ │ │ └── single-node │ │ │ └── Chart.yaml │ └── a4x │ │ ├── inference-templates │ │ └── deployment │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ ├── serving-config-configmap.yaml │ │ │ ├── serving-svc.yaml │ │ │ └── serving-launcher-configmap.yaml │ │ └── inference-templates-gcs │ │ └── deployment │ │ ├── Chart.yaml │ │ └── templates │ │ ├── serving-config-configmap.yaml │ │ ├── serving-svc.yaml │ │ └── serving-launcher-configmap.yaml └── launchers │ ├── vllm-launcher.sh │ ├── dynamo-vllm-launcher.sh │ ├── dynamo-sglang-launcher.sh │ └── sglang-launcher.sh ├── RL └── a4 │ └── recipes │ ├── llama3.1-8b │ └── nemoRL │ │ ├── launcher.sh │ │ ├── Chart.yaml │ │ └── templates │ │ └── fluent-bit-config.yaml.j2 │ └── qwen2.5-1.5b │ └── nemoRL │ ├── launch-ray-cluster.sh 
│ ├── Chart.yaml │ └── templates │ └── fluent-bit-config.yaml.j2 ├── CONTRIBUTING └── training ├── a4x ├── llama3-1-405b │ └── nemo-pretraining-gke │ │ ├── 32node-BF16-GBS64 │ │ └── recipe │ │ │ ├── recipe_launch_command.sh │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ │ └── values.yaml │ │ └── 16node-FP8CS-GBS2048 │ │ └── recipe │ │ ├── Chart.yaml │ │ └── templates │ │ ├── workload-svc.yaml │ │ ├── workload-config-configmap.yaml │ │ └── workload-launcher-configmap.yaml ├── llama3-1-70b │ └── nemo-pretraining-gke │ │ ├── 16node-BF16-GBS2048 │ │ └── recipe │ │ │ ├── recipe_launch_command.sh │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ │ └── values.yaml │ │ ├── 32node-FP8CS-GBS2048 │ │ └── recipe │ │ │ ├── recipe_launch_command.sh │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ │ └── values.yaml │ │ ├── 64node-FP8CS-GBS2048 │ │ └── recipe │ │ │ ├── recipe_launch_command.sh │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ │ └── values.yaml │ │ └── 16node-FP8CS-GBS2048 │ │ └── recipe │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── workload-svc.yaml │ │ ├── workload-config-configmap.yaml │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml └── llama3-1-8b │ └── nemo-pretraining-gke │ ├── 16node-BF16-GBS1024 │ └── recipe │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── workload-svc.yaml │ │ ├── workload-config-configmap.yaml │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ └── 16node-FP8CS-GBS128 │ └── recipe │ ├── recipe_launch_command.sh │ ├── Chart.yaml │ 
├── templates │ ├── workload-svc.yaml │ ├── workload-config-configmap.yaml │ └── workload-launcher-configmap.yaml │ └── values.yaml ├── a4 ├── llama3-1-70b │ └── nemo-pretraining-gke │ │ ├── 2node-bf16-seq8192-gbs256 │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ │ ├── 8node-bf16-seq8192-gbs256 │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ │ ├── 2node-bf16-seq8192-gbs1024 │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ │ ├── 2node-bf16-seq8192-gbs2048 │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ │ ├── 32node-bf16-seq8192-gbs2048 │ │ ├── recipe_launch_command.sh │ │ ├── Chart.yaml │ │ ├── templates │ │ │ ├── workload-svc.yaml │ │ │ ├── workload-config-configmap.yaml │ │ │ └── workload-launcher-configmap.yaml │ │ └── values.yaml │ │ └── 16node-bf16-seq8192-gbs512-gcs │ │ ├── Chart.yaml │ │ └── templates │ │ ├── workload-svc.yaml │ │ ├── workload-config-configmap.yaml │ │ └── workload-launcher-configmap.yaml └── paligemma2 │ ├── Chart.yaml │ ├── templates │ ├── workload-svc.yaml │ ├── workload-config-configmap.yaml │ └── workload-launcher-configmap.yaml │ └── values.yaml ├── a3ultra ├── mixtral-8x7b │ └── nemo-pretraining-gke-resiliency │ │ ├── ksa-setup.yaml │ │ ├── values-gcs.yaml │ │ └── kueue-merge-patch.yaml └── llama3-1-405b │ └── nemo-pretraining-gke-resiliency │ ├── ksa-setup.yaml │ ├── values-gcs.yaml │ └── kueue-merge-patch.yaml └── a3mega 
└── llama3-1-70b └── nemo-pretraining-gke-resiliency ├── ksa-setup.yaml ├── values-gcs.yaml └── kueue-merge-patch.yaml /src/frameworks/a3ultra/trtllm-configs/llama-3.1-405b.yaml: -------------------------------------------------------------------------------- 1 | tp-size: 8 2 | pp-size: 1 -------------------------------------------------------------------------------- /src/docker/trtllm/requirements.in: -------------------------------------------------------------------------------- 1 | hf_transfer==0.1.9 2 | huggingface_hub[hf_xet]==0.31.4 -------------------------------------------------------------------------------- /src/docker/sglang/requirements.in: -------------------------------------------------------------------------------- 1 | hf_transfer==0.1.9 2 | huggingface_hub[hf_xet]>=0.34.0,<1.0 -------------------------------------------------------------------------------- /src/docker/nemo-24.05/requirements.in: -------------------------------------------------------------------------------- 1 | https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip -------------------------------------------------------------------------------- /src/docker/nemo-24.07/requirements.in: -------------------------------------------------------------------------------- 1 | https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip -------------------------------------------------------------------------------- /src/frameworks/a3ultra/vllm-configs/gpt-oss-120b.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 4 2 | async-scheduling: true -------------------------------------------------------------------------------- /src/docker/nemo-aotc-24.07/requirements.in: -------------------------------------------------------------------------------- 1 | https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip -------------------------------------------------------------------------------- 
/src/frameworks/a3ultra/dynamo-configs/llama-3.3-70b-multi-node.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 8 2 | enforce-eager: true -------------------------------------------------------------------------------- /src/frameworks/a3ultra/dynamo-configs/llama-3.3-70b-single-node.yaml: -------------------------------------------------------------------------------- 1 | gpu-memory-utilization: 0.95 2 | tensor-parallel-size: 4 -------------------------------------------------------------------------------- /src/frameworks/a3ultra/vllm-configs/llama4-scout.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 8 2 | max-model-len: 3600000 3 | trust-remote-code: true -------------------------------------------------------------------------------- /src/frameworks/a3ultra/vllm-configs/deepseek-r1-671b.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 8 2 | trust-remote-code: true 3 | disable-log-requests: true -------------------------------------------------------------------------------- /src/docker/nemo-24.05/README.md: -------------------------------------------------------------------------------- 1 | # Nemo 24.05 2 | 3 | This is the Dockerfile for building a container image for NVIDIA NeMo-based training workloads. -------------------------------------------------------------------------------- /src/docker/nemo-24.07/README.md: -------------------------------------------------------------------------------- 1 | # Nemo 24.07 2 | 3 | This is the Dockerfile for building a container image for NVIDIA NeMo-based training workloads. 
-------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # Reusable components 2 | This folder contains reusable components, such as Docker images and Helm charts, that are used by the different recipes. -------------------------------------------------------------------------------- /src/frameworks/a3ultra/vllm-configs/llama4-maverick.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 8 2 | max-model-len: 1000000 3 | trust-remote-code: true 4 | gpu_memory_utilization: 0.95 -------------------------------------------------------------------------------- /src/utils/resiliency_metrics/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-logging>=3.0.0 2 | google-auth>=2.0.0 3 | pandas>=1.3.0 4 | tabulate>=0.8.0 5 | python-dateutil>=2.8.0 6 | -------------------------------------------------------------------------------- /src/docker/README.md: -------------------------------------------------------------------------------- 1 | # Dockerfiles for container images 2 | This folder contains the files needed to build the Docker container images for each of the recipes used in this repository. 
-------------------------------------------------------------------------------- /src/frameworks/a3ultra/sglang-configs/deepseek-r1-671b.yaml: -------------------------------------------------------------------------------- 1 | tp: 8 2 | trust-remote-code: true 3 | enable-dp-attention: true 4 | attention-backend: fa3 5 | dp-size: 4 6 | port: 8000 -------------------------------------------------------------------------------- /src/frameworks/a4/sglang-configs/deepseek-r1-671b.yaml: -------------------------------------------------------------------------------- 1 | tp: 8 2 | trust-remote-code: true 3 | enable-dp-attention: true 4 | attention-backend: flashinfer 5 | dp-size: 4 6 | port: 8000 -------------------------------------------------------------------------------- /src/frameworks/a3mega/nemo-configs/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA NeMo configurations 2 | 3 | This folder contains NVIDIA NeMo framework configurations used by NVIDIA NeMo-based recipes. 
4 | 5 | 6 | -------------------------------------------------------------------------------- /src/frameworks/a3ultra/sglang-configs/llama4-maverick.yaml: -------------------------------------------------------------------------------- 1 | tp: 8 2 | trust-remote-code: true 3 | mem-fraction-static: 0.7 4 | attention-backend: flashinfer 5 | context-length: 1000000 6 | port: 8000 -------------------------------------------------------------------------------- /src/frameworks/a3ultra/sglang-configs/llama4-scout.yaml: -------------------------------------------------------------------------------- 1 | tp: 8 2 | trust-remote-code: true 3 | mem-fraction-static: 0.7 4 | attention-backend: flashinfer 5 | context-length: 3600000 6 | port: 8000 -------------------------------------------------------------------------------- /RL/a4/recipes/llama3.1-8b/nemoRL/launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REPLICA_COUNT=4 3 | 4 | helm install ray-cluster ../llama3.1-8b \ 5 | --set values.additionalWorkerGroups.worker-grp-0.replicas=$REPLICA_COUNT -------------------------------------------------------------------------------- /RL/a4/recipes/qwen2.5-1.5b/nemoRL/launch-ray-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REPLICA_COUNT=2 3 | 4 | helm install ray-cluster ../qwen2.5-1.5b \ 5 | --set additionalWorkerGroups.worker-grp-0.replicas=$REPLICA_COUNT -------------------------------------------------------------------------------- /src/helm-charts/resiliency/supervisor-chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: supervisor-chart 3 | description: A Helm chart for deploying the Supervisor 4 | type: application 5 | version: 0.1.0 6 | appVersion: "1.0" -------------------------------------------------------------------------------- /src/docker/maxtext/README.md: 
-------------------------------------------------------------------------------- 1 | # MaxText Benchmarks 2 | 3 | This is the Dockerfile for building a container image for MaxText/JAX training workloads. 4 | Using the following versions: 5 | - BASE_IMAGE: ghcr.io/nvidia/jax:maxtext-2025-01-10 6 | -------------------------------------------------------------------------------- /src/docker/nemo-aotc-24.07/README.md: -------------------------------------------------------------------------------- 1 | # Nemo 24.07 AotC Image 2 | 3 | This Dockerfile builds a container image designed for NVIDIA NeMo training workloads. It includes the AotC library, 4 | which contains Google-optimized implementations of NeMo-based workflows. -------------------------------------------------------------------------------- /src/frameworks/a4/vllm-configs/deepseek-r1-671b.yaml: -------------------------------------------------------------------------------- 1 | tensor-parallel-size: 8 2 | trust-remote-code: true 3 | max-num-batched-tokens: 32768 4 | max-model-len: 32768 5 | max-num-seqs: 1024 6 | gpu-memory-utilization: 0.95 7 | disable-log-requests: true 8 | enable-chunked-prefill: true -------------------------------------------------------------------------------- /src/docker/maxtext/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'docker' 3 | args: 4 | - 'build' 5 | - '--tag=${_ARTIFACT_REGISTRY}/maxtext-benchmark' 6 | - '--file=maxtext.Dockerfile' 7 | - '.' 
8 | automapSubstitutions: true 9 | 10 | images: 11 | - '${_ARTIFACT_REGISTRY}/maxtext-benchmark' -------------------------------------------------------------------------------- /src/helm-charts/storage/README.md: -------------------------------------------------------------------------------- 1 | # Storage Helm charts 2 | 3 | This folder contains Helm charts that automate the creation of Kubernetes Persistent Volumes (PVs) and Persistent Volume Claims (PVCs), which encapsulate best practices for configuring Kubernetes CSI drivers for Google Cloud Storage and Parallelstore for training data and training checkpoints. -------------------------------------------------------------------------------- /src/docker/nemo-aotc-24.07/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by pip-compile with Python 3.11 2 | # by the following command: 3 | # 4 | # pip-compile --generate-hashes requirements.in 5 | # 6 | dllogger @ https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip \ 7 | --hash=sha256:07d0cd9b9b56f454f0c186a0889137e9f94e1979fca3d35911967c874c93c191 8 | # via -r requirements.in -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | # Contributions 2 | 3 | We appreciate your interest in contributing! This project is currently **not** accepting external contributions. We may revisit this policy in the future. 4 | 5 | While we aren't accepting code contributions at this time, you can still get involved by reporting any bugs, feature requests or documentation improvements via GitHub issues. 6 | 7 | Thank you for your understanding! 
-------------------------------------------------------------------------------- /src/docker/nemo-24.05/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | # pip-compile --generate-hashes requirements.in 6 | # 7 | dllogger @ https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip \ 8 | --hash=sha256:07d0cd9b9b56f454f0c186a0889137e9f94e1979fca3d35911967c874c93c191 9 | # via -r requirements.in 10 | -------------------------------------------------------------------------------- /src/docker/nemo-24.07/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.11 3 | # by the following command: 4 | # 5 | # pip-compile --generate-hashes requirements.in 6 | # 7 | dllogger @ https://github.com/NVIDIA/dllogger/archive/refs/tags/v1.0.0.zip \ 8 | --hash=sha256:07d0cd9b9b56f454f0c186a0889137e9f94e1979fca3d35911967c874c93c191 9 | # via -r requirements.in 10 | -------------------------------------------------------------------------------- /src/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | This folder contains utilities used by the recipes. 4 | 5 | - [training_metrics](./training_metrics/): The utility to calculate training 6 | metrics, including average_step_time, TFLOPS per accelerator, and MFU. 7 | - [resiliency_metrics](./resiliency_metrics/): The utility to calculate 8 | resiliency metrics, including Goodput Percentage, Effective Computation 9 | time, and others. 
10 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install ninggu-ubench-8yfm . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-405b-bf16-gbs64-gpu128.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/ninggu-ubench-8yfm --set queue=a4x -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install ninggu-ubench-7crr . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-bf16-gbs2048-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/ninggu-ubench-7crr --set queue=a4x -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install ninggu-ubench-3975 . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-gbs1024-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/ninggu-ubench-3975 --set queue=a4x -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install vishwasreddy-ubench-7jew . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-seq8192-gbs256-mbs1-gpus16.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-7jew -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install vishwasreddy-ubench-8xnx . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-bf16-seq8192-gbs256-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-8xnx -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install ninggu-ubench-5mdk . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-fp8cs-gbs2048-gpus128.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/ninggu-ubench-5mdk --set queue=a4x -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install ninggu-ubench-8kyf . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-fp8cs-gbs2048-gpus256.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/ninggu-ubench-8kyf --set queue=a4x -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install sahiladu-ubench-distributed-5g3f . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-fp8cs-gbs2048-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/sahiladu-ubench-distributed-5g3f -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install vishwasreddy-ubench-distributed-vaxt . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-seq8192-gbs1024-mbs1-gpus16.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-distributed-vaxt -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install vishwasreddy-ubench-distributed-3ubr . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-seq8192-gbs2048-mbs1-gpus16.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-distributed-3ubr -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install vishwasreddy-ubench-distributed-8yb2 . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-70b-seq8192-gbs2048-mbs1-gpus256.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-distributed-8yb2 -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/recipe_launch_command.sh: -------------------------------------------------------------------------------- 1 | helm install tonyjohnchen-ubench-distributed-8zpy . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-fp8cs-gbs128-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/tonyjohnchen-ubench-distributed-8zpy --set queue=a4x -------------------------------------------------------------------------------- /src/frameworks/a3mega/maxtext-configs/llama-2-7b-128gpus-a3mega-bf16.yaml: -------------------------------------------------------------------------------- 1 | hardware: gpu 2 | dcn_data_parallelism: 16 3 | ici_fsdp_parallelism: 8 4 | per_device_batch_size: 4 5 | max_target_length: 4096 6 | model_name: llama2-7b 7 | enable_checkpointing: false 8 | attention: cudnn_flash_te 9 | remat_policy: minimal_flash 10 | use_iota_embed: true 11 | scan_layers: false 12 | dataset_type: synthetic 13 | logits_dot_in_fp32: false 14 | enable_goodput_recording: false 15 | monitor_goodput: false 16 | save_config_to_gcs: true 17 | 18 | -------------------------------------------------------------------------------- /src/frameworks/a3mega/maxtext-configs/llama-2-7b-256gpus-a3mega-bf16.yaml: -------------------------------------------------------------------------------- 1 | hardware: gpu 2 | dcn_data_parallelism: 32 3 | ici_fsdp_parallelism: 8 4 | per_device_batch_size: 4 5 | max_target_length: 4096 6 | model_name: llama2-7b 7 | enable_checkpointing: false 8 | attention: cudnn_flash_te 9 | remat_policy: minimal_flash 10 | use_iota_embed: true 11 | scan_layers: false 12 | dataset_type: synthetic 13 | logits_dot_in_fp32: false 14 | enable_goodput_recording: false 15 | monitor_goodput: false 16 | save_config_to_gcs: true 17 | 18 | -------------------------------------------------------------------------------- /src/utils/checkpointing_metrics/README.md: -------------------------------------------------------------------------------- 1 | # Checkpoint statistics 
calculator 2 | 3 | This Python utility calculates checkpoint write time statistics from NVIDIA NeMo log files. 4 | 5 | ## Usage 6 | 7 | ``` 8 | python calculate_checkpoint_metrics.py --gcs_logs_path <gcs_logs_path> 9 | 10 | ``` 11 | ### Required arguments 12 | 13 | - `--gcs_logs_path`: The path to NeMo logs in a GCS bucket. E.g. `gs://logs_bucket/experiment_name/experiment_version` 14 | 15 | ### Dependencies 16 | 17 | The utility uses the `google-cloud-storage` Python package. You can install the package to your Python environment using the following command. 18 | 19 | ``` 20 | pip install google-cloud-storage 21 | ``` -------------------------------------------------------------------------------- /src/frameworks/a4x/trtllm-configs/deepseek-r1-nvfp4.yaml: -------------------------------------------------------------------------------- 1 | tp_size: 4 2 | ep_size: 4 3 | pp_size: 1 4 | backend: pytorch 5 | kv_cache_free_gpu_mem_fraction: 0.85 6 | llm_api_args: 7 | cuda_graph_config: 8 | batch_sizes: 9 | - 1 10 | - 2 11 | - 4 12 | - 8 13 | - 16 14 | - 20 15 | - 24 16 | - 32 17 | - 64 18 | - 96 19 | - 128 20 | - 160 21 | - 192 22 | - 256 23 | - 320 24 | - 384 25 | - 512 26 | enable_padding: true 27 | enable_attention_dp: true 28 | enable_chunked_prefill: true 29 | kv_cache_config: 30 | dtype: auto 31 | enable_block_reuse: false 32 | free_gpu_memory_fraction: 0.85 33 | moe_config: 34 | backend: CUTLASS 35 | print_iter_log: true -------------------------------------------------------------------------------- /src/utils/resiliency_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from goodput_measure.tracker import GoodputLogger 16 | 17 | logging = GoodputLogger() 18 | -------------------------------------------------------------------------------- /src/docker/vllm/requirements.in: -------------------------------------------------------------------------------- 1 | aiohappyeyeballs==2.6.1 2 | aiohttp==3.11.16 3 | aiosignal==1.3.2 4 | attrs==25.3.0 5 | certifi==2025.1.31 6 | charset-normalizer==3.4.1 7 | datasets==3.5.0 8 | dill==0.3.8 9 | filelock==3.18.0 10 | frozenlist==1.5.0 11 | fsspec==2024.12.0 12 | hf_transfer==0.1.9 13 | huggingface-hub==0.30.1 14 | idna==3.10 15 | multidict==6.3.2 16 | multiprocess==0.70.16 17 | numpy==2.1.0 18 | packaging==24.2 19 | pandas==2.2.3 20 | propcache==0.3.1 21 | pyarrow==19.0.1 22 | python-dateutil==2.9.0.post0 23 | pytz==2025.2 24 | PyYAML==6.0.2 25 | regex==2024.11.6 26 | requests==2.32.3 27 | safetensors==0.5.3 28 | six==1.17.0 29 | tokenizers==0.21.1 30 | tqdm==4.67.1 31 | transformers==4.51.0 32 | typing_extensions==4.13.1 33 | tzdata==2025.2 34 | urllib3==2.3.0 35 | xxhash==3.5.0 36 | yarl==1.19.0 -------------------------------------------------------------------------------- /src/helm-charts/a4/job/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4u_job_workload 17 | description: a4u_job_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/job/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a3m_job_workload 17 | description: a3m_job_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/job/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a3u_job_workload 17 | description: a3u_job_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4/paligemma2/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/jobset/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4u_jobset_workload 17 | description: a4u_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/jobset/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a3u_jobset_workload 17 | description: a3u_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/jobset/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a3mega_jobset_workload 17 | description: a3mega_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/nemo-training/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: nemo_training_workload 17 | description: nemo_training_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nemo-training-v2/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: nemo_training_workload 17 | description: nemo_training_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nccl-tests/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: nccl-test 17 | description: Running NVIDIA NCCL Tests on GKE with A3+ and A3U GPUs 18 | type: application 19 | version: 0.1.0 20 | appVersion: "v2.14.0" -------------------------------------------------------------------------------- /src/helm-charts/a4/maxtext-training/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: maxtext_training_workload_a4 17 | description: maxtext_training_workload_a4 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/job/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/a3mega/job/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/job/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/storage/gcs-fuse/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: gcs-fuse-pv-pvc 17 | description: "GCS Fuse Persistent Volumes and Persistent Volume Claims" 18 | type: application 19 | version: 0.1.0 20 | appVersion: "0.1.0" -------------------------------------------------------------------------------- /src/helm-charts/a3mega/sglang-inference/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: sglang-deepseek-r1-671b-inference 17 | description: sglang-deepseek-r1-671b-inference 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/nemo-training/templates/nemo-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | data: 20 | nemo-configuration.yaml: |- 21 | {{ .Values.nemo_config | nindent 4 }} -------------------------------------------------------------------------------- /RL/a4/recipes/llama3.1-8b/nemoRL/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | description: rl-jobset-workload 17 | name: ray-cluster 18 | version: 1.1.1 19 | icon: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png 20 | -------------------------------------------------------------------------------- /RL/a4/recipes/qwen2.5-1.5b/nemoRL/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | description: rl-jobset-workload 17 | name: ray-cluster 18 | version: 1.1.1 19 | icon: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_header_logo.png 20 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nccl-tests/templates/nccl-test-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | data: 20 | run_test.sh: |- 21 | {{ .Files.Get "run_test.sh" | nindent 4 }} 22 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nccl-tests/templates/nccl-tests-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/a3mega/vllm-inference/multi-host/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: vllm-deepseek-r1-671b-inference 17 | description: vllm-deepseek-r1-671b-inference 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/vllm-inference/single-host/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: vllm-deepseek-r1-671b-inference 17 | description: vllm-deepseek-r1-671b-inference 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/nemo-training/templates/nemo-launcher-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: a4_jobset_workload 17 | description: a4_jobset_workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nemo-training-v2/templates/nemo-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | data: 20 | nemo-configuration.yaml: |- 21 | {{ .Values.nemo_config | nindent 4 }} -------------------------------------------------------------------------------- /src/helm-charts/a3mega/nemo-training-v2/templates/nemo-launcher-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/dynamo-deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: dynamo-single-node-deployment 17 | description: dynamo-single-node-deployment 18 | type: application 19 | version: 0.1.0 20 | appVersion: "0.4.0" -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/trtllm-inference/single-node/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: trtllm-llama-3-1-405b-inference 17 | description: trtllm-llama-3-1-405b-inference 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/storage/parallelstore/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: parallelstore-pv-pvc 17 | description: "Parallelstore Persistent Volumes and Persistent Volume Claims" 18 | type: application 19 | version: 0.1.0 20 | appVersion: "0.1.0" -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/16node-bf16-seq8192-gbs512-gcs/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: a4-jobset-workload 17 | description: a4-jobset-workload 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4/jobset/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" -------------------------------------------------------------------------------- /src/helm-charts/a4/maxtext-training/templates/maxtext-launcher-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | job-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/ksa-setup.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: supervisor-sa 5 | namespace: default 6 | 7 | --- 8 | 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | name: supervisor-role 13 | namespace: default 14 | rules: 15 | - apiGroups: ["", "jobset.x-k8s.io"] 16 | resources: ["pods", "nodes", "jobsets"] 17 | verbs: ["get", "list", "delete", "deletecollection", "patch", "create"] 18 | 19 | --- 20 | 21 | apiVersion: rbac.authorization.k8s.io/v1 22 | kind: ClusterRoleBinding 23 | metadata: 24 | name: supervisor-binding 25 | namespace: default 26 | subjects: 27 | - kind: ServiceAccount 28 | name: supervisor-sa 29 | namespace: default 30 | roleRef: 31 | kind: ClusterRole 32 | name: supervisor-role 33 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /training/a4/paligemma2/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /src/helm-charts/a4/inference-templates/deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: single-host-serving-deployment-template 17 | description: single-host-serving-deployment-template 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates/deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: single-host-serving-deployment-template 17 | description: single-host-serving-deployment-template 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | name: single-host-serving-deployment-template 17 | description: single-host-serving-deployment-template 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates-gcs/deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | name: single-host-serving-deployment-template 17 | description: single-host-serving-deployment-template 18 | type: application 19 | version: 0.1.0 20 | appVersion: "1.16.0" 21 | -------------------------------------------------------------------------------- /training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/ksa-setup.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: supervisor-sa 5 | namespace: default 6 | 7 | --- 8 | 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | name: supervisor-role 13 | namespace: default 14 | rules: 15 | - apiGroups: ["", "jobset.x-k8s.io"] 16 | resources: ["pods", "nodes", "jobsets"] 17 | verbs: ["get", "list", "delete", "deletecollection", "patch", "create", "watch"] 18 | 19 | --- 20 | 21 | apiVersion: rbac.authorization.k8s.io/v1 22 | kind: ClusterRoleBinding 23 | metadata: 24 | name: supervisor-binding 25 | namespace: default 26 | subjects: 27 | - kind: ServiceAccount 28 | name: supervisor-sa 29 | namespace: default 30 | roleRef: 31 | kind: ClusterRole 32 | name: supervisor-role 33 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/ksa-setup.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: supervisor-sa 5 | namespace: default 6 | 7 | --- 8 | 9 | apiVersion: rbac.authorization.k8s.io/v1 10 | kind: ClusterRole 11 | metadata: 12 | name: supervisor-role 13 | namespace: default 14 | rules: 15 | - apiGroups: ["", "jobset.x-k8s.io"] 16 | resources: ["pods", "nodes", "jobsets"] 17 | verbs: ["get", "list", "delete", "deletecollection", "patch", "create", "update", "watch"] 18 | 19 | --- 20 | 21 | apiVersion: 
rbac.authorization.k8s.io/v1 22 | kind: ClusterRoleBinding 23 | metadata: 24 | name: supervisor-binding 25 | namespace: default 26 | subjects: 27 | - kind: ServiceAccount 28 | name: supervisor-sa 29 | namespace: default 30 | roleRef: 31 | kind: ClusterRole 32 | name: supervisor-role 33 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/values-gcs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | gcsVolumes: 17 | - name: gcs-checkpoints 18 | type: checkpoints 19 | bucketName: -------------------------------------------------------------------------------- /training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/values-gcs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | gcsVolumes: 17 | - name: gcs-checkpoints 18 | type: checkpoints 19 | bucketName: -------------------------------------------------------------------------------- /training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/values-gcs.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | gcsVolumes: 17 | - name: gcs-checkpoints 18 | type: checkpoints 19 | bucketName: -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/16node-bf16-seq8192-gbs512-gcs/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/templates/workload-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: "{{ .Release.Name }}" 19 | spec: 20 | clusterIP: None 21 | selector: 22 | jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" 23 | -------------------------------------------------------------------------------- /training/a4/paligemma2/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: main.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: PYTHON_MAIN 31 | value: /workload/configs/main.py 32 | gpus: 32 33 | image: nvcr.io/nvidia/pytorch:25.01-py3 34 | -------------------------------------------------------------------------------- /src/docker/nemo-24.05/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--tag=${_ARTIFACT_REGISTRY}/nemo_workload:24.05' 20 | - '--file=nemo.Dockerfile' 21 | - '.' 22 | automapSubstitutions: true 23 | 24 | images: 25 | - '${_ARTIFACT_REGISTRY}/nemo_workload:24.05' 26 | 27 | -------------------------------------------------------------------------------- /src/docker/nemo-24.07/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--tag=${_ARTIFACT_REGISTRY}/nemo_workload:24.07' 20 | - '--file=nemo.Dockerfile' 21 | - '.' 22 | automapSubstitutions: true 23 | 24 | images: 25 | - '${_ARTIFACT_REGISTRY}/nemo_workload:24.07' 26 | 27 | -------------------------------------------------------------------------------- /src/docker/nemo-aotc-24.07/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--tag=${_ARTIFACT_REGISTRY}/nemo_workload:24.07' 20 | - '--file=nemo.Dockerfile' 21 | - '.' 22 | automapSubstitutions: true 23 | 24 | images: 25 | - '${_ARTIFACT_REGISTRY}/nemo_workload:24.07' -------------------------------------------------------------------------------- /src/helm-charts/a3mega/job/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4/job/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4/jobset/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3mega/jobset/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/job/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/jobset/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | workload-configuration: |- 21 | {{- if .Values.workload_config }} 22 | {{ .Values.workload_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4/inference-templates/deployment/templates/serving-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | serving-configuration: |- 21 | {{- if .Values.serving_config }} 22 | {{ .Values.serving_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates/deployment/templates/serving-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | serving-configuration: |- 21 | {{- if .Values.serving_config }} 22 | {{ .Values.serving_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/deployment/templates/serving-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | serving-configuration: |- 21 | {{- if .Values.serving_config }} 22 | {{ .Values.serving_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | data: 20 | serving-configuration: |- 21 | {{- if .Values.serving_config }} 22 | {{ .Values.serving_config | nindent 4 }} 23 | {{- else }} 24 | {{ "config: null" | nindent 4 }} 25 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/storage/parallelstore/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | projectID: 16 | zone: 17 | network: 18 | accessPoints: 19 | instanceName: 20 | capacity: 21 | 22 | psVolumes: 23 | - name: ps-data 24 | type: data 25 | - name: ps-checkpoints 26 | type: checkpoints 27 | -------------------------------------------------------------------------------- /training/a4/paligemma2/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /RL/a4/recipes/llama3.1-8b/nemoRL/templates/fluent-bit-config.yaml.j2: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- if .Values.configMap.fluentbit }} 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: {{ include "ray-cluster.fullname" . }}-fluentbit-config 20 | labels: 21 | {{- include "ray-cluster.labels" . 
| nindent 4 }} 22 | data: 23 | {{- toYaml .Values.configMap.fluentbit.data | nindent 2 }} 24 | {{- end }} -------------------------------------------------------------------------------- /RL/a4/recipes/qwen2.5-1.5b/nemoRL/templates/fluent-bit-config.yaml.j2: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- if .Values.configMap.fluentbit }} 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: {{ include "ray-cluster.fullname" . }}-fluentbit-config 20 | labels: 21 | {{- include "ray-cluster.labels" . | nindent 4 }} 22 | data: 23 | {{- toYaml .Values.configMap.fluentbit.data | nindent 2 }} 24 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4/job/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /src/docker/vllm/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--build-arg' 20 | - "VLLM_VERSION=${_VLLM_VERSION}" 21 | - '--tag=${_ARTIFACT_REGISTRY}/${_VLLM_IMAGE}:${_VLLM_VERSION}' 22 | - '--file=vllm.Dockerfile' 23 | - '.' 
24 | automapSubstitutions: true 25 | 26 | images: 27 | - '${_ARTIFACT_REGISTRY}/${_VLLM_IMAGE}:${_VLLM_VERSION}' -------------------------------------------------------------------------------- /src/helm-charts/a3mega/job/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/job/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4/inference-templates/deployment/templates/serving-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.service.ports.http }} 25 | targetPort: {{ .Values.service.ports.http }} 26 | type: {{ .Values.service.type }} -------------------------------------------------------------------------------- /src/helm-charts/a4/jobset/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} 28 | -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates/deployment/templates/serving-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.service.ports.http }} 25 | targetPort: {{ .Values.service.ports.http }} 26 | type: {{ .Values.service.type }} -------------------------------------------------------------------------------- /src/helm-charts/storage/gcs-fuse/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | gcsVolumes: 17 | - name: gcs-data 18 | type: data 19 | bucketName: 20 | - name: gcs-checkpoints 21 | type: checkpoints 22 | bucketName: 23 | - name: gcs-serving-model 24 | type: serving-model 25 | bucketName: 26 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/jobset/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.service.ports.http }} 25 | targetPort: {{ .Values.service.ports.http }} 26 | type: {{ .Values.service.type }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/jobset/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} 28 | -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.service.ports.http }} 25 | targetPort: {{ .Values.service.ports.http }} 26 | type: {{ .Values.service.type }} -------------------------------------------------------------------------------- /src/docker/sglang/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--build-arg' 20 | - "SGLANG_VERSION=${_SGLANG_VERSION}" 21 | - '--tag=${_ARTIFACT_REGISTRY}/${_SGLANG_IMAGE}:${_SGLANG_VERSION}' 22 | - '--file=sglang.Dockerfile' 23 | - '.' 24 | automapSubstitutions: true 25 | 26 | images: 27 | - '${_ARTIFACT_REGISTRY}/${_SGLANG_IMAGE}:${_SGLANG_VERSION}' -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.6 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-8b-fp8cs-gbs128-gpus64.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-8b-fp8cs-gbs128-gpus64.py 32 | gpus: 64 33 | image: nvcr.io/nvidia/nemo:25.07 34 | 
-------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-70b-seq8192-gbs256-mbs1-gpus16.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-seq8192-gbs256-mbs1-gpus16.py 32 | gpus: 16 33 | image: nvcr.io/nvidia/nemo:25.07 34 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: 
llama3-1-70b-bf16-seq8192-gbs256-gpus64.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-bf16-seq8192-gbs256-gpus64.py 32 | gpus: 64 33 | image: nvcr.io/nvidia/nemo:25.07 34 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.6 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-70b-fp8cs-gbs2048-gpus64.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-fp8cs-gbs2048-gpus64.py 32 | gpus: 64 33 | image: nvcr.io/nvidia/nemo:25.07 34 | -------------------------------------------------------------------------------- /src/helm-charts/a4/inference-templates/deployment/templates/serving-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates/deployment/templates/serving-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-70b-seq8192-gbs1024-mbs1-gpus16.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-seq8192-gbs1024-mbs1-gpus16.py 32 | gpus: 16 33 | image: nvcr.io/nvidia/nemo:25.07 34 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-70b-seq8192-gbs2048-mbs1-gpus16.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-seq8192-gbs2048-mbs1-gpus16.py 32 | gpus: 16 33 | image: nvcr.io/nvidia/nemo:25.07 34 | 
-------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/paligemma2/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /src/helm-charts/a3mega/vllm-inference/single-host/templates/model-serve-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.vllm.service.ports.http }} 25 | targetPort: {{ .Values.vllm.service.ports.http }} 26 | type: {{ .Values.vllm.service.type }} 27 | -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/deployment/templates/serving-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/dynamo-deployment/templates/dynamo-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-config" 19 | namespace: {{ .Values.dynamo.namespace }} 20 | data: 21 | serving-configuration: |- 22 | {{- if .Values.serving_config }} 23 | {{ .Values.serving_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/a4x/inference-templates-gcs/deployment/templates/serving-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | data: 20 | launch-workload.sh: |- 21 | {{- if .Values.workload_launcher }} 22 | {{ .Values.workload_launcher | nindent 4 }} 23 | {{- else }} 24 | #!/bin/bash 25 | echo "No workload launcher specified" 26 | exit 1 27 | {{- end }} -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | queue: null 11 | tasSettings: 12 | topologyRequest: 13 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 14 | volumes: 15 | gcsMounts: 16 | - bucketName: null 17 | mountPath: null 18 | gcsVolumes: true 19 | psVolumes: false 20 | workload: 21 | arguments[]: null 22 | configFile: llama3-1-70b-seq8192-gbs2048-mbs1-gpus256.py 23 | configPath: /workload/configs/ 24 | defaultArguments[]: null 25 | envs: 26 | - name: ARTIFACT_DIR 27 | value: null 28 | - name: GLOO_SOCKET_IFNAME 29 | value: eth0 30 | - name: NEMO_LAUNCH_SCRIPT 31 | value: /workload/configs/llama3-1-70b-seq8192-gbs2048-mbs1-gpus256.py 32 | gpus: 256 33 | image: nvcr.io/nvidia/nemo:25.07 34 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload_config }} 23 | {{ .Values.workload_config | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/16node-bf16-seq8192-gbs512-gcs/templates/workload-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-config" 20 | data: 21 | workload-configuration: |- 22 | {{- if .Values.workload.configFile }} 23 | {{ .Files.Get .Values.workload.configFile | nindent 4 }} 24 | {{- else }} 25 | {{ "config: null" | nindent 4 }} 26 | {{- end }} 27 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs1024/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/2node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-FP8CS-GBS128/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/32node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-FP8CS-GBS2048/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /src/helm-charts/a3ultra/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: "{{ .Release.Name }}-launcher" 19 | namespace: {{ .Values.dynamo.namespace }} 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} -------------------------------------------------------------------------------- /training/a4/llama3-1-70b/nemo-pretraining-gke/16node-bf16-seq8192-gbs512-gcs/templates/workload-launcher-configmap.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable 2 | # Copyright 2025 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | apiVersion: v1 17 | kind: ConfigMap 18 | metadata: 19 | name: "{{ .Release.Name }}-launcher" 20 | data: 21 | launch-workload.sh: |- 22 | {{- if .Values.workload_launcher }} 23 | {{ .Files.Get .Values.workload_launcher | nindent 4 }} 24 | {{- else }} 25 | #!/bin/bash 26 | echo "No workload launcher specified" 27 | exit 1 28 | {{- end }} 29 | -------------------------------------------------------------------------------- /src/docker/trtllm/cloudbuild.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - name: 'gcr.io/cloud-builders/docker' 17 | args: 18 | - 'build' 19 | - '--build-arg' 20 | - "TRTLLM_VERSION=${_TRTLLM_VERSION}" 21 | - '--build-arg' 22 | - "TRITON_SERVER_VERSION=${_TRITON_SERVER_VERSION}" 23 | - '--tag=${_ARTIFACT_REGISTRY}/${_TRTLLM_IMAGE}:${_TRTLLM_VERSION}' 24 | - '--file=trtllm.Dockerfile' 25 | - '.' 
26 | automapSubstitutions: true 27 | 28 | images: 29 | - '${_ARTIFACT_REGISTRY}/${_TRTLLM_IMAGE}:${_TRTLLM_VERSION}' -------------------------------------------------------------------------------- /src/helm-charts/a3mega/sglang-inference/templates/lws-deployment-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.sglang.service.ports.http }} 25 | targetPort: {{ .Values.sglang.service.ports.http }} 26 | selector: 27 | leaderworkerset.sigs.k8s.io/name: {{ .Release.Name }} 28 | role: leader 29 | type: {{ .Values.sglang.service.type }} -------------------------------------------------------------------------------- /src/helm-charts/a3mega/vllm-inference/multi-host/templates/lws-deployment-svc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ .Release.Name }}-svc 19 | spec: 20 | selector: 21 | app: {{ .Release.Name }}-serving 22 | ports: 23 | - name: http 24 | port: {{ .Values.vllm.service.ports.http }} 25 | targetPort: {{ .Values.vllm.service.ports.http }} 26 | selector: 27 | leaderworkerset.sigs.k8s.io/name: {{ .Release.Name }} 28 | role: leader 29 | type: {{ .Values.vllm.service.type }} 30 | -------------------------------------------------------------------------------- /src/helm-charts/storage/parallelstore/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- $root := . 
-}} 16 | {{- range $ps := .Values.psVolumes }} 17 | apiVersion: v1 18 | kind: PersistentVolumeClaim 19 | metadata: 20 | name: "{{ $ps.name }}-pvc" 21 | namespace: {{ default "default" $ps.namespace }} 22 | spec: 23 | accessModes: 24 | - ReadWriteMany 25 | resources: 26 | requests: 27 | storage: {{ default "12000Gi" $root.Values.capacity }} 28 | storageClassName: parallelstore-storage 29 | volumeName: {{ $ps.name }}-pv 30 | --- 31 | {{- end }} -------------------------------------------------------------------------------- /src/helm-charts/storage/gcs-fuse/templates/pvc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | {{- range $gcs := .Values.gcsVolumes }} 16 | {{- if $gcs.bucketName }} 17 | apiVersion: v1 18 | kind: PersistentVolumeClaim 19 | metadata: 20 | name: "{{ $gcs.name }}-pvc" 21 | namespace: {{ default "default" $gcs.namespace }} 22 | spec: 23 | accessModes: 24 | - ReadWriteMany 25 | resources: 26 | requests: 27 | storage: {{ default "2048Gi" $gcs.capacity }} 28 | storageClassName: gcs-fuse-storage 29 | volumeName: {{ $gcs.name }}-pv 30 | --- 31 | {{- end }} 32 | {{- end }} 33 | -------------------------------------------------------------------------------- /src/utils/resiliency_metrics/constant.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Goodput Measure related constant variables""" 16 | 17 | USER_SCHEDULED = "user_scheduled" 18 | USER_TERMINATED = "user_terminated" 19 | JOB_STARTED = "job_started" 20 | JOB_TERMINATED = "job_terminated" 21 | CHECKPOINT_LOADED = "checkpoint_loaded" 22 | CHECKPOINT_SAVED = "checkpoint_saved" 23 | 24 | 25 | EVENT_TYPE_ORDER = { 26 | USER_SCHEDULED: 0, 27 | JOB_STARTED: 1, 28 | CHECKPOINT_LOADED: 2, 29 | CHECKPOINT_SAVED: 3, 30 | USER_TERMINATED: 4, 31 | JOB_TERMINATED: 5, 32 | } 33 | -------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-405b-256gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-405b 17 | 18 | hardware: gpu 19 | dcn_fsdp_parallelism: 32 20 | ici_fsdp_parallelism: 8 21 | per_device_batch_size: 2 22 | max_target_length: 8192 23 | learning_rate: 0.001 24 | enable_checkpointing: false 25 | quantization: fp8 26 | attention: cudnn_flash_te 27 | remat_policy: full 28 | use_iota_embed: true 29 | dataset_type: synthetic 30 | logits_dot_in_fp32: false 31 | enable_goodput_recording: false 32 | monitor_goodput: false 33 | save_config_to_gcs: true 34 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-405b-256gpus-a4-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-405b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 2 20 | dcn_fsdp_parallelism: 16 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | attention: cudnn_flash_te 27 | remat_policy: full 28 | use_iota_embed: true 29 | dataset_type: synthetic 30 | logits_dot_in_fp32: false 31 | enable_goodput_recording: false 32 | monitor_goodput: false 33 | save_config_to_gcs: true 34 | 35 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-405b/nemo-pretraining-gke/32node-BF16-GBS64/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | nodeSelector: 11 | cloud.google.com/gke-accelerator: nvidia-gb200 12 | cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c 13 | queue: null 14 | tasSettings: 15 | topologyRequest: 16 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 17 | volumes: 18 | gcsMounts: 19 | - bucketName: null 20 | mountPath: null 21 | gcsVolumes: true 22 | psVolumes: false 23 | workload: 24 | arguments[]: null 25 | configFile: llama3-1-405b-bf16-gbs64-gpu128.py 26 | configPath: /workload/configs/ 27 | defaultArguments[]: null 28 | envs: 29 | - name: ARTIFACT_DIR 30 | value: null 31 | - name: GLOO_SOCKET_IFNAME 32 | value: eth0 33 | - name: NEMO_LAUNCH_SCRIPT 34 | value: /workload/configs/llama3-1-405b-bf16-gbs64-gpu128.py 35 | gpus: 128 36 | image: nvcr.io/nvidia/nemo:25.07 37 | serviceAccountName: null 38 | -------------------------------------------------------------------------------- 
/training/a4x/llama3-1-70b/nemo-pretraining-gke/16node-BF16-GBS2048/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | nodeSelector: 11 | cloud.google.com/gke-accelerator: nvidia-gb200 12 | cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c 13 | queue: null 14 | tasSettings: 15 | topologyRequest: 16 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 17 | volumes: 18 | gcsMounts: 19 | - bucketName: null 20 | mountPath: null 21 | gcsVolumes: true 22 | psVolumes: false 23 | workload: 24 | arguments[]: null 25 | configFile: llama3-1-70b-bf16-gbs2048-gpus64.py 26 | configPath: /workload/configs/ 27 | defaultArguments[]: null 28 | envs: 29 | - name: ARTIFACT_DIR 30 | value: null 31 | - name: GLOO_SOCKET_IFNAME 32 | value: eth0 33 | - name: NEMO_LAUNCH_SCRIPT 34 | value: /workload/configs/llama3-1-70b-bf16-gbs2048-gpus64.py 35 | gpus: 64 36 | image: nvcr.io/nvidia/nemo:25.07 37 | serviceAccountName: null 38 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-8b/nemo-pretraining-gke/16node-BF16-GBS1024/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | nodeSelector: 11 | cloud.google.com/gke-accelerator: nvidia-gb200 12 | cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c 13 | queue: null 14 | tasSettings: 15 | topologyRequest: 16 | 
kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 17 | volumes: 18 | gcsMounts: 19 | - bucketName: null 20 | mountPath: null 21 | gcsVolumes: true 22 | psVolumes: false 23 | workload: 24 | arguments[]: null 25 | configFile: llama3-1-8b-bf16-gbs1024-gpus64.py 26 | configPath: /workload/configs/ 27 | defaultArguments[]: null 28 | envs: 29 | - name: ARTIFACT_DIR 30 | value: null 31 | - name: GLOO_SOCKET_IFNAME 32 | value: eth0 33 | - name: NEMO_LAUNCH_SCRIPT 34 | value: /workload/configs/llama3-1-8b-bf16-gbs1024-gpus64.py 35 | gpus: 64 36 | image: nvcr.io/nvidia/nemo:25.07 37 | serviceAccountName: null 38 | -------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-405b-512gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-405b 17 | 18 | 19 | hardware: gpu 20 | dcn_fsdp_parallelism: 64 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | quantization: fp8 27 | attention: cudnn_flash_te 28 | remat_policy: full 29 | use_iota_embed: true 30 | dataset_type: synthetic 31 | logits_dot_in_fp32: false 32 | enable_goodput_recording: false 33 | monitor_goodput: false 34 | save_config_to_gcs: true 35 | 36 | -------------------------------------------------------------------------------- /training/a4x/llama3-1-70b/nemo-pretraining-gke/32node-FP8CS-GBS2048/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | nodeSelector: 11 | cloud.google.com/gke-accelerator: nvidia-gb200 12 | cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c 13 | queue: null 14 | tasSettings: 15 | topologyRequest: 16 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 17 | volumes: 18 | gcsMounts: 19 | - bucketName: null 20 | mountPath: null 21 | gcsVolumes: true 22 | psVolumes: false 23 | workload: 24 | arguments[]: null 25 | configFile: llama3-1-70b-fp8cs-gbs2048-gpus128.py 26 | configPath: /workload/configs/ 27 | defaultArguments[]: null 28 | envs: 29 | - name: ARTIFACT_DIR 30 | value: null 31 | - name: GLOO_SOCKET_IFNAME 32 | value: eth0 33 | - name: NEMO_LAUNCH_SCRIPT 34 | value: /workload/configs/llama3-1-70b-fp8cs-gbs2048-gpus128.py 35 | gpus: 128 36 | image: nvcr.io/nvidia/nemo:25.07 37 | serviceAccountName: null 38 | -------------------------------------------------------------------------------- 
/training/a4x/llama3-1-70b/nemo-pretraining-gke/64node-FP8CS-GBS2048/recipe/values.yaml: -------------------------------------------------------------------------------- 1 | dwsSettings: 2 | maxRunDurationSeconds: null 3 | network: 4 | gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.0.7 5 | hostNetwork: true 6 | ncclSettings: 7 | - name: NCCL_DEBUG 8 | value: WARN 9 | subnetworks[]: null 10 | nodeSelector: 11 | cloud.google.com/gke-accelerator: nvidia-gb200 12 | cloud.google.com/placement-policy-name: a4x-workload-policy-95cbc61c 13 | queue: null 14 | tasSettings: 15 | topologyRequest: 16 | kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname 17 | volumes: 18 | gcsMounts: 19 | - bucketName: null 20 | mountPath: null 21 | gcsVolumes: true 22 | psVolumes: false 23 | workload: 24 | arguments[]: null 25 | configFile: llama3-1-70b-fp8cs-gbs2048-gpus256.py 26 | configPath: /workload/configs/ 27 | defaultArguments[]: null 28 | envs: 29 | - name: ARTIFACT_DIR 30 | value: null 31 | - name: GLOO_SOCKET_IFNAME 32 | value: eth0 33 | - name: NEMO_LAUNCH_SCRIPT 34 | value: /workload/configs/llama3-1-70b-fp8cs-gbs2048-gpus256.py 35 | gpus: 256 36 | image: nvcr.io/nvidia/nemo:25.07 37 | serviceAccountName: null 38 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-70b-256gpus-a4-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-70b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 1 20 | dcn_fsdp_parallelism: 32 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 8 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | attention: cudnn_flash_te 27 | remat_policy: full 28 | use_iota_embed: true 29 | dataset_type: synthetic 30 | logits_dot_in_fp32: false 31 | scan_layers: True 32 | enable_goodput_recording: false 33 | monitor_goodput: false 34 | save_config_to_gcs: true 35 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-405b-1024gpus-a4-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-405b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 8 20 | dcn_fsdp_parallelism: 16 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | attention: cudnn_flash_te 27 | remat_policy: full 28 | use_iota_embed: true 29 | scan_layers: true 30 | dataset_type: synthetic 31 | logits_dot_in_fp32: false 32 | enable_goodput_recording: false 33 | monitor_goodput: false 34 | save_config_to_gcs: true 35 | 36 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-405b-1024gpus-a4-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-405b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 4 20 | dcn_fsdp_parallelism: 32 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | quantization: fp8 27 | attention: cudnn_flash_te 28 | remat_policy: full 29 | use_iota_embed: true 30 | dataset_type: synthetic 31 | logits_dot_in_fp32: false 32 | enable_goodput_recording: false 33 | monitor_goodput: false 34 | save_config_to_gcs: true 35 | 36 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-405b-256gpus-a4-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
#!/bin/bash
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs `vllm serve` for the model named by MODEL_NAME, forwarding every
# argument given to this script to the server command unchanged.
#
# Usage:       vllm-launcher.sh [vllm serve arguments...]
# Environment: MODEL_NAME (required) - first positional argument of
#              `vllm serve`.
# Exit status: 1 when MODEL_NAME is unset or empty. Because of `set -e`, a
#              non-zero exit from `vllm serve` ends the script with that
#              status before the final message is printed.

# -e: stop at the first failing command.
# -u: expanding an unset variable is an error.
# -x: trace every command before it runs.
set -eux

echo "vLLM server arguments received:"
# "$*" joins the arguments into a single word; the printed text is the same
# as with "$@", without mixing a string and an array expansion.
echo " $*"
echo ""

echo "Launching vLLM server"

# MODEL_NAME should be passed as an environment variable from deployment.
# The ":-" default keeps `set -u` from aborting with "unbound variable" here,
# so the explicit error below is what the operator actually sees.
if [[ -z "${MODEL_NAME:-}" ]]; then
  echo "Error: MODEL_NAME environment variable is not set." >&2
  exit 1
fi
echo "Using MODEL_NAME: $MODEL_NAME"

# Launch the server. This call blocks for as long as the server runs.
vllm serve "$MODEL_NAME" \
  "$@"

# Only reached once `vllm serve` has returned successfully.
echo "Server bringup is complete. vLLM server command finished."
-------------------------------------------------------------------------------- /src/utils/checkpointing_metrics/log_patterns.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """The log patterns for processing checkpointing metrics.""" 16 | 17 | # The pattern of the log file name. 18 | NEMO_LOG_FILE_NAME = r"nemo_log_globalrank-(\d+)_localrank-(\d+)" 19 | 20 | # The timestamp pattern in NeMo logs. 21 | NEMO_LOG_TIMESTAMP = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})" 22 | 23 | # The pattern of the checkpoint saving start log. 24 | CHECKPOINT_WRITE_START = r"Checkpoint save for step (\d+) started" 25 | # The pattern of the checkpoint saving end log. 26 | CHECKPOINT_WRITE_END = ( 27 | r"Async checkpoint save for step (\d+) .* finalized successfully" 28 | ) -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-70b-1024gpus-a4-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-70b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 8 20 | dcn_fsdp_parallelism: 16 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | attention: cudnn_flash_te 27 | remat_policy: save_dot_with_context_except_mlp 28 | use_iota_embed: true 29 | dataset_type: synthetic 30 | logits_dot_in_fp32: false 31 | scan_layers: True 32 | enable_goodput_recording: false 33 | monitor_goodput: false 34 | save_config_to_gcs: true 35 | -------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-70b-256gpus-a4-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-70b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 1 20 | dcn_fsdp_parallelism: 32 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 8 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | 26 | enable_checkpointing: false 27 | quantization: fp8 28 | attention: cudnn_flash_te 29 | remat_policy: full 30 | use_iota_embed: true 31 | dataset_type: synthetic 32 | logits_dot_in_fp32: false 33 | scan_layers: True 34 | enable_goodput_recording: false 35 | monitor_goodput: false 36 | save_config_to_gcs: true 37 | -------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-512gpus-a3u-fp8-gbs2048.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-70b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 4 20 | dcn_fsdp_parallelism: 16 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 4 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | quantization: fp8 27 | attention: cudnn_flash_te 28 | remat_policy: save_out_proj 29 | use_iota_embed: true 30 | scan_layers: true 31 | dataset_type: synthetic 32 | logits_dot_in_fp32: false 33 | enable_goodput_recording: false 34 | monitor_goodput: false 35 | save_config_to_gcs: true -------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-256gpus-a3u-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-70b
hardware: gpu

# Parallelism axes: 2 x 16 x 8 = 256, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 2
dcn_fsdp_parallelism: 16
ici_fsdp_parallelism: 8

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Attention kernel and rematerialization choices. No quantization key is set
# in this file (its name says bf16); presumably base.yml's value applies.
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: save_dot_with_context_except_mlp
scan_layers: true
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-512gpus-a3u-bf16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-70b
hardware: gpu

# Parallelism axes: 4 x 16 x 8 = 512, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 4
dcn_fsdp_parallelism: 16
ici_fsdp_parallelism: 8

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Attention kernel and rematerialization choices. No quantization key is set
# in this file (its name says bf16); presumably base.yml's value applies.
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: save_dot_with_context_except_mlp
scan_layers: true
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-405b-1024gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-405b
hardware: gpu

# Parallelism axes: 2 x 1 x 64 x 8 = 1024, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 2
ici_data_parallelism: 1
dcn_fsdp_parallelism: 64
ici_fsdp_parallelism: 8

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Precision, attention kernel and rematerialization choices.
quantization: fp8
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: full
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-405b-768gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-405b
hardware: gpu

# Parallelism axes: the two dcn axes multiply to 3 x 32 = 96.
# NOTE(review): ici_fsdp_parallelism of -1 presumably lets MaxText infer that
# axis (768 GPUs in the file name / 96 = 8) -- confirm against the MaxText
# sharding documentation.
dcn_data_parallelism: 3
dcn_fsdp_parallelism: 32
ici_fsdp_parallelism: -1
ici_tensor_parallelism: 1

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Precision, attention kernel and rematerialization choices.
quantization: fp8
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: full
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-1024gpus-a3u-fp8-gbs2048.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-70b
hardware: gpu

# Parallelism axes: 16 x 8 x 8 = 1024, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 16
dcn_fsdp_parallelism: 8
ici_fsdp_parallelism: 8

# Batch and sequence shape: 2 per device x 1024 devices = 2048 ("gbs2048").
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Precision, attention kernel and rematerialization choices.
quantization: fp8
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: save_dot_except_mlp
scan_layers: true
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true
-------------------------------------------------------------------------------- /src/frameworks/a4/maxtext-configs/llama3-1-70b-1024gpus-a4-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-70b
hardware: gpu

# Parallelism axes: 8 x 16 x 8 = 1024, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 8
dcn_fsdp_parallelism: 16
ici_fsdp_parallelism: 8

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Precision, attention kernel and rematerialization choices.
quantization: fp8
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: save_dot_with_context_except_mlp
scan_layers: True
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-256gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
# Settings applied on top of base.yml for this run.
base_config: "base.yml"
model_name: llama3.1-70b
hardware: gpu

# Parallelism axes: 4 x 8 x 8 = 256, the GPU count in this file's name.
# NOTE(review): dcn_* presumably shards across hosts and ici_* within a host
# -- confirm against the MaxText sharding documentation.
dcn_data_parallelism: 4
dcn_fsdp_parallelism: 8
ici_fsdp_parallelism: 8

# Batch and sequence shape.
per_device_batch_size: 2
max_target_length: 8192
learning_rate: 0.001

# Precision, attention kernel and rematerialization choices.
quantization: fp8
attention: cudnn_flash_te
logits_dot_in_fp32: false
remat_policy: save_dot_with_context_except_mlp
scan_layers: true
use_iota_embed: true

# Synthetic input data.
dataset_type: synthetic

# Checkpointing and goodput tracking are disabled; save_config_to_gcs is enabled.
enable_checkpointing: false
enable_goodput_recording: false
monitor_goodput: false
save_config_to_gcs: true

-------------------------------------------------------------------------------- /src/frameworks/a3ultra/maxtext-configs/llama3-1-70b-512gpus-a3u-fp8.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | base_config: "base.yml" 16 | model_name: llama3.1-70b 17 | 18 | hardware: gpu 19 | dcn_data_parallelism: 8 20 | dcn_fsdp_parallelism: 8 21 | ici_fsdp_parallelism: 8 22 | per_device_batch_size: 2 23 | max_target_length: 8192 24 | learning_rate: 0.001 25 | enable_checkpointing: false 26 | quantization: fp8 27 | attention: cudnn_flash_te 28 | remat_policy: save_dot_with_context_except_mlp 29 | use_iota_embed: true 30 | scan_layers: true 31 | dataset_type: synthetic 32 | logits_dot_in_fp32: false 33 | enable_goodput_recording: false 34 | monitor_goodput: false 35 | save_config_to_gcs: true 36 | 37 | -------------------------------------------------------------------------------- /src/launchers/dynamo-vllm-launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2025 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -eux # Exit immediately if a command exits with a non-zero status. 18 | 19 | echo "Dynamo vLLM launcher starting" 20 | echo "Arguments received: $@" 21 | 22 | # MODEL_NAME should be passed as an environment variable from deployment 23 | if [ -z "$MODEL_NAME" ]; then 24 | echo "Error: MODEL_NAME environment variable is not set." 
25 | exit 1 26 | fi 27 | echo "Using MODEL_NAME: $MODEL_NAME" 28 | 29 | # Launch the Dynamo vLLM server 30 | echo "Launching Dynamo vLLM server with model: $MODEL_NAME" 31 | python3 -m dynamo.vllm \ 32 | --model "$MODEL_NAME" \ 33 | "$@" 34 | 35 | echo "Dynamo vLLM server command finished." -------------------------------------------------------------------------------- /training/a3mega/llama3-1-70b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | controller_manager_config.yaml: | 3 | apiVersion: config.kueue.x-k8s.io/v1beta1 4 | kind: Configuration 5 | health: 6 | healthProbeBindAddress: :8081 7 | metrics: 8 | bindAddress: :8443 9 | webhook: 10 | port: 9443 11 | leaderElection: 12 | leaderElect: true 13 | resourceName: c1f6bfd2.kueue.x-k8s.io 14 | controller: 15 | groupKindConcurrency: 16 | Job.batch: 5 17 | Pod: 5 18 | Workload.kueue.x-k8s.io: 5 19 | LocalQueue.kueue.x-k8s.io: 1 20 | Cohort.kueue.x-k8s.io: 1 21 | ClusterQueue.kueue.x-k8s.io: 1 22 | ResourceFlavor.kueue.x-k8s.io: 1 23 | clientConnection: 24 | qps: 50 25 | burst: 100 26 | waitForPodsReady: 27 | enable: true 28 | timeout: 1m 29 | recoveryTimeout: 1m 30 | integrations: 31 | frameworks: 32 | - "batch/job" 33 | - "kubeflow.org/mpijob" 34 | - "ray.io/rayjob" 35 | - "ray.io/raycluster" 36 | - "jobset.x-k8s.io/jobset" 37 | - "kubeflow.org/paddlejob" 38 | - "kubeflow.org/pytorchjob" 39 | - "kubeflow.org/tfjob" 40 | - "kubeflow.org/xgboostjob" 41 | - "workload.codeflare.dev/appwrapper" -------------------------------------------------------------------------------- /training/a3ultra/mixtral-8x7b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | controller_manager_config.yaml: | 3 | apiVersion: config.kueue.x-k8s.io/v1beta1 4 | kind: Configuration 5 | health: 6 | healthProbeBindAddress: :8081 7 | metrics: 8 | 
bindAddress: :8443 9 | webhook: 10 | port: 9443 11 | leaderElection: 12 | leaderElect: true 13 | resourceName: c1f6bfd2.kueue.x-k8s.io 14 | controller: 15 | groupKindConcurrency: 16 | Job.batch: 5 17 | Pod: 5 18 | Workload.kueue.x-k8s.io: 5 19 | LocalQueue.kueue.x-k8s.io: 1 20 | Cohort.kueue.x-k8s.io: 1 21 | ClusterQueue.kueue.x-k8s.io: 1 22 | ResourceFlavor.kueue.x-k8s.io: 1 23 | clientConnection: 24 | qps: 50 25 | burst: 100 26 | waitForPodsReady: 27 | enable: true 28 | timeout: 1m 29 | recoveryTimeout: 1m 30 | integrations: 31 | frameworks: 32 | - "batch/job" 33 | - "kubeflow.org/mpijob" 34 | - "ray.io/rayjob" 35 | - "ray.io/raycluster" 36 | - "jobset.x-k8s.io/jobset" 37 | - "kubeflow.org/paddlejob" 38 | - "kubeflow.org/pytorchjob" 39 | - "kubeflow.org/tfjob" 40 | - "kubeflow.org/xgboostjob" 41 | - "workload.codeflare.dev/appwrapper" -------------------------------------------------------------------------------- /training/a3ultra/llama3-1-405b/nemo-pretraining-gke-resiliency/kueue-merge-patch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | controller_manager_config.yaml: | 3 | apiVersion: config.kueue.x-k8s.io/v1beta1 4 | kind: Configuration 5 | health: 6 | healthProbeBindAddress: :8081 7 | metrics: 8 | bindAddress: :8443 9 | webhook: 10 | port: 9443 11 | leaderElection: 12 | leaderElect: true 13 | resourceName: c1f6bfd2.kueue.x-k8s.io 14 | controller: 15 | groupKindConcurrency: 16 | Job.batch: 5 17 | Pod: 5 18 | Workload.kueue.x-k8s.io: 5 19 | LocalQueue.kueue.x-k8s.io: 1 20 | Cohort.kueue.x-k8s.io: 1 21 | ClusterQueue.kueue.x-k8s.io: 1 22 | ResourceFlavor.kueue.x-k8s.io: 1 23 | clientConnection: 24 | qps: 50 25 | burst: 100 26 | waitForPodsReady: 27 | enable: true 28 | timeout: 1m 29 | recoveryTimeout: 1m 30 | integrations: 31 | frameworks: 32 | - "batch/job" 33 | - "kubeflow.org/mpijob" 34 | - "ray.io/rayjob" 35 | - "ray.io/raycluster" 36 | - "jobset.x-k8s.io/jobset" 37 | - "kubeflow.org/paddlejob" 38 | - 
"kubeflow.org/pytorchjob" 39 | - "kubeflow.org/tfjob" 40 | - "kubeflow.org/xgboostjob" 41 | - "workload.codeflare.dev/appwrapper" -------------------------------------------------------------------------------- /src/docker/vllm/vllm.Dockerfile: --------------------------------------------------------------------------------
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Image built on top of the upstream vllm-openai image. It adds pciutils, the
# hash-pinned packages from requirements.txt, ray_init.sh and a clone of the
# vLLM source tree under /workspace/vllm.

# Selects both the base image tag and the git ref checked out further down.
ARG VLLM_VERSION="latest"

FROM docker.io/vllm/vllm-openai:${VLLM_VERSION}
# An ARG declared before FROM is out of scope after it; redeclare it so the
# RUN step below can read the value.
ARG VLLM_VERSION

WORKDIR /workspace
COPY ray_init.sh /workspace/ray_init.sh

# Remove the apt package index in the same layer so it is not shipped in the
# image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends pciutils && \
    rm -rf /var/lib/apt/lists/*

COPY requirements.txt /workspace/requirements.txt
RUN pip install --no-cache-dir --require-hashes -r requirements.txt

# The checkout stays best-effort (the default "latest" is an image tag, not a
# git ref), but a failed checkout is now reported instead of being hidden, so
# the build log shows which revision ended up in the image.
# NOTE(review): the final `mv` presumably renames the inner package directory
# so the clone cannot shadow the installed vllm package -- confirm.
RUN echo "Cloning vLLM version: ${VLLM_VERSION}" && \
    git clone https://github.com/vllm-project/vllm.git && \
    cd vllm && \
    (git checkout "$VLLM_VERSION" || \
      echo "WARNING: could not check out '${VLLM_VERSION}'; keeping the default branch" >&2) && \
    mv vllm vllm_1

ENTRYPOINT [ "/bin/bash" ]
-------------------------------------------------------------------------------- /src/launchers/dynamo-sglang-launcher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2025 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in
compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.

# Starts the Dynamo SGLang server for the model named by $MODEL_NAME.
#
# Environment:
#   MODEL_NAME  required; passed to the server as --model.
# Arguments:
#   Every positional argument is forwarded unchanged to
#   `python3 -m dynamo.sglang` after --model.
# Exit status:
#   1 if MODEL_NAME is unset or empty; otherwise, because of `set -e`, the
#   status of the first failing command.

# -e: exit on the first failing command. -u: unset variables are errors.
# -x: trace every command before running it.
set -eux

echo "Dynamo SGLang launcher starting"
# "$*" keeps the message a single word; "$@" inside a longer string would be
# split into several words (ShellCheck SC2145). The printed text is the same.
echo "Arguments received: $*"

# MODEL_NAME should be passed as an environment variable from deployment.
# The ":-" default is needed because of `set -u`: a plain "$MODEL_NAME" would
# abort with "unbound variable" before the explanatory message is printed.
if [ -z "${MODEL_NAME:-}" ]; then
  echo "Error: MODEL_NAME environment variable is not set." >&2
  exit 1
fi
echo "Using MODEL_NAME: $MODEL_NAME"

# Launch the Dynamo SGLang server
echo "Launching Dynamo SGLang server with model: $MODEL_NAME"
python3 -m dynamo.sglang \
  --model "$MODEL_NAME" \
  "$@"

# With `set -e` this line is reached only after python3 exits with status 0.
echo "Dynamo SGLang server command finished."
-------------------------------------------------------------------------------- /src/launchers/sglang-launcher.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.

#!/bin/bash
# NOTE(review): this shebang follows the license header instead of being the
# first line of the file, so the kernel does not honour it. Move it to line 1
# if the script is ever executed directly rather than via `bash <script>`.

# Starts the SGLang server for the model named by $MODEL_NAME.
#
# Environment:
#   MODEL_NAME  required; passed to the server as --model.
#   HF_HOME     exported as /ssd by this script.
# Arguments:
#   Every positional argument is forwarded unchanged to
#   `python3 -m sglang.launch_server` after --model.
# Exit status:
#   1 if MODEL_NAME is unset or empty; otherwise, because of `set -e`, the
#   status of the first failing command.

# -e: exit on the first failing command. -u: unset variables are errors.
# -x: trace every command before running it.
set -eux

echo "SGLang server arguments received:"
# "$*" keeps the message a single word; "$@" inside a longer string would be
# split into several words (ShellCheck SC2145). The printed text is the same.
echo " $*"
echo ""

echo "Launching SGLang server"

export HF_HOME=/ssd

# MODEL_NAME should be passed as an environment variable from deployment.
# The ":-" default is needed because of `set -u`: a plain "$MODEL_NAME" would
# abort with "unbound variable" before the explanatory message is printed.
if [ -z "${MODEL_NAME:-}" ]; then
  echo "Error: MODEL_NAME environment variable is not set." >&2
  exit 1
fi
echo "Using MODEL_NAME: $MODEL_NAME"

# Launch the server
python3 -m sglang.launch_server \
  --model "$MODEL_NAME" \
  "$@"

# With `set -e` this line is reached only after python3 exits with status 0.
echo "Server bringup is complete. SGLang server command finished."
--------------------------------------------------------------------------------