├── .github
    ├── pull_request_template.md
    └── workflows
    │   ├── close.yaml
    │   ├── label-pr.yaml
    │   ├── obsolete.yaml
    │   └── stale.yaml
├── .gitignore
├── .gitmodules
├── CODEOWNERS
├── LICENSE
├── Makefile
├── README.md
├── applications
    ├── jupyter
    │   └── README.md
    └── rag
    │   └── README.md
├── benchmarks
    ├── 65k-cpu-nodes-simulated-ai-benchmark.md
    ├── README.md
    ├── accelerator-based-ai-benchmark.md
    ├── benchmark
    │   ├── README.md
    │   ├── dataset
    │   │   ├── README.md
    │   │   └── ShareGPT_v3_unflitered_cleaned_split
    │   │   │   ├── README.md
    │   │   │   ├── requirements.txt
    │   │   │   └── upload_sharegpt.py
    │   └── tools
    │   │   ├── CL2-benchmark
    │   │       ├── config.yaml
    │   │       ├── headless-service.yaml
    │   │       ├── modules
    │   │       │   ├── measurements.yaml
    │   │       │   ├── scheduling-throughput.yaml
    │   │       │   └── statefulsets.yaml
    │   │       ├── priorityclass.yaml
    │   │       └── statefulset.yaml
    │   │   ├── README.md
    │   │   ├── dlio
    │   │       ├── README.md
    │   │       ├── main.tf
    │   │       ├── modules
    │   │       │   ├── dlio
    │   │       │   │   ├── job.tf
    │   │       │   │   ├── podspec.tpl
    │   │       │   │   ├── variables.tf
    │   │       │   │   └── versions.tf
    │   │       │   ├── parallelstore_storage
    │   │       │   │   ├── dataloader_job.tpl
    │   │       │   │   ├── ps_pv.tpl
    │   │       │   │   ├── ps_pv_pvc.tf
    │   │       │   │   ├── ps_pvc.tpl
    │   │       │   │   ├── variables.tf
    │   │       │   │   └── versions.tf
    │   │       │   └── storage
    │   │       │   │   ├── gcs_pv_pvc.tf
    │   │       │   │   ├── pv_podspec.tpl
    │   │       │   │   ├── pvc_podspec.tpl
    │   │       │   │   ├── variables.tf
    │   │       │   │   └── versions.tf
    │   │       ├── parser.py
    │   │       ├── variables.tf
    │   │       └── versions.tf
    │   │   ├── locust-load-inference
    │   │       ├── README.md
    │   │       ├── build.tf
    │   │       ├── locust-custom-exporter
    │   │       │   ├── Dockerfile
    │   │       │   ├── Makefile
    │   │       │   ├── Makefile.common
    │   │       │   ├── README.md
    │   │       │   ├── go.mod
    │   │       │   ├── go.sum
    │   │       │   └── main.go
    │   │       ├── locust-docker
    │   │       │   ├── Dockerfile
    │   │       │   └── locust-tasks
    │   │       │   │   ├── custom_metric_aggregator.py
    │   │       │   │   ├── load_data.py
    │   │       │   │   ├── requirements.txt
    │   │       │   │   ├── run.sh
    │   │       │   │   └── tasks.py
    │   │       ├── locust-run.tf
    │   │       ├── locust-runner
    │   │       │   ├── Dockerfile
    │   │       │   ├── app
    │   │       │   │   ├── __init__.py
    │   │       │   │   ├── data_model.py
    │   │       │   │   └── main.py
    │   │       │   ├── metrics.yaml
    │   │       │   └── requirements.txt
    │   │       ├── main.tf
    │   │       ├── manifest-templates
    │   │       │   ├── locust-master-controller.yaml.tpl
    │   │       │   ├── locust-master-service.yaml.tpl
    │   │       │   ├── locust-worker-controller.yaml.tpl
    │   │       │   └── pod-monitoring.yaml.tpl
    │   │       ├── providers.tf
    │   │       ├── runner-manifest-template
    │   │       │   ├── locust-runner-service.yaml.tpl
    │   │       │   └── locust-runner.yaml.tftpl
    │   │       ├── sample-dashboards
    │   │       │   └── tgi-dashboard.yaml
    │   │       ├── sample-tfvars
    │   │       │   ├── jetstream-sample.tfvars
    │   │       │   └── tgi-sample.tfvars
    │   │       └── variables.tf
    │   │   ├── model-load-benchmark
    │   │       ├── README.md
    │   │       ├── base-config.yaml
    │   │       ├── benchmarker.ini
    │   │       ├── config
    │   │       │   ├── config.go
    │   │       │   ├── config_test.go
    │   │       │   └── utils.go
    │   │       ├── deployment
    │   │       │   ├── consts.go
    │   │       │   └── deployment.go
    │   │       ├── example-pod.yaml
    │   │       ├── go.mod
    │   │       ├── go.sum
    │   │       ├── k8sclient
    │   │       │   └── k8sclient.go
    │   │       ├── main.go
    │   │       ├── plot.py
    │   │       ├── rbac.yaml
    │   │       ├── requirements.txt
    │   │       ├── results
    │   │       │   ├── case_0.yaml
    │   │       │   ├── case_1.yaml
    │   │       │   ├── case_10.yaml
    │   │       │   ├── case_11.yaml
    │   │       │   ├── case_12.yaml
    │   │       │   ├── case_13.yaml
    │   │       │   ├── case_14.yaml
    │   │       │   ├── case_15.yaml
    │   │       │   ├── case_16.yaml
    │   │       │   ├── case_17.yaml
    │   │       │   ├── case_18.yaml
    │   │       │   ├── case_19.yaml
    │   │       │   ├── case_2.yaml
    │   │       │   ├── case_20.yaml
    │   │       │   ├── case_21.yaml
    │   │       │   ├── case_22.yaml
    │   │       │   ├── case_23.yaml
    │   │       │   ├── case_3.yaml
    │   │       │   ├── case_4.yaml
    │   │       │   ├── case_5.yaml
    │   │       │   ├── case_6.yaml
    │   │       │   ├── case_7.yaml
    │   │       │   ├── case_8.yaml
    │   │       │   ├── case_9.yaml
    │   │       │   ├── elapsed_time_vs_cpu_request.png
    │   │       │   ├── elapsed_time_vs_download_chunk_size_mb.png
    │   │       │   ├── elapsed_time_vs_ephemeral_storage_request.png
    │   │       │   ├── elapsed_time_vs_max_parallel_downloads.png
    │   │       │   ├── elapsed_time_vs_memory_request.png
    │   │       │   └── elapsed_time_vs_parallel_downloads_per_file.png
    │   │       ├── runner
    │   │       │   └── runner.go
    │   │       ├── suite-generator
    │   │       │   ├── generator.go
    │   │       │   └── generator_test.go
    │   │       └── volumeAttributes.yaml
    │   │   └── profile-generator
    │   │       ├── README.md
    │   │       ├── build.tf
    │   │       ├── container
    │   │           ├── Dockerfile
    │   │           ├── benchmark_serving.py
    │   │           ├── latency_throughput_curve.sh
    │   │           └── requirements.txt
    │   │       ├── main.tf
    │   │       ├── modules
    │   │           └── latency-profile
    │   │           │   ├── main.tf
    │   │           │   ├── manifest-templates
    │   │           │       ├── latency-profile-generator-podmonitoring.yaml.tpl
    │   │           │       └── latency-profile-generator.yaml.tpl
    │   │           │   ├── sample.tfvars
    │   │           │   └── variables.tf
    │   │       ├── sample.tfvars
    │   │       └── variables.tf
    ├── inference-server
    │   ├── README.md
    │   ├── jetstream
    │   │   ├── README.md
    │   │   ├── jetstream.yaml
    │   │   └── model-conversion
    │   │   │   └── kaggle_converter.yaml
    │   ├── templates
    │   │   └── secret-templates
    │   │   │   └── secret-provider.tftpl
    │   ├── text-generation-inference
    │   │   ├── README.md
    │   │   ├── autoscaling.md
    │   │   ├── hpa-templates
    │   │   │   ├── dcgm-podmonitoring.yaml.tftpl
    │   │   │   ├── hpa.cpu.yaml.tftpl
    │   │   │   └── hpa.tgi.custom_metric.yaml.tftpl
    │   │   ├── main.tf
    │   │   ├── manifest-templates
    │   │   │   ├── text-generation-inference-svc.tftpl
    │   │   │   └── text-generation-inference.tftpl
    │   │   ├── monitoring-templates
    │   │   │   └── tgi-podmonitoring.yaml.tftpl
    │   │   ├── providers.tf
    │   │   ├── sample-terraform.tfvars
    │   │   └── variables.tf
    │   ├── triton
    │   │   ├── README.md
    │   │   ├── main.tf
    │   │   ├── manifest-templates
    │   │   │   ├── triton-tensorrtllm-inference-docker.tftpl
    │   │   │   └── triton-tensorrtllm-inference-gs.tftpl
    │   │   ├── providers.tf
    │   │   ├── sample-terraform.tfvars
    │   │   └── variables.tf
    │   └── vllm
    │   │   ├── README.md
    │   │   ├── hpa-templates
    │   │       └── hpa.vllm.custom_metric.yaml.tftpl
    │   │   ├── main.tf
    │   │   ├── manifest-templates
    │   │       ├── vllm-service.tftpl
    │   │       └── vllm.tftpl
    │   │   ├── monitoring-templates
    │   │       └── vllm-podmonitoring.yaml.tftpl
    │   │   ├── providers.tf
    │   │   ├── sample-terraform.tfvars
    │   │   └── variables.tf
    ├── infra
    │   ├── 65k-cpu-cluster
    │   │   ├── main.tf
    │   │   ├── provider.tf
    │   │   ├── sample-tfvars
    │   │   │   └── 65k-sample.tfvars
    │   │   └── variables.tf
    │   └── accelerator-cluster
    │   │   ├── README.md
    │   │   ├── stage-1
    │   │       ├── README.md
    │   │       ├── main.tf
    │   │       ├── modules
    │   │       │   └── gke-infra
    │   │       │   │   ├── README.md
    │   │       │   │   ├── cluster.tf
    │   │       │   │   ├── filestore.tf
    │   │       │   │   ├── main.tf
    │   │       │   │   ├── outputs.tf
    │   │       │   │   └── variables.tf
    │   │       ├── outputs.tf
    │   │       ├── sample-tfvars
    │   │       │   ├── gpu-sample.tfvars
    │   │       │   └── jetstream-sample.tfvars
    │   │       └── variables.tf
    │   │   └── stage-2
    │   │       ├── README.md
    │   │       ├── main.tf
    │   │       ├── modules
    │   │           └── gke-setup
    │   │           │   ├── main.tf
    │   │           │   ├── modules
    │   │           │       ├── gcs-fuse
    │   │           │       │   ├── main.tf
    │   │           │       │   ├── outputs.tf
    │   │           │       │   └── variables.tf
    │   │           │       ├── nvidia-dcgm
    │   │           │       │   ├── main.tf
    │   │           │       │   ├── manifest-templates
    │   │           │       │   │   ├── 01-ds-dcgm.yaml
    │   │           │       │   │   ├── 02-ds-exporter.yaml
    │   │           │       │   │   ├── 03-cm-dcgm.yaml
    │   │           │       │   │   └── pod-monitoring.yaml
    │   │           │       │   └── variables.tf
    │   │           │       ├── output-benchmark
    │   │           │       │   ├── main.tf
    │   │           │       │   ├── outputs.tf
    │   │           │       │   └── variables.tf
    │   │           │       ├── secret-manager
    │   │           │       │   ├── csi-driver-gcp-plugin
    │   │           │       │   │   └── provider-gcp-plugin.yaml
    │   │           │       │   ├── csi-driver
    │   │           │       │   │   ├── csidriver.yaml
    │   │           │       │   │   ├── rbac-secretproviderclass.yaml
    │   │           │       │   │   ├── rbac-secretprovidersyncing.yaml
    │   │           │       │   │   ├── secrets-store-csi-driver.yaml
    │   │           │       │   │   ├── secrets-store.csi.x-k8s.io_secretproviderclasses.yaml
    │   │           │       │   │   └── secrets-store.csi.x-k8s.io_secretproviderclasspodstatuses.yaml
    │   │           │       │   ├── main.tf
    │   │           │       │   ├── outputs.tf
    │   │           │       │   └── variables.tf
    │   │           │       └── workload-identity
    │   │           │       │   ├── gcp.tf
    │   │           │       │   ├── kubernetes.tf
    │   │           │       │   ├── outputs.tf
    │   │           │       │   └── variables.tf
    │   │           │   ├── outputs.tf
    │   │           │   ├── providers.tf
    │   │           │   └── variables.tf
    │   │       ├── outputs.tf
    │   │       ├── sample-tfvars
    │   │           ├── gpu-sample.tfvars
    │   │           └── jetstream-sample.tfvars
    │   │       └── variables.tf
    └── orchestration
    │   ├── README.md
    │   ├── config
    │       ├── stage-1.tfvars
    │       ├── stage-2.tfvars
    │       └── text-generation-inference.tfvars
    │   ├── templates
    │       ├── stage-2.auto.tfvars.tpl
    │       └── text-generation-inference.auto.tfvars.tpl
    │   ├── text-generation-inference-apply.sh
    │   └── text-generation-inference-destroy.sh
├── best-practices
    ├── README.md
    ├── gke-batch-refarch
    │   └── README.md
    ├── hotswap.md
    ├── ml-platform
    │   └── README.md
    └── startup-latency.md
├── charts
    ├── gmp-engine
    │   ├── Chart.yaml
    │   ├── charts
    │   │   └── gmp-frontend
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       ├── deployment.yaml
    │   │   │       └── service.yaml
    │   │   │   └── values.yaml
    │   ├── templates
    │   │   └── podmonitoring.yaml
    │   └── values.yaml
    ├── nvidia-dra-driver-gpu
    │   ├── .helmignore
    │   ├── Chart.yaml
    │   ├── LICENSE
    │   ├── NOTICE
    │   ├── crds
    │   │   └── resource.nvidia.com_computedomains.yaml
    │   ├── templates
    │   │   ├── _helpers.tpl
    │   │   ├── clusterrole.yaml
    │   │   ├── clusterrolebinding.yaml
    │   │   ├── controller.yaml
    │   │   ├── deviceclass-compute-domain-daemon.yaml
    │   │   ├── deviceclass-compute-domain-default-channel.yaml
    │   │   ├── deviceclass-gpu.yaml
    │   │   ├── deviceclass-mig.yaml
    │   │   ├── kubeletplugin.yaml
    │   │   ├── openshiftprivilegedrolebinging.yaml
    │   │   ├── serviceaccount.yaml
    │   │   ├── validatingadmissionpolicy.yaml
    │   │   ├── validatingadmissionpolicybinding.yaml
    │   │   └── validation.yaml
    │   └── values.yaml
    └── tpu-dra-driver
    │   ├── Chart.yaml
    │   ├── README.md
    │   ├── install-tpu-dra-driver.sh
    │   ├── templates
    │       ├── _helpers.tpl
    │       ├── clusterrole.yaml
    │       ├── clusterrolebinding.yaml
    │       ├── deviceclass.yaml
    │       ├── kubeletplugin.yaml
    │       ├── serviceaccount.yaml
    │       ├── validatingadmissionpolicy.yaml
    │       ├── validatingadmissionpolicybinding.yaml
    │       └── validation.yml
    │   └── values.yaml
├── cloudbuild_cleanup.yaml
├── contributing.md
├── gke-batch-refarch
    └── README.md
├── infrastructure
    ├── README.md
    ├── backend.tf
    ├── main.tf
    ├── outputs.tf
    ├── platform.tfvars
    ├── tfvars_examples
    │   ├── autopilot-gke-with-existing-network.platform.tfvars
    │   ├── autopilot-gke-with-new-network.platform.tfvars
    │   ├── platform.complete.tfvars
    │   ├── standard-gke-with-exisiting-network.platform.tfvars
    │   └── standard-gke-with-new-network.platform.tfvars
    ├── tfvars_tests
    │   └── standard-gke-public.platform.tfvars
    ├── variables.tf
    └── versions.tf
├── jupyter-on-gke
├── modules
    ├── cloudsql
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── custom-metrics-stackdriver-adapter
    │   ├── README.md
    │   ├── main.tf
    │   ├── templates
    │   │   ├── apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl
    │   │   ├── apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl
    │   │   ├── apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl
    │   │   ├── clusterrole_custom-metrics-resource-reader.yaml.tftpl
    │   │   ├── clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl
    │   │   ├── clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl
    │   │   ├── clusterrolebinding_external-metrics-reader.yaml.tftpl
    │   │   ├── deployment_custom-metrics-stackdriver-adapter.yaml.tftpl
    │   │   ├── rolebinding_custom-metrics-auth-reader.yaml.tftpl
    │   │   └── service_custom-metrics-stackdriver-adapter.yaml.tftpl
    │   └── variables.tf
    ├── gcp-network
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── gcs
    │   ├── README.md
    │   ├── main.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── gke-autopilot-private-cluster
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── gke-autopilot-public-cluster
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   └── variables.tf
    ├── gke-standard-private-cluster
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── gke-standard-public-cluster
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   └── variables.tf
    ├── iap
    │   ├── charts
    │   │   └── iap
    │   │   │   ├── Chart.yaml
    │   │   │   ├── templates
    │   │   │       ├── backend-config.yaml
    │   │   │       ├── iap-secret.yaml
    │   │   │       ├── managed-cert.yaml
    │   │   │       └── static-ingress.yaml
    │   │   │   └── values.yaml
    │   ├── iap.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── inference-service
    │   ├── README.md
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── jetstream-maxtext-deployment
    │   ├── README.md
    │   ├── main.tf
    │   ├── templates
    │   │   ├── custom-metrics-stackdriver-adapter
    │   │   │   └── hpa.jetstream.yaml.tftpl
    │   │   ├── deployment.yaml.tftpl
    │   │   ├── podmonitoring-tpu.yaml.tftpl
    │   │   ├── podmonitoring.yaml.tftpl
    │   │   ├── prometheus-adapter
    │   │   │   ├── hpa.jetstream.yaml.tftpl
    │   │   │   └── values.yaml.tftpl
    │   │   └── service.yaml.tftpl
    │   └── variables.tf
    ├── jupyter
    │   ├── authentication
    │   │   ├── README.MD
    │   │   ├── authenticator
    │   │   │   ├── gcpiapjwtauthenticator
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── gcpiapjwtauthenticator.py
    │   │   │   │   └── gcpiapjwtauthenticator_test.py
    │   │   │   └── setup.py
    │   │   └── docker_image
    │   │   │   ├── Dockerfile
    │   │   │   └── cloudbuild.yaml
    │   ├── images
    │   │   ├── IAP_screenshot.png
    │   │   ├── brand_screenshot.png
    │   │   ├── gcs_bucket.png
    │   │   ├── iap_enable_api_screenshot.png
    │   │   ├── image.png
    │   │   └── oauth_consent_screenshot.png
    │   ├── jupyter_config
    │   │   ├── config-selfauth-autopilot.yaml
    │   │   └── config-selfauth.yaml
    │   ├── jupyter_image
    │   │   └── notebook_image
    │   │   │   ├── Dockerfile
    │   │   │   ├── README.md
    │   │   │   ├── cloudbuild.yaml
    │   │   │   └── requirements.txt
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── tests
    │   │   ├── change_jupyter_config.py
    │   │   └── test_hub.py
    │   ├── variables.tf
    │   └── versions.tf
    ├── kuberay-cluster
    │   ├── kuberay_image
    │   │   ├── Dockerfile
    │   │   ├── cloudbuild.yaml
    │   │   └── requirements.txt
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── values.yaml
    │   ├── variables.tf
    │   └── versions.tf
    ├── kuberay-monitoring
    │   ├── gmpvalues.yaml
    │   ├── grafana
    │   │   └── values.yaml
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    ├── kubernetes-namespace
    │   ├── charts
    │   │   └── namespace
    │   │   │   └── Chart.yaml
    │   ├── main.tf
    │   ├── outputs.tf
    │   ├── variables.tf
    │   └── versions.tf
    └── prometheus-adapter
    │   ├── README.md
    │   ├── main.tf
    │   └── variables.tf
├── ray-on-gke
    ├── README.md
    ├── examples
    │   ├── notebooks
    │   │   ├── gpt-j-online.ipynb
    │   │   ├── jax-tpu.ipynb
    │   │   ├── ray-dist-mnist.ipynb
    │   │   ├── ray-fine-tune-hugging-face.ipynb
    │   │   ├── ray_basic.ipynb
    │   │   ├── ray_mnist.ipynb
    │   │   ├── raytrain-stablediffusion.ipynb
    │   │   ├── stable-diffusion-tpu.ipynb
    │   │   └── stable_diffusion.ipynb
    │   └── tfvars
    ├── guides
    │   ├── observability
    │   │   └── README.md
    │   └── raytrain-with-gcsfusecsi
    │   │   ├── README.md
    │   │   ├── images
    │   │       ├── ray-cluster-on-gke.png
    │   │       ├── ray-head-resources.png
    │   │       └── ray-worker-resources.png
    │   │   └── jupyter-spec.yaml
    └── tpu
    │   └── kuberay-tpu-webhook
    │       └── README.md
├── scripts
    ├── ci
    │   └── wait_for_pods.sh
    └── network-setup
    │   ├── v6e-increase-rmem.yaml
    │   └── v6e-network-optimization.yaml
├── security_test
    ├── README.md
    ├── allowlist
    │   └── category
    │   │   ├── cluster
    │   │       ├── continuous-image-puller
    │   │       │   ├── capabilities.json
    │   │       │   ├── distroless.json
    │   │       │   ├── imagedigest.json
    │   │       │   ├── imagefreshness.json
    │   │       │   ├── imagepath.json
    │   │       │   ├── readonlyrootfs.json
    │   │       │   ├── sbom.json
    │   │       │   └── seccompprofile.json
    │   │       ├── hub
    │   │       │   ├── capabilities.json
    │   │       │   ├── distroless.json
    │   │       │   ├── imagedigest.json
    │   │       │   ├── imagefreshness.json
    │   │       │   ├── imagepath.json
    │   │       │   ├── rbac.json
    │   │       │   ├── readonlyrootfs.json
    │   │       │   └── seccompprofile.json
    │   │       ├── kuberay-operator-leader-election
    │   │       │   └── rbac.json
    │   │       ├── kuberay-operator
    │   │       │   └── rbac.json
    │   │       ├── mistral-7b-instruct
    │   │       │   ├── allowprivilegeescalation.json
    │   │       │   ├── capabilities.json
    │   │       │   ├── distroless.json
    │   │       │   ├── imagedigest.json
    │   │       │   ├── imagefreshness.json
    │   │       │   ├── imagepath.json
    │   │       │   ├── readonlyrootfs.json
    │   │       │   ├── rootless.json
    │   │       │   ├── sbom.json
    │   │       │   └── seccompprofile.json
    │   │       ├── proxy
    │   │       │   ├── capabilities.json
    │   │       │   ├── distroless.json
    │   │       │   ├── imagedigest.json
    │   │       │   ├── imagefreshness.json
    │   │       │   ├── imagepath.json
    │   │       │   ├── readonlyrootfs.json
    │   │       │   ├── sbom.json
    │   │       │   └── seccompprofile.json
    │   │       ├── rag-frontend
    │   │       │   ├── allowprivilegeescalation.json
    │   │       │   ├── capabilities.json
    │   │       │   ├── distroless.json
    │   │       │   ├── imagedigest.json
    │   │       │   ├── imagefreshness.json
    │   │       │   ├── imagepath.json
    │   │       │   ├── readonlyrootfs.json
    │   │       │   ├── rootless.json
    │   │       │   ├── sbom.json
    │   │       │   └── seccompprofile.json
    │   │       ├── ray-cluster-kuberay
    │   │       │   └── rbac.json
    │   │       ├── rayjob-editor-role
    │   │       │   └── rbac.json
    │   │       ├── rayjob-viewer-role
    │   │       │   └── rbac.json
    │   │       ├── rayservice-editor-role
    │   │       │   └── rbac.json
    │   │       └── rayservice-viewer-role
    │   │       │   └── rbac.json
    │   │   └── helm
    │   │       ├── iap
    │   │           └── defaultnamespace.json
    │   │       └── kuberay-tpu-webhook
    │   │           ├── allowprivilegeescalation.json
    │   │           ├── capabilities.json
    │   │           ├── imagedigest.json
    │   │           ├── imagefreshness.json
    │   │           ├── imagepath.json
    │   │           ├── readonlyrootfs.json
    │   │           ├── rootless.json
    │   │           └── seccompprofile.json
    └── config.yaml
├── slurm-on-gke
    └── README.md
├── tools
    ├── README.md
    ├── dcgm-on-gke
    │   └── README.md
    ├── gke-disk-image-builder
    │   └── README.md
    └── saxml-on-gke
    │   └── README.md
├── tpu-provisioner
    ├── .dockerignore
    ├── .gitignore
    ├── Dockerfile
    ├── Makefile
    ├── PROJECT
    ├── README.md
    ├── admission_controller
    │   ├── .gitignore
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── __init__.py
    │   ├── admission_controller.py
    │   ├── certificates
    │   │   └── README.md
    │   ├── manifests
    │   │   └── manifest.yaml
    │   ├── requirements.txt
    │   ├── skaffold.yaml
    │   └── test
    │   │   ├── __init__.py
    │   │   ├── admission_controller_test.py
    │   │   └── e2e
    │   │       ├── manifests
    │   │           ├── test-disabled-provisioning.yaml
    │   │           ├── test-location-hint-no-reservation.yaml
    │   │           ├── test-location-hint-with-reservation.yaml
    │   │           └── test-nonjobset-job.yaml
    │   │       └── test.sh
    ├── cloudbuild.yaml
    ├── cmd
    │   └── main.go
    ├── config
    │   ├── default
    │   │   ├── kustomization.yaml
    │   │   ├── manager_auth_proxy_patch.yaml
    │   │   └── manager_config_patch.yaml
    │   ├── manager
    │   │   ├── configmap.yaml
    │   │   ├── kustomization.yaml
    │   │   └── manager.yaml
    │   ├── prometheus
    │   │   ├── kustomization.yaml
    │   │   └── monitor.yaml
    │   └── rbac
    │   │   ├── auth_proxy_client_clusterrole.yaml
    │   │   ├── auth_proxy_role.yaml
    │   │   ├── auth_proxy_role_binding.yaml
    │   │   ├── auth_proxy_service.yaml
    │   │   ├── kustomization.yaml
    │   │   ├── leader_election_role.yaml
    │   │   ├── leader_election_role_binding.yaml
    │   │   ├── role.yaml
    │   │   ├── role_binding.yaml
    │   │   └── service_account.yaml
    ├── docs
    │   ├── cleanup.excalidraw.png
    │   └── provisioning.excalidraw.png
    ├── examples
    │   └── jobset.yaml
    ├── go.mod
    ├── go.sum
    ├── internal
    │   ├── auth
    │   │   └── gcp
    │   │   │   ├── README.md
    │   │   │   ├── gcp.go
    │   │   │   └── gcp_test.go
    │   ├── cloud
    │   │   ├── common.go
    │   │   ├── gke.go
    │   │   ├── gke_context.go
    │   │   ├── gke_service.go
    │   │   ├── gke_test.go
    │   │   └── mock.go
    │   └── controller
    │   │   ├── creation_controller.go
    │   │   ├── deletion_controller.go
    │   │   ├── nodepool_garbage_collector.go
    │   │   └── pod_utils.go
    └── test
    │   ├── crds
    │       └── jobset-v0.5.0.yaml
    │   └── integration
    │       └── controller
    │           ├── creation_controller_test.go
    │           ├── deletion_controller_test.go
    │           ├── mock_provider.go
    │           └── suite_test.go
└── tutorials-and-examples
    ├── cloudshell-tutorial.md
    ├── flyte
        └── README.md
    ├── genAI-LLM
        ├── e2e-genai-langchain-app
        │   ├── README.md
        │   ├── backend_ip.png
        │   ├── e2e-genai-langchain.ipynb
        │   ├── frontend_app.png
        │   ├── frontend_ip.png
        │   ├── open_jupyter.png
        │   └── src
        │   │   ├── backend
        │   │       ├── Dockerfile
        │   │       ├── deploy.yaml
        │   │       ├── main.py
        │   │       ├── model.py
        │   │       ├── requirements.in
        │   │       └── requirements.txt
        │   │   └── frontend
        │   │       ├── .gitignore
        │   │       ├── Dockerfile
        │   │       ├── deploy.yaml
        │   │       ├── package-lock.json
        │   │       ├── package.json
        │   │       ├── src
        │   │           ├── index.html
        │   │           └── index.tsx
        │   │       ├── tsconfig.json
        │   │       └── webpack.config.js
        └── finetuning-gemma-2b-on-l4
        │   └── README.md
    ├── gpu-examples
        ├── a100-jax
        │   └── README.md
        ├── online-serving-single-gpu
        │   └── README.md
        └── training-single-gpu
        │   ├── README.md
        │   ├── data
        │       └── mnist_predict
        │       │   ├── 0.png
        │       │   ├── 1.png
        │       │   ├── 2.png
        │       │   ├── 3.png
        │       │   ├── 4.png
        │       │   ├── 5.png
        │       │   ├── 6.png
        │       │   ├── 7.png
        │       │   ├── 8.png
        │       │   └── 9.png
        │   └── src
        │       ├── gke-config
        │           ├── standard-tensorflow-bash.yaml
        │           ├── standard-tf-mnist-batch-predict.yaml
        │           └── standard-tf-mnist-train.yaml
        │       └── tensorflow-mnist-example
        │           ├── requirements.txt
        │           ├── tensorflow_mnist_batch_predict.py
        │           └── tensorflow_mnist_train_distributed.py
    ├── hf-tgi
        └── README.md
    ├── inference-servers
        ├── checkpoints
        │   └── README.md
        ├── jetstream
        │   ├── README.md
        │   ├── http-server
        │   │   ├── Dockerfile
        │   │   └── http_server.py
        │   ├── maxtext
        │   │   ├── maxengine-server
        │   │   │   ├── Dockerfile
        │   │   │   └── maxengine_server_entrypoint.sh
        │   │   └── single-host-inference
        │   │   │   ├── README.md
        │   │   │   ├── checkpoint-job.yaml
        │   │   │   ├── kubectl
        │   │   │       └── deployment.yaml
        │   │   │   └── terraform
        │   │   │       ├── main.tf
        │   │   │       ├── providers.tf
        │   │   │       ├── sample-terraform.tfvars
        │   │   │       ├── variables.tf
        │   │   │       ├── versions.tf
        │   │   │       └── versions_override.tf
        │   └── pytorch
        │   │   ├── jetstream-pytorch-server
        │   │       ├── Dockerfile
        │   │       └── jetstream_pytorch_server_entrypoint.sh
        │   │   └── single-host-inference
        │   │       ├── README.md
        │   │       ├── checkpoint-job.yaml
        │   │       ├── deployment.yaml
        │   │       ├── pd-deployment.yaml
        │   │       └── storage.yaml
        └── maxdiffusion
        │   └── README.md
    ├── kserve
        └── README.md
    ├── langchain-chatbot
        └── README.md
    ├── llamaindex
        └── rag
        │   └── README.md
    ├── metaflow
        └── README.md
    ├── mlflow
        └── finetune-gemma
        │   └── README.md
    ├── models-as-oci
        └── README.md
    ├── nvidia-bionemo
        └── README.md
    ├── nvidia-nim
        ├── README.md
        └── blueprints
        │   └── README.md
    ├── skypilot
        ├── README.md
        └── dws-and-kueue
        │   └── README.md
    ├── storage
        ├── hyperdisk-ml
        │   └── README.md
        └── parallelstore-backup-and-recovery
        │   ├── README.md
        │   ├── parallelstore-sa.yaml
        │   └── ps-to-gcs-backup.yaml
    ├── tpu-examples
        ├── single-host-inference
        │   ├── jax
        │   │   ├── bert
        │   │   │   ├── bert_request.py
        │   │   │   ├── export_bert_model.py
        │   │   │   ├── install-bert.yaml
        │   │   │   ├── loadbalancer.yaml
        │   │   │   └── serve-bert.yaml
        │   │   ├── requirements.txt
        │   │   └── stable-diffusion
        │   │   │   ├── README.md
        │   │   │   ├── app.py
        │   │   │   ├── export_stable_diffusion_model.py
        │   │   │   ├── install-stable-diffusion.yaml
        │   │   │   ├── loadbalancer.yaml
        │   │   │   ├── serve-stable-diffusion-tpu-v4.yaml
        │   │   │   ├── serve-stable-diffusion-v5e.yaml
        │   │   │   ├── serve-stable-diffusion.yaml
        │   │   │   └── stable_diffusion_request.py
        │   ├── pt
        │   │   └── densenet161
        │   │   │   ├── deployment.yml
        │   │   │   ├── loadbalancer.yml
        │   │   │   ├── model-archive.yml
        │   │   │   ├── request.py
        │   │   │   └── requirements.txt
        │   ├── pvc-pv.yaml
        │   └── tf
        │   │   └── resnet50
        │   │       ├── banana.jpeg
        │   │       ├── deployment.yml
        │   │       ├── export_resnet_model.py
        │   │       ├── loadbalancer.yml
        │   │       ├── model-conversion.yml
        │   │       ├── request.py
        │   │       └── requirements.txt
        └── training
        │   ├── diffusion
        │       └── Dockerfile
        │   ├── gpt
        │       ├── Dockerfile
        │       ├── fsdp_config.json
        │       └── my_config_2.json
        │   └── mnist-single-tpu
        │       ├── README.md
        │       ├── data
        │           └── mnist_predict
        │           │   ├── 0.png
        │           │   ├── 1.png
        │           │   ├── 2.png
        │           │   ├── 3.png
        │           │   ├── 4.png
        │           │   ├── 5.png
        │           │   ├── 6.png
        │           │   ├── 7.png
        │           │   ├── 8.png
        │           │   └── 9.png
        │       └── src
        │           ├── gke-config
        │               ├── standard-tensorflow-bash-v4.yaml
        │               ├── standard-tensorflow-bash-v5e.yaml
        │               ├── standard-tf-mnist-batch-predict-v4.yaml
        │               ├── standard-tf-mnist-batch-predict-v5e.yaml
        │               ├── standard-tf-mnist-train-v4.yaml
        │               └── standard-tf-mnist-train-v5e.yaml
        │           └── tensorflow-mnist-example
        │               ├── requirements.txt
        │               ├── tensorflow_mnist_batch_predict.py
        │               └── tensorflow_mnist_train_distributed.py
    ├── vector-databases
        └── readme.md
    └── workflow-orchestration
        ├── dws-examples
            └── README.md
        ├── dws-multiclusters-example
            └── README.md
        ├── indexed-job
            └── README.md
        └── jobset
            └── pytorch
                └── README.md


/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | <!--  Thanks for sending a pull request!  Here are some tips for you:
 2 | 
 3 | 1. If this is your first time, please read our contributor guidelines: https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/contributing.md
 4 | 2. Please label this pull request according to what type of issue you are addressing.
 5 | 3. Ensure you have added or run the appropriate tests for your PR.
 6 | -->
 7 | 
 8 | **What type of PR is this?**
 9 | > Uncomment only one ` /kind <>` line, press Enter to move it onto its own line, and remove the leading whitespace from that line:
10 | >
11 | > /kind breaking
12 | > /kind bug
13 | > /kind cleanup
14 | > /kind documentation
15 | > /kind enhancement
16 | 
17 | **What this PR does / Why we need it**:
18 | 
19 | **Which issue(s) this PR fixes**:
20 | <!--
21 | Automatically closes the linked issue when the PR is merged.
22 | Usage: `Closes #<issue number>`, or `Closes (paste link of issue)`.
23 | -->
24 | Closes #
25 | 
26 | **Special notes for your reviewer**:
27 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | ## Archives
16 | **/*.tar
17 | **/*.tar.gz
18 | **/*.zip
19 | 
20 | # Directories
21 | bin/
22 | deploy/
23 | 
24 | # IDEs
25 | .idea/
26 | .vscode/
27 | 
28 | # Python
29 | __pycache__/
30 | 
31 | # Terraform
32 | default.tfstate
33 | default.tfstate.backup
34 | .terraform*
35 | terraform.tfstate*
36 | terraform.tfvars
37 | tfplan
38 | .vscode/
39 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "best-practices/accelerated-platforms"]
2 | 	path = best-practices/accelerated-platforms
3 | 	url = https://github.com/GoogleCloudPlatform/accelerated-platforms.git
4 | 


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # @name owns any files in the /benchmarks/
2 | # directory at the root of the repository and any of its
3 | # subdirectories.
4 | /benchmarks/  @achandrasekar @ahg-g @annapendleton @Bslabe123 @jjk-g
5 | /tpu-provisioner/ @echiugoog @nstogner
6 | 


--------------------------------------------------------------------------------
/applications/jupyter/README.md:
--------------------------------------------------------------------------------
1 | # JupyterHub on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the JupyterHub on GKE example have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. For more information, please refer to the [JupyterHub on GKE](https://gke-ai-labs.dev/docs/blueprints/jupyter-on-gke/).


--------------------------------------------------------------------------------
/applications/rag/README.md:
--------------------------------------------------------------------------------
1 | # RAG on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the RAG on GKE example have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. For more information, please refer to the [RAG on GKE](https://gke-ai-labs.dev/docs/blueprints/rag-on-gke/).


--------------------------------------------------------------------------------
/benchmarks/benchmark/README.md:
--------------------------------------------------------------------------------
 1 | >[!WARNING]
 2 | >This guide and associated code are **deprecated** and no longer maintained.
 3 | >
 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
 5 | 
 6 | This directory contains the benchmark datasets and tools
 7 | used to run benchmarks.
 8 | 
 9 | To prepare benchmark datasets, see the `dataset` directory.
10 | 
11 | To run a benchmarking tool, see the available tools in the `tools` directory.


--------------------------------------------------------------------------------
/benchmarks/benchmark/dataset/README.md:
--------------------------------------------------------------------------------
1 | >[!WARNING]
2 | >This guide and associated code are **deprecated** and no longer maintained.
3 | >
4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
5 | 
6 | This directory contains datasets for various models that are used in
7 | the benchmark runs.


--------------------------------------------------------------------------------
/benchmarks/benchmark/dataset/ShareGPT_v3_unflitered_cleaned_split/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | google-cloud-storage
3 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/CL2-benchmark/headless-service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: headless-service
 5 | spec:
 6 |   clusterIP: None
 7 |   selector:
 8 |     svc-headless: headless
 9 |   ports:
10 |   - port: 80
11 |     targetPort: 80
12 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/CL2-benchmark/modules/scheduling-throughput.yaml:
--------------------------------------------------------------------------------
 1 | {{$query := `sum(irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[1m]))[%v:5s]`}}
 2 | 
 3 | steps:
 4 | - name: "{{.action}}ing scheduling throughput measurement"
 5 |   measurements:
 6 |   - Identifier: SchedulingThroughput_{{.basename}}
 7 |     Method: GenericPrometheusQuery
 8 |     Params:
 9 |       action: {{.action}}
10 |       enableViolations: true
11 |       metricName: {{.basename}}
12 |       metricVersion: v1
13 |       unit: 1/s
14 |       queries:
15 |       - name: Max
16 |         query: max_over_time({{$query}})
17 |       - name: Avg
18 |         query: avg_over_time({{$query}})
19 |       - name: Perc99
20 |         query: quantile_over_time(0.99, {{$query}})
21 |         {{if .threshold}}
22 |         threshold: {{.threshold}}
23 |         lowerBound: true
24 |         {{end}}
25 |       - name: Perc90
26 |         query: quantile_over_time(0.90, {{$query}})
27 |       - name: Perc50
28 |         query: quantile_over_time(0.50, {{$query}})
29 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/CL2-benchmark/priorityclass.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: scheduling.k8s.io/v1
2 | kind: PriorityClass
3 | metadata:
4 |   name: {{.Name}}
5 | value: {{.Value}}
6 | globalDefault: false
7 | description: "Priority class for user workloads"
8 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/README.md:
--------------------------------------------------------------------------------
1 | >[!WARNING]
2 | >This guide and associated code are **deprecated** and no longer maintained.
3 | >
4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
5 | 
6 | This directory contains the benchmark tools for measuring performance across multiple inferencing frameworks.


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/dlio/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     helm = {
18 |       source  = "hashicorp/helm"
19 |       version = "~> 2.8.0"
20 |     }
21 |     kubernetes = {
22 |       source  = "hashicorp/kubernetes"
23 |       version = "2.18.1"
24 |     }
25 |     kubectl = {
26 |       source  = "alekc/kubectl"
27 |       version = "2.0.1"
28 |     }
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv.tpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: ${pv_name}
 5 | spec:
 6 |   storageClassName: ""
 7 |   capacity:
 8 |     storage: 12Ti
 9 |   accessModes:
10 |     - ReadWriteMany
11 |   persistentVolumeReclaimPolicy: Retain
12 |   volumeMode: Filesystem
13 |   csi:
14 |     driver: parallelstore.csi.storage.gke.io
15 |     volumeHandle: ${project}/${ps_location}/${ps_instance_name}/default-pool/default-container
16 |     volumeAttributes:
17 |       ip: "${ps_ip_address_1}, ${ps_ip_address_2}, ${ps_ip_address_3}"
18 |       network: ${ps_network_name}


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pvc.tpl:
--------------------------------------------------------------------------------
 1 | kind: PersistentVolumeClaim
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: ${pvc_name}
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   accessModes:
 8 |     - ReadWriteMany
 9 |   storageClassName: ${storageclass}
10 |   volumeName: ${pv_name}
11 |   resources:
12 |     requests:
13 |       storage: 12000Gi


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     helm = {
18 |       source  = "hashicorp/helm"
19 |       version = "~> 2.8.0"
20 |     }
21 |     kubernetes = {
22 |       source  = "hashicorp/kubernetes"
23 |       version = "2.18.1"
24 |     }
25 |     kubectl = {
26 |       source  = "alekc/kubectl"
27 |       version = "2.0.1"
28 |     }
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/storage/pv_podspec.tpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: ${pv_name}
 5 | spec:
 6 |   accessModes:
 7 |   - ReadWriteMany
 8 |   capacity:
 9 |     # This is a placeholder and can be any number, but it must match the PVC's resources.requests.storage field.
10 |     storage: 1Gi
11 |   persistentVolumeReclaimPolicy: Retain
12 |   storageClassName: dummy-storage-class
13 |   mountOptions:
14 |   - stat-cache-capacity=${gcsfuse_stat_cache_capacity}
15 |   - stat-cache-ttl=${gcsfuse_stat_cache_ttl}
16 |   - type-cache-ttl=${gcsfuse_type_cache_ttl}
17 |   claimRef:
18 |     namespace: ${namespace}
19 |     name: ${pvc_name}
20 |   csi:
21 |     driver: gcsfuse.csi.storage.gke.io
22 |     volumeHandle: ${gcs_bucket} # unique bucket name


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/storage/pvc_podspec.tpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolumeClaim
 3 | metadata:
 4 |   name: ${pvc_name}
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   accessModes:
 8 |   - ReadWriteMany
 9 |   resources:
10 |     requests:
11 |       storage: 1Gi
12 |   volumeName: ${pv_name}
13 |   storageClassName: dummy-storage-class


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/modules/storage/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     helm = {
18 |       source  = "hashicorp/helm"
19 |       version = "~> 2.8.0"
20 |     }
21 |     kubernetes = {
22 |       source  = "hashicorp/kubernetes"
23 |       version = "2.18.1"
24 |     }
25 |     kubectl = {
26 |       source  = "alekc/kubectl"
27 |       version = "2.0.1"
28 |     }
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/dlio/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     helm = {
21 |       source  = "hashicorp/helm"
22 |       version = "~> 2.8.0"
23 |     }
24 |     kubernetes = {
25 |       source  = "hashicorp/kubernetes"
26 |       version = "2.18.1"
27 |     }
28 |     kubectl = {
29 |       source  = "alekc/kubectl"
30 |       version = "2.0.1"
31 |     }
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/build.tf:
--------------------------------------------------------------------------------
 1 | resource "null_resource" "build_and_push_image" {
 2 |   # Runs `gcloud builds submit` on the locust-docker directory and tags the result as locust-tasks:latest under var.artifact_registry.
 3 |   depends_on = [resource.google_project_service.cloudbuild] # wait for the cloudbuild project-service resource (declared outside this file)
 4 |   provisioner "local-exec" {
 5 |     working_dir = path.module # run from this module's directory so the relative source path resolves
 6 |     command     = "gcloud builds submit --tag ${var.artifact_registry}/locust-tasks:latest locust-docker"
 7 |   }
 8 | }
 9 | 
10 | resource "null_resource" "build_and_push_runner_image" {
11 |   # Same build step for the locust-runner directory. NOTE(review): unlike build_and_push_image, no depends_on on the cloudbuild service is declared here; confirm this is intentional.
12 |   provisioner "local-exec" {
13 |     working_dir = path.module
14 |     command     = "gcloud builds submit --tag ${var.artifact_registry}/locust-runner:latest locust-runner"
15 |   }
16 | }
17 | 
18 | resource "null_resource" "build_and_push_exporter_image" {
19 |   # Same build step for the locust-custom-exporter directory. NOTE(review): no depends_on on the cloudbuild service is declared here either; confirm this is intentional.
20 |   provisioner "local-exec" {
21 |     working_dir = path.module
22 |     command     = "gcloud builds submit --tag ${var.artifact_registry}/locust-custom-exporter:latest locust-custom-exporter"
23 |   }
24 | }


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM golang:1.20
 2 | 
 3 | # Set destination for COPY
 4 | WORKDIR /app
 5 | 
 6 | # Download Go modules
 7 | COPY go.mod go.sum ./
 8 | RUN go mod download
 9 | 
10 | # Copy the source code. Note the slash at the end, as explained in
11 | # https://docs.docker.com/reference/dockerfile/#copy
12 | COPY *.go ./
13 | 
14 | # Build
15 | RUN CGO_ENABLED=0 GOOS=linux go build -o /locust_exporter
16 | 
17 | EXPOSE 8080
18 | 
19 | # Run
20 | CMD ["/locust_exporter"]
21 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 The Prometheus Authors
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | # http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | include Makefile.common
15 | 
16 | ## This is a copy!
17 | ## https://github.com/prometheus/prometheus/blob/main/Makefile.common
18 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/ContainerSolutions/locust_exporter
 2 | 
 3 | go 1.12
 4 | 
 5 | require (
 6 | 	github.com/prometheus/client_golang v1.11.1
 7 | 	github.com/prometheus/common v0.26.0
 8 | 	gopkg.in/alecthomas/kingpin.v2 v2.2.6
 9 | )
10 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt:
--------------------------------------------------------------------------------
 1 | Brotli==1.0.9
 2 | certifi==2023.7.22
 3 | chardet==4.0.0
 4 | charset-normalizer==2.0.12
 5 | click==8.1.2
 6 | ConfigArgParse==1.5.5
 7 | Flask==2.2.5
 8 | Flask-BasicAuth==0.2.0
 9 | Flask-Cors==4.0.2
10 | gevent==23.9.0
11 | geventhttpclient==2.0.11
12 | greenlet==2.0.0
13 | google-cloud-storage
14 | idna==3.7
15 | importlib-metadata==4.11.3
16 | itsdangerous==2.1.2
17 | Jinja2==3.1.6
18 | locust==2.20.1
19 | MarkupSafe==2.1.1
20 | msgpack==1.0.3
21 | msgpack-python==0.5.6
22 | psutil==5.9.1
23 | pyzmq==25.0.0
24 | requests==2.31.0
25 | roundrobin==0.0.2
26 | six==1.16.0
27 | transformers==4.48.0
28 | typing_extensions==4.1.1
29 | urllib3==1.26.18
30 | Werkzeug==3.0.3
31 | zipp==3.8.0
32 | zope.event==4.5.0
33 | zope.interface==5.4.0
34 | TensorFlow >= 2.0
35 | google-jetstream==0.2.0
36 | grpcio==1.62.2
37 | grpc-interceptor==0.15.4
38 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-runner/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Google Inc. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | FROM python:3.9
16 |  
17 | WORKDIR /code
18 |  
19 | COPY ./requirements.txt /code/requirements.txt
20 | COPY ./metrics.yaml /code/metrics.yaml
21 |  
22 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
23 |  
24 | COPY ./app /code/app
25 |  
26 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-runner/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/locust-load-inference/locust-runner/app/__init__.py


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/locust-runner/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi
2 | requests
3 | uvicorn
4 | google-cloud-monitoring
5 | google-cloud-storage
6 | pathlib
7 | PyYAML


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/pod-monitoring.yaml.tpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring
 3 | metadata:
 4 |   name: locust-scrapper
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   selector:
 8 |     matchLabels:
 9 |       app: locust-master
10 |   endpoints:
11 |   - port: 8080
12 |     interval: 5s
13 |   - port: 9646
14 |     interval: 5s


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/runner-manifest-template/locust-runner-service.yaml.tpl:
--------------------------------------------------------------------------------
 1 | kind: Service
 2 | apiVersion: v1
 3 | metadata:
 4 |   name: locust-runner-api
 5 |   namespace: ${namespace}
 6 |   annotations:
 7 |     networking.gke.io/load-balancer-type: "External"
 8 |   labels:
 9 |     app: locust-runner
10 | spec:
11 |   ports:
12 |     - port: 8000
13 |       targetPort: 8000
14 |       protocol: TCP
15 | %{ for runner_endpoint_ip in runner_endpoint_ip_list ~}
16 |   loadBalancerIP: ${runner_endpoint_ip}
17 | %{ endfor ~}
18 |   selector:
19 |     app: locust-runner
20 |   type: LoadBalancer
21 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/locust-load-inference/runner-manifest-template/locust-runner.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: locust-runner
 5 |   namespace: ${namespace}
 6 |   labels:
 7 |     app: locust-runner
 8 |     examples.ai.gke.io/source: ai-on-gke-benchmarks
 9 | spec:
10 |   serviceAccountName: ${ksa}
11 |   containers:
12 |     - name: locust-runner
13 |       image: ${artifact_registry}/locust-runner:latest
14 |       env:
15 |       - name: PROJECT_ID
16 |         value: ${project_id}
17 |       - name: BUCKET
18 |         value: ${bucket}
19 |       - name: DURATION
20 |         value: ${duration}
21 |       - name: USERS
22 |         value: ${users}
23 |       - name: RATE
24 |         value: ${rate}
25 |       - name: NAMESPACE
26 |         value: ${namespace}


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/benchmarker.ini:
--------------------------------------------------------------------------------
1 | [default]
2 | MODEL_LOAD_BENCHMARK_CONFIG = base-config.yaml
3 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/config/utils.go:
--------------------------------------------------------------------------------
 1 | package config
 2 | 
 3 | import (
 4 | 	"strconv"
 5 | 	"strings"
 6 | )
 7 | 
 8 | // parseValueUnit splits value into a leading integer and the trailing non-digit unit suffix, e.g. "10Gi" yields (10, "Gi").
 9 | func parseValueUnit(value string) (int, string) {
10 | 	numStr := strings.TrimRightFunc(value, func(r rune) bool { // strip every trailing rune the predicate accepts
11 | 		return r < '0' || r > '9' // true for any rune that is not an ASCII digit
12 | 	})
13 | 	unit := strings.TrimPrefix(value, numStr) // whatever followed numStr in value is returned as the unit
14 | 	num, _ := strconv.Atoi(numStr) // NOTE(review): the Atoi error is discarded; input with no leading integer (e.g. "Gi", where numStr is "") silently yields num == 0
15 | 	return num, unit
16 | }
17 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/rbac.yaml:
--------------------------------------------------------------------------------
 1 | kind: ClusterRoleBinding
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | metadata:
 4 |   name: pod-creator-binding
 5 | subjects:
 6 | - kind: User
 7 |   name: gke_kunjanp-gke-dev-2_us-west4_gpu-dev-cluster 
 8 |   apiGroup: rbac.authorization.k8s.io
 9 | roleRef:
10 |   kind: ClusterRole
11 |   name: cluster-admin  
12 |   apiGroup: rbac.authorization.k8s.io
13 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | pyyaml
3 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_cpu_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_cpu_request.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_download_chunk_size_mb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_download_chunk_size_mb.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_ephemeral_storage_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_ephemeral_storage_request.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_max_parallel_downloads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_max_parallel_downloads.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_memory_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_memory_request.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_parallel_downloads_per_file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_parallel_downloads_per_file.png


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/model-load-benchmark/volumeAttributes.yaml:
--------------------------------------------------------------------------------
1 | volumeAttributes:
2 |   bucketName: BUCKET_NAME
3 |   mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:4,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:3"
4 |   fileCacheCapacity: "-1"
5 |   fileCacheForRangeRead: "true"
6 |   metadataStatCacheCapacity: "-1"
7 |   metadataTypeCacheCapacity: "-1"
8 |   metadataCacheTTLSeconds: "600"
9 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/profile-generator/build.tf:
--------------------------------------------------------------------------------
1 | resource "null_resource" "build_and_push_image" {
2 |   count      = var.build_latency_profile_generator_image ? 1 : 0
3 |   depends_on = [resource.google_project_service.cloudbuild]
4 |   provisioner "local-exec" {
5 |     working_dir = path.module
6 |     command     = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container"
7 |   }
8 | }


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/profile-generator/container/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9.20-slim-bookworm AS dev
 2 | 
 3 | # OS-level tooling; the apt package index is removed in the same layer so it
 4 | # does not end up in the image.
 5 | RUN apt-get update -y \
 6 |     && apt-get install -y python3-pip git vim curl wget \
 7 |     && rm -rf /var/lib/apt/lists/*
 8 | RUN pip3 install --upgrade pip
 9 | WORKDIR /workspace
10 | 
11 | # install build and runtime dependencies
12 | COPY requirements.txt requirements.txt
13 | RUN pip install -r requirements.txt
14 | 
15 | RUN pip install -U "huggingface_hub[cli]"
16 | 
17 | # Download the ShareGPT conversation dataset into the image at build time.
18 | RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
19 | 
20 | COPY benchmark_serving.py benchmark_serving.py
21 | COPY latency_throughput_curve.sh latency_throughput_curve.sh
22 | 
23 | # Both entry points are marked executable in a single layer.
24 | RUN chmod +x latency_throughput_curve.sh benchmark_serving.py


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/profile-generator/container/requirements.txt:
--------------------------------------------------------------------------------
 1 | # formatting
 2 | yapf==0.32.0
 3 | toml==0.10.2
 4 | ruff==0.1.5
 5 | 
 6 | # type checking
 7 | mypy==0.991
 8 | types-PyYAML
 9 | types-requests
10 | types-setuptools
11 | 
12 | # testing
13 | pytest
14 | pytest-forked
15 | pytest-asyncio
16 | httpx
17 | einops # required for MPT
18 | openai
19 | requests
20 | 
21 | # run
22 | ninja  # For faster builds.
23 | psutil
24 | ray >= 2.9
25 | sentencepiece  # Required for LLaMA tokenizer.
26 | numpy < 2.0
27 | torch == 2.6.0
28 | transformers >= 4.42.0 # Required for Qwen2
29 | xformers == 0.0.29.post2  # xformers wheels pin an exact torch release; this one matches torch 2.6.0
30 | fastapi
31 | uvicorn[standard]
32 | pydantic >= 2.0  # Required for OpenAI server.
33 | aioprometheus[starlette]
34 | pynvml == 11.5.0
35 | accelerate
36 | aiohttp
37 | google-auth
38 | google-cloud-storage >= 2.18.2
39 | prometheus_client >= 0.21.0


--------------------------------------------------------------------------------
/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring
 3 | metadata:
 4 |   name: "lpg-driver-podmonitoring"
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   selector:
 8 |     matchLabels:
 9 |       name: latency-profile-generator
10 |   endpoints:
11 |   - port: 9090
12 |     interval: 15s
13 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/README.md:
--------------------------------------------------------------------------------
 1 | >[!WARNING]
 2 | >This guide and associated code are **deprecated** and no longer maintained.
 3 | >
 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
 5 | 
 6 | This directory contains the inference server specific setup and the
 7 | Terraform templates associated with them.
 8 | 
 9 | The currently supported options are:
10 | - Text Generation Inference (aka TGI)
11 | - TensorRT-LLM on Triton Inference Server 
12 | 
13 | You may also choose to manually deploy your own inference server.
14 | 
15 | To deploy an inference server, `cd` into its directory and follow the
16 | instructions in that directory's README.md.


--------------------------------------------------------------------------------
/benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: data-loader-7b
 5 | spec:
 6 |   ttlSecondsAfterFinished: 30
 7 |   template:
 8 |     spec:
 9 |       serviceAccountName: benchmark-sa
10 |       restartPolicy: Never
11 |       containers:
12 |       - name: inference-checkpoint
13 |         image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.0
14 |         args:
15 |         - -b=GEMMA_BUCKET_NAME
16 |         - -m=google/gemma/maxtext/7b-it/2
17 |         volumeMounts:
18 |         - mountPath: "/kaggle/"
19 |           name: kaggle-credentials
20 |           readOnly: true
21 |         resources:
22 |           requests:
23 |             google.com/tpu: 4
24 |           limits:
25 |             google.com/tpu: 4
26 |       nodeSelector:
27 |         cloud.google.com/gke-tpu-topology: 2x2
28 |         cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
29 |       volumes:
30 |       - name: kaggle-credentials
31 |         secret:
32 |           defaultMode: 0400
33 |           secretName: kaggle-secret


--------------------------------------------------------------------------------
/benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl:
--------------------------------------------------------------------------------
 1 | %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
 2 | 
 3 | apiVersion: secrets-store.csi.x-k8s.io/v1
 4 | kind: SecretProviderClass
 5 | metadata:
 6 |   name: gcp-secret-provider
 7 |   namespace: ${namespace}
 8 | spec:
 9 |   provider: gcp
10 |   parameters:
11 |     secrets: |
12 |       - resourceName: "${hugging_face_token_secret}"
13 |         fileName: "secret.txt"
14 |   secretObjects:                               # Kubernetes Secret objects to sync from the mounted content
15 |     - data:
16 |       - key: HF_TOKEN                          # data field to populate in the Kubernetes Secret
17 |         objectName: secret.txt                 # name of the mounted content to sync. this could be the object name or the object alias
18 |       secretName: hf-token                     # name of the Kubernetes Secret object
19 |       type: Opaque                             # type of the Kubernetes Secret object
20 |     
21 | %{ endfor ~}
22 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/text-generation-inference/autoscaling.md:
--------------------------------------------------------------------------------
 1 | # Autoscaling TGI
 2 | 
 3 | ## tl;dr
 4 | 
 5 | Recommendation: TODO
 6 | 
 7 | ## Autoscaling Options
 8 | 
 9 | ### CPU
10 | 
11 | CPU scaling is a poor choice for this workload - the TGI workload starts up,
12 | pulls the model weights, and then spends a minute or two worth of CPU time
13 | crunching some numbers. This causes the HPA to add a replica, which then spends
14 | more CPU time, which causes the HPA to add another replica, etc. Eventually,
15 | things settle, and the HPA scales down the replicas. This whole process could
16 | take up to an hour.
17 | 
18 | ### Custom Metrics
19 | 
20 | Workload/custom metrics can be viewed in
21 | https://console.cloud.google.com/monitoring/metrics-explorer. (Just search for
22 | the metric name, e.g. "tgi_batch_current_size". The full name should be
23 | "prometheus/tgi_batch_current_size/gauge")
24 | 
25 | #### `tgi_batch_current_size`
26 | 
27 | TODO
28 | 
29 | ### External Metrics
30 | 
31 | TODO
32 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: autoscaling/v1
 2 | kind: HorizontalPodAutoscaler
 3 | metadata:
 4 |   name: tgi
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   scaleTargetRef:
 8 |     apiVersion: apps/v1
 9 |     kind: Deployment
10 |     name: tgi
11 |   minReplicas: ${hpa_min_replicas}
12 |   maxReplicas: ${hpa_max_replicas}
13 |   targetCPUUtilizationPercentage: ${hpa_averagevalue_target}
14 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: autoscaling/v2
 2 | kind: HorizontalPodAutoscaler
 3 | metadata:
 4 |   name: tgi
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   scaleTargetRef:
 8 |     apiVersion: apps/v1
 9 |     kind: Deployment
10 |     name: tgi
11 |   minReplicas: ${hpa_min_replicas}
12 |   maxReplicas: ${hpa_max_replicas}
13 |   metrics:
14 | %{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 }
15 |   - type: External
16 |     external:
17 |       metric:
18 |         name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
19 |       target:
20 |         type: AverageValue
21 |         averageValue: ${hpa_averagevalue_target}
22 | %{ else }
23 |   - type: Pods
24 |     pods:
25 |       metric:
26 |         name: prometheus.googleapis.com|${custom_metric_name}|gauge
27 |       target:
28 |         type: AverageValue
29 |         averageValue: ${hpa_averagevalue_target}
30 | %{ endif }
31 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference-svc.tftpl:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: Service
17 | metadata:
18 |   name: tgi
19 |   namespace: ${namespace}
20 |   labels:
21 |     app: tgi
22 | spec:
23 |   type: LoadBalancer
24 |   ports:
25 |     - port: 80
26 |       targetPort: 80
27 |       protocol: TCP
28 |   selector:
29 |     app: tgi
30 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/text-generation-inference/monitoring-templates/tgi-podmonitoring.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring
 3 | metadata:
 4 |   name: "tgi-podmonitoring"
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   selector:
 8 |     matchLabels:
 9 |       app: tgi
10 |   endpoints:
11 |   - port: 80
12 |     interval: 15s
13 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/triton/sample-terraform.tfvars:
--------------------------------------------------------------------------------
1 | credentials_config = {
2 |   fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
3 | }
4 | 
5 | namespace      = "benchmark"
6 | ksa            = "benchmark-ksa"
7 | model_id       = "meta-llama/Llama-2-7b-chat-hf"
8 | gpu_count      = 1
9 | gcs_model_path = ""


--------------------------------------------------------------------------------
/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: autoscaling/v2
 2 | kind: HorizontalPodAutoscaler
 3 | metadata:
 4 |   name: vllm
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   scaleTargetRef:
 8 |     apiVersion: apps/v1
 9 |     kind: Deployment
10 |     name: vllm
11 |   minReplicas: ${hpa_min_replicas}
12 |   maxReplicas: ${hpa_max_replicas}
13 |   metrics:
14 |   - type: Pods
15 |     pods:
16 |       metric:
17 |         name: prometheus.googleapis.com|${custom_metric_name}|gauge
18 |       target:
19 |         type: AverageValue
20 |         averageValue: ${hpa_averagevalue_target}
21 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: Service
17 | metadata:
18 |   name: vllm
19 |   namespace: ${namespace}
20 |   labels:
21 |     app: vllm
22 | spec:
23 |   type: LoadBalancer
24 |   ports:
25 |     - port: 80
26 |       targetPort: 80
27 |       protocol: TCP
28 |   selector:
29 |     app: vllm
30 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring
 3 | metadata:
 4 |   name: "vllm-podmonitoring"
 5 |   namespace: ${namespace}
 6 | spec:
 7 |   selector:
 8 |     matchLabels:
 9 |       app: vllm
10 |   endpoints:
11 |   - path: /metrics
12 |     port: 8000
13 |     interval: 15s
14 | 


--------------------------------------------------------------------------------
/benchmarks/inference-server/vllm/sample-terraform.tfvars:
--------------------------------------------------------------------------------
 1 | credentials_config = {
 2 |   fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
 3 | }
 4 | 
 5 | namespace  = "benchmark"
 6 | ksa        = "benchmark-ksa"
 7 | model_id   = "tiiuae/falcon-7b"
 8 | gpu_count  = 1
 9 | project_id = "<project_id>"
10 | 
11 | # How to (horizontally) scale the workload. Allowed values are:
12 | # - Workload metrics (i.e. custom metrics):
13 | #   - "vllm:gpu_cache_usage_perc"
14 | #   - "vllm:num_requests_waiting"
15 | # - Other possibilities coming soon...
16 | #
17 | # See `autoscaling.md` for more details and recommendations.
18 | # hpa_type = "vllm:gpu_cache_usage_perc"
19 | 
20 | # Sets the averagevalue target of the hpa metric.
21 | # hpa_averagevalue_target = 0.95
22 | 
23 | # Adjust these if you want different min/max values
24 | # hpa_min_replicas = 1
25 | # hpa_max_replicas = 5


--------------------------------------------------------------------------------
/benchmarks/infra/65k-cpu-cluster/provider.tf:
--------------------------------------------------------------------------------
1 | provider "google" {
2 |   project = var.project_name
3 | }
4 | 


--------------------------------------------------------------------------------
/benchmarks/infra/65k-cpu-cluster/sample-tfvars/65k-sample.tfvars:
--------------------------------------------------------------------------------
 1 | project_name             = "$PROJECT_ID"
 2 | cluster_name             = "gke-benchmark"
 3 | region                   = "us-central1"
 4 | min_master_version       = "1.31.2"
 5 | vpc_network              = "$NETWORK"
 6 | node_locations           = ["us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"]
 7 | datapath_provider        = "ADVANCED_DATAPATH"
 8 | master_ipv4_cidr_block   = "172.16.0.0/28"
 9 | ip_cidr_range            = "10.0.0.0/9"
10 | cluster_ipv4_cidr_block  = "/10"
11 | services_ipv4_cidr_block = "/18"
12 | node_pool_count          = 16
13 | node_pool_size           = 1000
14 | initial_node_count       = 250
15 | node_pool_create_timeout = "60m"


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-1/modules/gke-infra/filestore.tf:
--------------------------------------------------------------------------------
 1 | resource "google_filestore_instance" "instance" {
 2 |   for_each = var.filestore_storage
 3 |   name     = each.value.name
 4 | 
 5 |   project = module.project.project_id
 6 | 
 7 |   location = var.gke_location
 8 |   tier     = each.value.tier
 9 | 
10 |   file_shares {
11 |     capacity_gb = each.value.capacity_gb
12 |     name        = "filestore_share"
13 |   }
14 | 
15 |   networks {
16 |     network      = local.cluster_vpc.network
17 |     modes        = ["MODE_IPV4"]
18 |     connect_mode = "DIRECT_PEERING"
19 |   }
20 | }


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-1/sample-tfvars/gpu-sample.tfvars:
--------------------------------------------------------------------------------
 1 | project_id   = "$PROJECT_ID"
 2 | cluster_name = "ai-benchmark"
 3 | region       = "us-central1"
 4 | gke_location = "us-central1-a"
 5 | prefix       = "ai-benchmark"
 6 | 
 7 | vpc_create = {
 8 |   name             = "ai-benchmark"
 9 |   enable_cloud_nat = true
10 | }
11 | 
12 | cluster_options = {
13 |   enable_gcs_fuse_csi_driver            = false
14 |   enable_gcp_filestore_csi_driver       = false
15 |   enable_gce_persistent_disk_csi_driver = false
16 | }
17 | 
18 | nodepools = {
19 |   nodepool-cpu = {
20 |     machine_type = "n2-standard-2",
21 |   },
22 |   nodepool-gpu = {
23 |     ephemeral_ssd_block_config = {
24 |       ephemeral_ssd_count = 1
25 |     }
26 |     machine_type = "g2-standard-16",
27 |     guest_accelerator = {
28 |       type  = "nvidia-l4",
29 |       count = 1,
30 |       gpu_driver = {
31 |         version = "LATEST"
32 |       }
33 |     }
34 |   }
35 | }
36 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-1/sample-tfvars/jetstream-sample.tfvars:
--------------------------------------------------------------------------------
 1 | project_id   = "PROJECT_ID"
 2 | cluster_name = "ai-benchmark"
 3 | region       = "us-east1"
 4 | gke_location = "us-east1-c"
 5 | prefix       = "ai-benchmark"
 6 | spot_vms     = true
 7 | 
 8 | vpc_create = {
 9 |   name             = "ai-benchmark"
10 |   enable_cloud_nat = true
11 | }
12 | 
13 | cluster_options = {
14 |   enable_gcs_fuse_csi_driver            = false
15 |   enable_gcp_filestore_csi_driver       = false
16 |   enable_gce_persistent_disk_csi_driver = false
17 | }
18 | 
19 | nodepools = {
20 |   nodepool-tpu = {
21 |     machine_type = "ct5lp-hightpu-4t",
22 |     spot         = true,
23 |   },
24 |   nodepool-cpu = {
25 |     machine_type = "n2-standard-2",
26 |   },
27 | }
28 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/gcs-fuse/outputs.tf:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2024 Google LLC
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | output "created_resources" {
18 |   description = "IDs of the resources created, if any."
19 |   value = merge(
20 |     var.bucket_create ? { # report the bucket only when this module was asked to create it
21 |       bucket_name     = module.gcs-fuse-bucket.name
22 |       bucket_location = module.gcs-fuse-bucket.location
23 |     } : {}
24 |   )
25 | }
26 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/output-benchmark/outputs.tf:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2024 Google LLC
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | output "created_resources" {
18 |   description = "IDs of the resources created, if any."
19 |   value = merge(
20 |     {
21 |       bucket_name                    = module.gcs-result-bucket.name
22 |       benchmark_tool_runner_endpoint = resource.google_compute_address.benchmark-tool-runner-endpoint.address
23 |     }
24 |   )
25 | }


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/csi-driver/csidriver.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: storage.k8s.io/v1
2 | kind: CSIDriver
3 | metadata:
4 |   name: secrets-store.csi.k8s.io
5 | spec:
6 |   podInfoOnMount: true
7 |   attachRequired: false
8 |   volumeLifecycleModes:
9 |   - Ephemeral


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/csi-driver/rbac-secretprovidersyncing.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   name: secretprovidersyncing-role
 5 | rules:
 6 | - apiGroups:
 7 |   - ""
 8 |   resources:
 9 |   - secrets
10 |   verbs:
11 |   - create
12 |   - delete
13 |   - get
14 |   - list
15 |   - patch
16 |   - update
17 |   - watch
18 | ---
19 | apiVersion: rbac.authorization.k8s.io/v1
20 | kind: ClusterRoleBinding
21 | metadata:
22 |   name: secretprovidersyncing-rolebinding
23 | roleRef:
24 |   apiGroup: rbac.authorization.k8s.io
25 |   kind: ClusterRole
26 |   name: secretprovidersyncing-role
27 | subjects:
28 | - kind: ServiceAccount
29 |   name: secrets-store-csi-driver
30 |   namespace: kube-system


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/outputs.tf:
--------------------------------------------------------------------------------
1 | output "created_resources" {
2 |   description = "IDs of the resources created, if any."
3 |   value = {
4 |     secret = module.secret-manager.ids
5 |   }
6 | }


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/outputs.tf:
--------------------------------------------------------------------------------
 1 | output "created_resources" {
 2 |   description = "IDs of the resources created, if any."
 3 |   value = merge(
 4 |     var.secret_create == true ? module.secret-manager[0].created_resources : {},
 5 |     #var.gcs_fuse_create == true ? module.gcs-fuse[0].created_resources : {},
 6 |     var.workload_identity_create == true ? module.workload-identity[0].created_resources : {},
 7 |     #var.nvidia_dcgm_create == true ? module.nvidia-dcgm.created_resources : {}
 8 |     module.output-benchmark.created_resources,
 9 |   )
10 | }
11 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/outputs.tf:
--------------------------------------------------------------------------------
1 | output "created_resources" {
2 |   description = "Created resources"
3 |   value       = module.gke-setup
4 | }
5 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/sample-tfvars/gpu-sample.tfvars:
--------------------------------------------------------------------------------
 1 | # can be obtained from stage-1 by running:
 2 | # terraform output -json  | jq '."fleet_host".value'
 3 | credentials_config = {
 4 |   fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
 5 | }
 6 | 
 7 | # can be obtained from stage-1 by running:
 8 | # terraform output -json  | jq '."project_id".value'
 9 | project_id = "$PROJECT_ID"
10 | 
11 | bucket_name     = "${PROJECT_ID}-ai-gke-benchmark-fuse"
12 | bucket_location = "US"
13 | 
14 | output_bucket_name     = "${PROJECT_ID}-benchmark-output"
15 | output_bucket_location = "US"
16 | 
17 | google_service_account     = "benchmark-sa"
18 | kubernetes_service_account = "benchmark-ksa"
19 | 
20 | benchmark_runner_google_service_account     = "sample-runner-sa"
21 | benchmark_runner_kubernetes_service_account = "sample-runner-ksa"
22 | 


--------------------------------------------------------------------------------
/benchmarks/infra/accelerator-cluster/stage-2/sample-tfvars/jetstream-sample.tfvars:
--------------------------------------------------------------------------------
 1 | # can be obtained from stage-1 by running:
 2 | # terraform output -json  | jq '."fleet_host".value'
 3 | credentials_config = {
 4 |   fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
 5 | }
 6 | 
 7 | # can be obtained from stage-1 by running:
 8 | # terraform output -json  | jq '."project_id".value'
 9 | project_id = "PROJECT_ID"
10 | 
11 | bucket_name     = "${PROJECT_ID}-model-repo-bucket-01"
12 | bucket_location = "US"
13 | 
14 | output_bucket_name     = "${PROJECT_ID}-benchmark-output-bucket-01"
15 | output_bucket_location = "US"
16 | 
17 | google_service_account     = "benchmark-sa-01"
18 | kubernetes_service_account = "benchmark-sa"
19 | 
20 | benchmark_runner_google_service_account     = "sample-runner-sa-01"
21 | benchmark_runner_kubernetes_service_account = "sample-runner-sa"
22 | 
23 | nvidia_dcgm_create = false
24 | namespace          = "default"
25 | namespace_create   = false
26 | gcs_fuse_create    = true
27 | 
28 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/README.md:
--------------------------------------------------------------------------------
 1 | # AI on GKE Benchmark Framework Orchestration
 2 | 
 3 | >[!WARNING]
 4 | >This guide and associated code are **deprecated** and no longer maintained.
 5 | >
 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
 7 | 
 8 | ## Pre-requisites
 9 | * terraform
10 | * jq
11 | * sed
12 | 
13 | ### Configuration
14 | Configuration is split across two folders: `config`, which holds the files you need to modify, and `templates`, which holds files that are filled in automatically from the outputs of previous stages.
15 | 
16 | ### Running scripts
17 | After you have filled in the configuration in the `config` folder, run ``text-generation-inference-apply.sh``, which runs the stage-1, stage-2 and text-generation-inference stages.
18 | 
19 | To destroy the resources that have been created, run ``text-generation-inference-destroy.sh``, which destroys text-generation-inference, stage-2 and stage-1, in that order.
20 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/config/stage-1.tfvars:
--------------------------------------------------------------------------------
 1 | project_id              = "example-project-id"
 2 | cluster_name            = "test-00"
 3 | region                  = "us-central1"
 4 | gke_location            = "us-central1-a"
 5 | enable_private_endpoint = false
 6 | 
 7 | vpc_create = {
 8 |   enable_cloud_nat = true
 9 | }
10 | 
11 | cluster_options = {
12 |   enable_gcs_fuse_csi_driver            = true
13 |   enable_gcp_filestore_csi_driver       = true
14 |   enable_gce_persistent_disk_csi_driver = true
15 | }
16 | 
17 | nodepools = {
18 |   nodepool-cpu = {
19 |     machine_type = "n2-standard-2",
20 |   },
21 |   nodepool-gpu = {
22 |     machine_type = "g2-standard-4",
23 |     guest_accelerator = {
24 |       type  = "nvidia-l4",
25 |       count = 1,
26 |       gpu_driver = {
27 |         version = "LATEST"
28 |       }
29 |     }
30 |   }
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/config/stage-2.tfvars:
--------------------------------------------------------------------------------
1 | bucket_name     = "ai-gke-benchmark-fuse-demo"
2 | bucket_location = "US"
3 | 
4 | secret_name     = "hugging_face_secret"
5 | secret_location = "us-central1"
6 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/config/text-generation-inference.tfvars:
--------------------------------------------------------------------------------
1 | model_id                    = "bigscience/bloom-560m"
2 | hugging_face_secret_version = "1"
3 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/templates/stage-2.auto.tfvars.tpl:
--------------------------------------------------------------------------------
1 | # can be obtained from stage-1 by running:
2 | # terraform output -json  | jq '."fleet_host".value'
3 | credentials_config = {
4 |   fleet_host = FLEET_HOST
5 | }
6 | 
7 | #terraform output -json  | jq '."project_id".value'
8 | project_id = PROJECT_ID
9 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/templates/text-generation-inference.auto.tfvars.tpl:
--------------------------------------------------------------------------------
 1 | credentials_config = {
 2 |   fleet_host = FLEET_HOST
 3 | }
 4 | 
 5 | #terraform output -json  | jq '."project_id".value'
 6 | project_id = PROJECT_ID
 7 | 
 8 | hugging_face_secret = HUGGING_FACE_SECRET
 9 | 
10 | namespace = NAMESPACE_NAME
11 | ksa       = KSA_NAME
12 | 


--------------------------------------------------------------------------------
/benchmarks/orchestration/text-generation-inference-destroy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Destroys the text-generation-inference deployment, then stage-2, then stage-1.
 4 | 
 5 | set -o errexit
 6 | 
 7 | # Every path below is relative to this script's directory, so start from there
 8 | # regardless of where the script was invoked.
 9 | cd "$(dirname "${BASH_SOURCE[0]}")" || exit
10 | 
11 | cd ../inference-server/text-generation-inference/ || exit
12 | terraform destroy -auto-approve
13 | 
14 | # The infra stages live under infra/accelerator-cluster/.
15 | cd ../../infra/accelerator-cluster/stage-2/ || exit
16 | terraform destroy -auto-approve
17 | 
18 | cd ../stage-1/ || exit
19 | terraform destroy -auto-approve
20 | 


--------------------------------------------------------------------------------
/best-practices/gke-batch-refarch/README.md:
--------------------------------------------------------------------------------
1 | # Reference Architecture: Batch Processing Platform on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the Batch Processing Platform on GKE example have been moved to the [AI-on-GKE/batch-reference-architecture](https://github.com/ai-on-gke/batch-reference-architecture) repository. Please refer to that repository for the latest updates and instructions.
5 | 


--------------------------------------------------------------------------------
/best-practices/ml-platform/README.md:
--------------------------------------------------------------------------------
1 | # Moved to the [GoogleCloudPlatform/accelerated-platforms](https://github.com/GoogleCloudPlatform/accelerated-platforms/blob/main/docs/platforms/gke-aiml/README.md) repository which is included as a submodule in the [/best-practices](/best-practices) folder
2 | 
3 | ```
4 | git clone --recurse-submodules https://github.com/GoogleCloudPlatform/ai-on-gke.git
5 | cd ai-on-gke/best-practices/accelerated-platforms
6 | ```
7 | 


--------------------------------------------------------------------------------
/charts/gmp-engine/charts/gmp-frontend/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: {{ .Values.name }}
 5 |   labels:
 6 |     app: {{ .Values.name }}
 7 | spec:
 8 |   clusterIP: None
 9 |   ports:
10 |   - name: web
11 |     port: 9090
12 |   selector:
13 |     app: {{ .Values.name }}
14 | 


--------------------------------------------------------------------------------
/charts/gmp-engine/charts/gmp-frontend/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for gmp-frontend.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | name: "gmp-frontend"
 6 | projectID: ""
 7 | serviceAccount: ""
 8 | 
 9 | image:
10 |   repository: gke.gcr.io/prometheus-engine/frontend
11 |   pullPolicy: IfNotPresent
12 |   tag: "v0.5.0-gke.0"
13 | 
14 | replicaCount: 2
15 | 
16 | cpu: "1m"
17 | memory: "5Mi"
18 | 


--------------------------------------------------------------------------------
/charts/gmp-engine/templates/podmonitoring.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | {{- range $monitor := .Values.podMonitoring }}
17 | apiVersion: monitoring.googleapis.com/v1
18 | kind: PodMonitoring
19 | metadata:
20 |   name: {{ $monitor.name }}
21 | spec:
22 |   selector:
23 |     matchLabels:
24 |       {{- toYaml $monitor.selector | nindent 8 }}
25 |   endpoints:
26 |   - port: {{ $monitor.port }}
27 |     interval: {{ $monitor.interval }}
28 | ---
29 | {{- end }}
30 | 
31 | 


--------------------------------------------------------------------------------
/charts/gmp-engine/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for gmp-engine.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | podMonitoring: []
 6 | 
 7 | gmp-frontend:
 8 |   enabled: false
 9 |   projectID: ""
10 |   serviceAccount: ""
11 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright 2025 NVIDIA CORPORATION
2 | 
3 | This product includes software developed at
4 | NVIDIA CORPORATION (https://nvidia.com).
5 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/clusterrolebinding.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Binds the driver's ServiceAccount to the "<chart name>-role" ClusterRole.
 3 | # ClusterRoleBindings are cluster-scoped, so metadata.namespace is intentionally
 4 | # omitted; only the ServiceAccount subject carries a namespace.
 5 | apiVersion: rbac.authorization.k8s.io/v1
 6 | kind: ClusterRoleBinding
 7 | metadata:
 8 |   name: {{ include "nvidia-dra-driver-gpu.name" . }}-role-binding
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
12 |   namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }}
13 | roleRef:
14 |   kind: ClusterRole
15 |   name: {{ include "nvidia-dra-driver-gpu.name" . }}-role
16 |   apiGroup: rbac.authorization.k8s.io
17 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.resources.computeDomains.enabled }}
 2 | ---
 3 | apiVersion: resource.k8s.io/v1beta1
 4 | kind: DeviceClass
 5 | metadata:
 6 |   name: compute-domain-daemon.nvidia.com
 7 | spec:
 8 |   selectors:
 9 |   - cel:
10 |       expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'"
11 | {{- end }}
12 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.resources.computeDomains.enabled }}
 2 | ---
 3 | apiVersion: resource.k8s.io/v1beta1
 4 | kind: DeviceClass
 5 | metadata:
 6 |   name: compute-domain-default-channel.nvidia.com
 7 | spec:
 8 |   selectors:
 9 |   - cel:
10 |       expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0"
11 | {{- end }}
12 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/deviceclass-gpu.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.resources.gpus.enabled }}
 2 | ---
 3 | apiVersion: resource.k8s.io/v1beta1
 4 | kind: DeviceClass
 5 | metadata:
 6 |   name: gpu.nvidia.com
 7 | spec:
 8 |   selectors:
 9 |   - cel:
10 |       expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'"
11 | {{- end }}
12 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/deviceclass-mig.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.resources.gpus.enabled }}
 2 | ---
 3 | apiVersion: resource.k8s.io/v1beta1
 4 | kind: DeviceClass
 5 | metadata:
 6 |   name: mig.nvidia.com
 7 | spec:
 8 |   selectors:
 9 |   - cel:
10 |       expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'"
11 | {{- end }}
12 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/openshiftprivilegedrolebinging.yaml:
--------------------------------------------------------------------------------
 1 | # Apply only when running on OpenShift to let the kubelet plugin run privileged
 2 | {{- if .Capabilities.APIVersions.Has "security.openshift.io/v1/SecurityContextConstraints" -}}
 3 | ---
 4 | apiVersion: rbac.authorization.k8s.io/v1
 5 | kind: RoleBinding
 6 | metadata:
 7 |   name: {{ include "nvidia-dra-driver-gpu.name" . }}-openshift-privileged-role-binding
 8 |   namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }}
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
12 |   namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }}
13 | roleRef:
14 |   kind: ClusterRole
15 |   name: system:openshift:scc:privileged
16 |   apiGroup: rbac.authorization.k8s.io
17 | {{- end }}
18 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
 6 |   namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }}
 7 |   labels:
 8 |     {{- include "nvidia-dra-driver-gpu.labels" . | nindent 4 }}
 9 |   {{- with .Values.serviceAccount.annotations }}
10 |   annotations:
11 |     {{- toYaml . | nindent 4 }}
12 |   {{- end }}
13 | {{- end }}
14 | 


--------------------------------------------------------------------------------
/charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicyBinding
3 | metadata:
4 |   name: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }}
5 | spec:
6 |   policyName: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }}
7 |   validationActions: [Deny]
8 |   # All ResourceSlices are matched.
9 | 


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/README.md:
--------------------------------------------------------------------------------
 1 | # TPU DRA driver
 2 | 
 3 | This Helm chart runs the TPU DRA driver on GKE. The driver is currently in Private Preview.
 4 | 
 5 | ## Overview
 6 | 
 7 | The TPU DRA driver is only supported on GKE cluster version 1.32 and later.
 8 | Make sure to disable the default tpu-device-plugin on the nodes. This can be done by adding the node labels
 9 | `gke-no-default-tpu-device-plugin=true` and `gke-no-default-tpu-dra-plugin=true` when creating the node pool.
10 | 
11 | Run `./install-tpu-dra-driver.sh` to install tpu-dra-driver on your GKE Cluster
12 | nodes with TPU resources


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/templates/clusterrole.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Permissions granted to the TPU DRA driver: "get" on ResourceClaims and Nodes,
 3 | # and full read/write access to ResourceSlices.
 4 | # ClusterRoles are cluster-scoped, so metadata.namespace is intentionally omitted.
 5 | apiVersion: rbac.authorization.k8s.io/v1
 6 | kind: ClusterRole
 7 | metadata:
 8 |   name: {{ include "tpu-dra-driver.fullname" . }}-role
 9 | rules:
10 | - apiGroups: ["resource.k8s.io"]
11 |   resources: ["resourceclaims"]
12 |   verbs: ["get"]
13 | - apiGroups: [""]
14 |   resources: ["nodes"]
15 |   verbs: ["get"]
16 | - apiGroups: ["resource.k8s.io"]
17 |   resources: ["resourceslices"]
18 |   verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
19 | 


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/templates/clusterrolebinding.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Binds the driver's ServiceAccount to the "<fullname>-role" ClusterRole.
 3 | # ClusterRoleBindings are cluster-scoped, so metadata.namespace is intentionally
 4 | # omitted; only the ServiceAccount subject carries a namespace.
 5 | apiVersion: rbac.authorization.k8s.io/v1
 6 | kind: ClusterRoleBinding
 7 | metadata:
 8 |   name: {{ include "tpu-dra-driver.fullname" . }}-role-binding
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: {{ include "tpu-dra-driver.serviceAccountName" . }}
12 |   namespace: {{ include "tpu-dra-driver.namespace" . }}
13 | roleRef:
14 |   kind: ClusterRole
15 |   name: {{ include "tpu-dra-driver.fullname" . }}-role
16 |   apiGroup: rbac.authorization.k8s.io
17 | 


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/templates/deviceclass.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: resource.k8s.io/v1beta1
2 | kind: DeviceClass
3 | metadata:
4 |   name: tpu.google.com
5 | spec:
6 |   selectors:
7 |   - cel:
8 |       expression: device.driver == "tpu.google.com"


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: {{ include "tpu-dra-driver.serviceAccountName" . }}
 6 |   namespace: {{ include "tpu-dra-driver.namespace" . }}
 7 |   labels:
 8 |     {{- include "tpu-dra-driver.labels" . | nindent 4 }}
 9 |   {{- with .Values.serviceAccount.annotations }}
10 |   annotations:
11 |     {{- toYaml . | nindent 4 }}
12 |   {{- end }}
13 | {{- end }}
14 | 


--------------------------------------------------------------------------------
/charts/tpu-dra-driver/templates/validatingadmissionpolicybinding.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: admissionregistration.k8s.io/v1
2 | kind: ValidatingAdmissionPolicyBinding
3 | metadata:
4 |   name: resourceslices-policy-{{ include "tpu-dra-driver.fullname" . }}
5 | spec:
6 |   policyName: resourceslices-policy-{{ include "tpu-dra-driver.fullname" . }}
7 |   validationActions: [Deny]
8 |   # All ResourceSlices are matched.
9 | 


--------------------------------------------------------------------------------
/gke-batch-refarch/README.md:
--------------------------------------------------------------------------------
1 | # Moved to [best-practices/gke-batch-refarch](/best-practices/gke-batch-refarch)
2 | 


--------------------------------------------------------------------------------
/infrastructure/backend.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # terraform {
16 | #   backend "gcs" {
17 | #     bucket = "BUCKET_NAME"
18 | #     prefix = "terraform/state"
19 | #   }
20 | # }


--------------------------------------------------------------------------------
/infrastructure/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta"
22 |       # Creating Autopilot using GKE submodule is broken in v6.2.0.
23 |       version = ">= 5.40.0, <= 6.1.0"
24 |     }
25 |     helm = {
26 |       source  = "hashicorp/helm"
27 |       version = "~> 2.8.0"
28 |     }
29 |     kubernetes = {
30 |       source  = "hashicorp/kubernetes"
31 |       version = "2.18.1"
32 |     }
33 |   }
34 | }
35 | 


--------------------------------------------------------------------------------
/jupyter-on-gke:
--------------------------------------------------------------------------------
1 | applications/jupyter


--------------------------------------------------------------------------------
/modules/cloudsql/README.md:
--------------------------------------------------------------------------------
 1 | # CloudSQL
 2 | This module contains a Terraform template for creating a CloudSQL instance.
 3 | 
 4 | ## Usage
 5 | 
 6 | 1. Edit `variables.tf` with your GCP settings.
 7 | 2. Run `terraform init` and `terraform apply`
 8 | 3. Create an IAM service account & grant a cloudsql client role to it:
 9 | ```
10 | gcloud projects add-iam-policy-binding {PROJECT_ID} \
11 |   --member=serviceAccount:{SA_ACCOUNT}.iam.gserviceaccount.com \
12 |   --role="roles/cloudsql.client"
13 | ```
14 | 
15 | Note: Ensure that the regional subnet that is used (referenced by `network_name`) has [Private Service Connect](https://cloud.google.com/vpc/docs/private-service-connect) enabled.
16 | 
17 | See [sample RAG application](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md) for example usage of the created instance.


--------------------------------------------------------------------------------
/modules/cloudsql/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     kubernetes = {
21 |       source = "hashicorp/kubernetes"
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: apiregistration.k8s.io/v1
 2 | kind: APIService
 3 | metadata:
 4 |   name: v1beta1.custom.metrics.k8s.io
 5 | spec:
 6 |   insecureSkipTLSVerify: true
 7 |   group: custom.metrics.k8s.io
 8 |   groupPriorityMinimum: 100
 9 |   versionPriority: 100
10 |   service:
11 |     name: custom-metrics-stackdriver-adapter
12 |     namespace: custom-metrics
13 |   version: v1beta1
14 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: apiregistration.k8s.io/v1
 2 | kind: APIService
 3 | metadata:
 4 |   name: v1beta1.external.metrics.k8s.io
 5 | spec:
 6 |   insecureSkipTLSVerify: true
 7 |   group: external.metrics.k8s.io
 8 |   groupPriorityMinimum: 100
 9 |   versionPriority: 100
10 |   service:
11 |     name: custom-metrics-stackdriver-adapter
12 |     namespace: custom-metrics
13 |   version: v1beta1
14 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: apiregistration.k8s.io/v1
 2 | kind: APIService
 3 | metadata:
 4 |   name: v1beta2.custom.metrics.k8s.io
 5 | spec:
 6 |   insecureSkipTLSVerify: true
 7 |   group: custom.metrics.k8s.io
 8 |   groupPriorityMinimum: 100
 9 |   versionPriority: 200
10 |   service:
11 |     name: custom-metrics-stackdriver-adapter
12 |     namespace: custom-metrics
13 |   version: v1beta2
14 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   name: custom-metrics-resource-reader
 5 | rules:
 6 | - apiGroups:
 7 |   - ""
 8 |   resources:
 9 |   - "pods"
10 |   - "nodes"
11 |   - "nodes/stats"
12 |   verbs:
13 |   - list
14 |   - get
15 |   - watch
16 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: custom-metrics-resource-reader
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: view  # NOTE(review): references ClusterRole "view", not one named "custom-metrics-resource-reader" like this binding - confirm this is intended
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: ${cmsa-serviceaccount-name}  # filled in when the template is rendered
12 |   namespace: custom-metrics
13 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: custom-metrics:system:auth-delegator
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: system:auth-delegator
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: ${cmsa-serviceaccount-name}
12 |   namespace: custom-metrics
13 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   name: external-metrics-reader
 5 | roleRef:
 6 |   apiGroup: rbac.authorization.k8s.io
 7 |   kind: ClusterRole
 8 |   name: external-metrics-reader
 9 | subjects:
10 | - kind: ServiceAccount
11 |   name: horizontal-pod-autoscaler
12 |   namespace: kube-system
13 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   name: custom-metrics-auth-reader
 5 |   namespace: kube-system
 6 | roleRef:
 7 |   apiGroup: rbac.authorization.k8s.io
 8 |   kind: Role
 9 |   name: extension-apiserver-authentication-reader
10 | subjects:
11 | - kind: ServiceAccount
12 |   name: ${cmsa-serviceaccount-name}
13 |   namespace: custom-metrics
14 | 


--------------------------------------------------------------------------------
/modules/custom-metrics-stackdriver-adapter/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     run: custom-metrics-stackdriver-adapter
 6 |     k8s-app: custom-metrics-stackdriver-adapter
 7 |     kubernetes.io/cluster-service: 'true'
 8 |     kubernetes.io/name: Adapter
 9 |   name: custom-metrics-stackdriver-adapter
10 |   namespace: custom-metrics
11 | spec:
12 |   ports:
13 |   - port: 443
14 |     protocol: TCP
15 |     targetPort: 443
16 |   selector:
17 |     run: custom-metrics-stackdriver-adapter
18 |     k8s-app: custom-metrics-stackdriver-adapter
19 |   type: ClusterIP


--------------------------------------------------------------------------------
/modules/gcp-network/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "network_name" {
16 |   value = google_compute_network.network.name
17 | }
18 | 
19 | output "subnets_names" {
20 |   value = [for sb in google_compute_subnetwork.subnetwork : sb.name]
21 | }
22 | 
23 | output "subnets_ips" {
24 |   value = [for sb in google_compute_subnetwork.subnetwork : sb.ip_cidr_range]
25 | }


--------------------------------------------------------------------------------
/modules/gcp-network/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/modules/gcs/README.md:
--------------------------------------------------------------------------------
1 | # GCS bucket used in the RAG on GKE demo
2 | 
3 | This repository contains a Terraform template for creating the GCS bucket used
4 | in the RAG on GKE demo.
5 | 


--------------------------------------------------------------------------------
/modules/gcs/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # GCS bucket for the RAG on GKE demo. force_destroy is enabled, so destroying
16 | # this resource also deletes any objects still stored in the bucket.
17 | resource "google_storage_bucket" "static" {
18 |   # Create the bucket in the project the caller passed in, rather than
19 |   # silently falling back to the provider's default project.
20 |   project                     = var.project_id
21 |   name                        = var.bucket_name
22 |   location                    = var.region
23 |   storage_class               = "STANDARD"
24 |   uniform_bucket_level_access = true
25 |   force_destroy               = true
26 |   public_access_prevention    = "enforced"
27 | }
28 | 


--------------------------------------------------------------------------------
/modules/gcs/variables.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | variable "project_id" {
16 |   type        = string
17 |   description = "GCP project id"
18 | }
19 | 
20 | variable "region" {
21 |   type        = string
22 |   description = "GCS bucket region"
23 |   default     = "us-central1"
24 | }
25 | 
26 | variable "bucket_name" {
27 |   type        = string
28 |   description = "GCS bucket name"
29 | }


--------------------------------------------------------------------------------
/modules/gcs/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform {
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/modules/gke-autopilot-private-cluster/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "cluster" { # Exposes the entire `gke` module object.
16 |   value = module.gke
17 | }
18 | 
19 | output "endpoint" { # Pass-through of the `gke` module's `endpoint` output.
20 |   value = module.gke.endpoint
21 | }
22 | 
23 | output "ca_certificate" { # Pass-through of the `gke` module's `ca_certificate` output.
24 |   value = module.gke.ca_certificate
25 | }
26 | 
27 | output "service_account" { # Pass-through of the `gke` module's `service_account` output.
28 |   value = module.gke.service_account
29 | }
30 | 


--------------------------------------------------------------------------------
/modules/gke-autopilot-private-cluster/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module.
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google" # registry source address; no version constraint is set here
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta" # registry source address; no version constraint is set here
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/modules/gke-autopilot-public-cluster/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "cluster" { # Exposes the entire `gke` module object.
16 |   value = module.gke
17 | }
18 | 
19 | output "endpoint" { # Pass-through of the `gke` module's `endpoint` output.
20 |   value = module.gke.endpoint
21 | }
22 | 
23 | output "ca_certificate" { # Pass-through of the `gke` module's `ca_certificate` output.
24 |   value = module.gke.ca_certificate
25 | }
26 | 
27 | output "service_account" { # Pass-through of the `gke` module's `service_account` output.
28 |   value = module.gke.service_account
29 | }
30 | 


--------------------------------------------------------------------------------
/modules/gke-standard-private-cluster/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "cluster" { # Exposes the entire `gke` module object.
16 |   value = module.gke
17 | }
18 | 
19 | output "endpoint" { # Pass-through of the `gke` module's `endpoint` output.
20 |   value = module.gke.endpoint
21 | }
22 | 
23 | output "ca_certificate" { # Pass-through of the `gke` module's `ca_certificate` output.
24 |   value = module.gke.ca_certificate
25 | }
26 | 
27 | output "service_account" { # Pass-through of the `gke` module's `service_account` output.
28 |   value = module.gke.service_account
29 | }
30 | 


--------------------------------------------------------------------------------
/modules/gke-standard-private-cluster/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module.
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google" # registry source address; no version constraint is set here
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta" # registry source address; no version constraint is set here
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/modules/gke-standard-public-cluster/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "cluster" { # Exposes the entire `gke` module object.
16 |   value = module.gke
17 | }
18 | 
19 | output "endpoint" { # Pass-through of the `gke` module's `endpoint` output.
20 |   value = module.gke.endpoint
21 | }
22 | 
23 | output "ca_certificate" { # Pass-through of the `gke` module's `ca_certificate` output.
24 |   value = module.gke.ca_certificate
25 | }
26 | 
27 | output "service_account" { # Pass-through of the `gke` module's `service_account` output.
28 |   value = module.gke.service_account
29 | }
30 | 


--------------------------------------------------------------------------------
/modules/iap/charts/iap/templates/backend-config.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: cloud.google.com/v1
16 | kind: BackendConfig # BackendConfig with IAP switched on (spec.iap.enabled is true).
17 | metadata:
18 |   name: {{ .Values.iap.backendConfig.name }}
19 | spec:
20 |   iap:
21 |     enabled: true
22 |     oauthclientCredentials:
23 |       secretName:  {{ .Values.iap.secret.name }} # same value the chart's Secret template uses as its metadata.name
24 | 


--------------------------------------------------------------------------------
/modules/iap/charts/iap/templates/iap-secret.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | apiVersion: v1
17 | kind: Secret # Carries the client_id / client_secret pair taken from the chart values.
18 | metadata:
19 |   name: {{ .Values.iap.secret.name }}
20 | data: # NOTE(review): Secret `data` values must be base64-encoded; presumably the caller pre-encodes them - confirm.
21 |   client_id:  {{ .Values.iap.secret.client_id }}
22 |   client_secret:  {{ .Values.iap.secret.client_secret }}


--------------------------------------------------------------------------------
/modules/iap/charts/iap/templates/managed-cert.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | ## Render the ManagedCertificate only when iap.managedCertificate.domain is non-empty.
15 | {{- if  .Values.iap.managedCertificate.domain }}
16 | 
17 | apiVersion: networking.gke.io/v1
18 | kind: ManagedCertificate
19 | metadata:
20 |   name:  {{ .Values.iap.managedCertificate.name }}
21 | spec:
22 |   domains: 
23 |   - {{ .Values.iap.managedCertificate.domain }}
24 | 
25 | {{- end -}}
26 | 


--------------------------------------------------------------------------------
/modules/iap/charts/iap/values.yaml:
--------------------------------------------------------------------------------
 1 | # Default values for iap_jupyter.
 2 | # This is a YAML-formatted file.
 3 | # Declare variables to be passed into your templates.
 4 | 
 5 | 
 6 | iap: 
 7 |   backendConfig:
 8 |     name: "iap-config-default" # used as metadata.name by the BackendConfig template
 9 | 
10 |   secret:
11 |     name: "iap-secret" # used as the Secret name and as the BackendConfig's secretName
12 |     client_id: "" # empty by default
13 |     client_secret: "" # empty by default
14 | 
15 |   managedCertificate:
16 |     name: "iap-managed-cert"
17 |     domain: "" # empty by default; the ManagedCertificate template renders nothing while this is empty
18 | 
19 |   ingress:
20 |     name: "iap-ingress"
21 |     staticIpName: "xyz" # NOTE(review): looks like a placeholder default - presumably overridden by the caller; confirm
22 |     backendServiceName: "proxy-public"
23 |     backendServicePort: 80
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/modules/iap/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "domain" { # Value of `local.domain` (defined elsewhere in this module).
16 |   value = local.domain
17 | }
18 | 
19 | output "ip_address" { # Address of the `google_compute_global_address.ip_address` resource.
20 |   value = google_compute_global_address.ip_address.address
21 | }


--------------------------------------------------------------------------------
/modules/iap/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module; none of the providers is version-constrained here.
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta"
22 |     }
23 |     kubernetes = {
24 |       source = "hashicorp/kubernetes"
25 |     }
26 |   }
27 | }


--------------------------------------------------------------------------------
/modules/inference-service/README.md:
--------------------------------------------------------------------------------
1 | # Inference Service
2 | This module is currently designed specifically for the Mistral-7B-Instruct-v0.1 model. Future developments will expand the module to support the creation of customized models more broadly.
3 | 


--------------------------------------------------------------------------------
/modules/inference-service/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module; none of the providers is version-constrained here.
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta"
22 |     }
23 |     kubernetes = {
24 |       source = "hashicorp/kubernetes"
25 |     }
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/modules/jetstream-maxtext-deployment/templates/podmonitoring-tpu.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring # Scrape configuration for pods labelled k8s-app=tpu-device-plugin in kube-system.
 3 | metadata:
 4 |   name: tpu-metrics-exporter
 5 |   namespace: kube-system
 6 |   labels:
 7 |     k8s-app: tpu-device-plugin
 8 | spec:
 9 |   endpoints:
10 |     - port: 2112
11 |       interval: ${metrics_scrape_interval}s # template variable rendered with an "s" suffix
12 |   selector:
13 |     matchLabels:
14 |       k8s-app: tpu-device-plugin


--------------------------------------------------------------------------------
/modules/jetstream-maxtext-deployment/templates/podmonitoring.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: monitoring.googleapis.com/v1
 2 | kind: PodMonitoring # Scrape configuration in the default namespace; interval and port come from template variables.
 3 | metadata:
 4 |   name: jetstream-podmonitoring
 5 |   namespace: default
 6 | spec: # NOTE(review): no `selector` is set here, unlike the TPU PodMonitoring template - confirm this is intended.
 7 |   endpoints:
 8 |   - interval: ${metrics_scrape_interval}s
 9 |     path: "/"
10 |     port: ${metrics_port}
11 |   targetLabels:
12 |     metadata:
13 |     - pod
14 |     - container
15 |     - node


--------------------------------------------------------------------------------
/modules/jetstream-maxtext-deployment/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: autoscaling/v2
 2 | kind: HorizontalPodAutoscaler
 3 | metadata:
 4 |   name: jetstream-hpa
 5 |   namespace: default
 6 | spec:
 7 |   scaleTargetRef: # targets the maxengine-server Deployment
 8 |     apiVersion: apps/v1
 9 |     kind: Deployment
10 |     name: maxengine-server
11 |   minReplicas: ${hpa_min_replicas}
12 |   maxReplicas: ${hpa_max_replicas}
13 |   metrics: # one External metric entry is rendered per element of the `rules` template variable
14 | %{ for rule in rules }
15 |   - type: External
16 |     external:
17 |       metric:
18 |         name: ${rule.target_query}
19 |       target:
20 |         type: AverageValue
21 |         averageValue: ${rule.average_value_target}
22 | %{ endfor ~}


--------------------------------------------------------------------------------
/modules/jetstream-maxtext-deployment/templates/service.yaml.tftpl:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service # Exposes two TCP ports of the pods labelled app=maxengine-server.
 3 | metadata:
 4 |   name: jetstream-svc
 5 |   namespace: default
 6 | spec:
 7 |   selector:
 8 |     app: maxengine-server
 9 |   ports:
10 |   - protocol: TCP
11 |     name: jetstream-http # port 8000 forwarded to the same port on the pod
12 |     port: 8000
13 |     targetPort: 8000
14 |   - protocol: TCP
15 |     name: jetstream-grpc # port 9000 forwarded to the same port on the pod
16 |     port: 9000
17 |     targetPort: 9000


--------------------------------------------------------------------------------
/modules/jupyter/authentication/authenticator/gcpiapjwtauthenticator/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .gcpiapjwtauthenticator import (
15 |     GCPIAPAuthenticator,
16 |     IAPUserLoginHandler
17 | )
18 | 
19 | __all__ =['GCPIAPAuthenticator', 'IAPUserLoginHandler']


--------------------------------------------------------------------------------
/modules/jupyter/authentication/docker_image/Dockerfile:
--------------------------------------------------------------------------------
1 | # Hub image: the stock JupyterHub k8s-hub image plus the authenticator package from this repository.
2 | FROM jupyterhub/k8s-hub:3.3.0
3 | 
4 | # NOTE(review): installs from the `main` branch, so the build is not pinned to a commit - confirm this is intended.
5 | RUN pip3 install --no-cache-dir git+https://github.com/GoogleCloudPlatform/ai-on-gke/@main#subdirectory=modules/jupyter/authentication/authenticator


--------------------------------------------------------------------------------
/modules/jupyter/authentication/docker_image/cloudbuild.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # to build, run `gcloud builds submit --config cloudbuild.yaml .` in directory
16 | steps:
17 | - name: 'gcr.io/cloud-builders/docker'
18 |   args: [ 'pull', 'docker.io/jupyterhub/k8s-hub:3.3.0' ] # pre-pull the base image; keep this tag in sync with the Dockerfile's FROM line
19 | - name: 'gcr.io/cloud-builders/docker'
20 |   args: [ 'build', '-t', '<Artifact Registry Repo Path>/<name of image>', '.' ] # replace the placeholder with the real image path
21 | images: # must list the same image path that the build step tags
22 | - '<Artifact Registry Repo Path>/<name of image>'


--------------------------------------------------------------------------------
/modules/jupyter/images/IAP_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/IAP_screenshot.png


--------------------------------------------------------------------------------
/modules/jupyter/images/brand_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/brand_screenshot.png


--------------------------------------------------------------------------------
/modules/jupyter/images/gcs_bucket.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/gcs_bucket.png


--------------------------------------------------------------------------------
/modules/jupyter/images/iap_enable_api_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/iap_enable_api_screenshot.png


--------------------------------------------------------------------------------
/modules/jupyter/images/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/image.png


--------------------------------------------------------------------------------
/modules/jupyter/images/oauth_consent_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/oauth_consent_screenshot.png


--------------------------------------------------------------------------------
/modules/jupyter/jupyter_image/notebook_image/Dockerfile:
--------------------------------------------------------------------------------
1 | # Notebook image: the upstream TensorFlow notebook image plus the packages pinned in requirements.txt.
2 | FROM jupyter/tensorflow-notebook:python-3.10
3 | # Copy the requirements file from the build context and install it without keeping pip's cache.
4 | COPY requirements.txt ./requirements.txt
5 | RUN pip install --no-cache-dir -r ./requirements.txt
6 | 


--------------------------------------------------------------------------------
/modules/jupyter/jupyter_image/notebook_image/README.md:
--------------------------------------------------------------------------------
 1 | To build a new jupyter notebook image and use it for the RAG QSS:
 2 | 1. Update the cloudbuild.yaml with the new image tag.
 3 | 
 4 |     The image tag should follow the pattern `sample-public-image-v<VERSION_NUMBER>-rag`. The prefix `sample-public-image-` is needed so that the images are internally considered vulnerability-remediated and no more bugs are filed for them.
 5 | 2. Then in this path, run:
 6 | 
 7 |     `gcloud config set project ai-on-gke`
 8 | 
 9 |     `gcloud builds submit --config cloudbuild.yaml .`
10 | 
11 |     This will build and push the new image to the registry `us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke`
12 | 3. Update the `notebook_image_tag` in `/applications/rag/main.tf` to the new image tag.
13 | 


--------------------------------------------------------------------------------
/modules/jupyter/jupyter_image/notebook_image/cloudbuild.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Build with `gcloud builds submit --config cloudbuild.yaml .`, run from this directory.
16 | steps:
17 |   # Pull the image that the Dockerfile uses as its base.
18 |   - name: "gcr.io/cloud-builders/docker"
19 |     args:
20 |       - "pull"
21 |       - "docker.io/jupyter/tensorflow-notebook:python-3.10"
22 |   # Build from the Dockerfile in the current directory and tag the result.
23 |   - name: "gcr.io/cloud-builders/docker"
24 |     args:
25 |       - "build"
26 |       - "-t"
27 |       - "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image:<NEW_IMAGE_TAG>"
28 |       - "."
29 | # Image to push after the steps succeed; must match the tag given to the build step.
30 | images:
31 |   - "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image:<NEW_IMAGE_TAG>"


--------------------------------------------------------------------------------
/modules/jupyter/jupyter_image/notebook_image/requirements.txt:
--------------------------------------------------------------------------------
1 | langchain==0.3.7
2 | ray==2.43.0 # NOTE(review): the kuberay image in this repo is built FROM rayproject/ray:2.9.3 - confirm the version gap is intended
3 | datasets==2.18.0 
4 | sentence-transformers==2.5.1 
5 | kaggle==1.6.6


--------------------------------------------------------------------------------
/modules/jupyter/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "jupyterhub_uri" { # IAP module's domain when add_auth is true, otherwise an empty string.
16 |   value = var.add_auth ? module.iap_auth[0].domain : ""
17 | }
18 | 
19 | output "jupyterhub_user" { # "admin" when add_auth is false, otherwise an empty string.
20 |   value = var.add_auth ? "" : "admin"
21 | }
22 | 
23 | output "jupyterhub_password" { # Generated password when add_auth is false, otherwise an empty string; marked sensitive.
24 |   value     = var.add_auth ? "" : random_password.generated_password[0].result
25 |   sensitive = true
26 | }
27 | output "jupyterhub_ip_address" { # IAP module's IP address when add_auth is true, otherwise an empty string.
28 |   value = var.add_auth ? module.iap_auth[0].ip_address : ""
29 | }


--------------------------------------------------------------------------------
/modules/jupyter/tests/change_jupyter_config.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | import sys
 3 | 
 4 | config_file = "../jupyter_config/config-selfauth.yaml"
 5 | if len(sys.argv) == 2:
 6 |     autopilot = (sys.argv[1] == "true")
 7 |     if autopilot:
 8 |         config_file = "../jupyter_config/config-selfauth-autopilot.yaml"
 9 | 
10 | with open(config_file, "r") as yaml_file:
11 |     data = yaml.safe_load(yaml_file)
12 | 
13 | data["hub"]["config"]["DummyAuthenticator"]["password"] = "dummy"
14 | 
15 | with open(config_file, 'w') as yaml_file:
16 |     yaml.dump(data, yaml_file)
17 | 


--------------------------------------------------------------------------------
/modules/jupyter/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module; only helm carries a version constraint.
16 |   required_providers {
17 |     google = {
18 |       source = "hashicorp/google"
19 |     }
20 |     google-beta = {
21 |       source = "hashicorp/google-beta"
22 |     }
23 |     helm = {
24 |       source  = "hashicorp/helm"
25 |       version = "~> 2.8.0" # pessimistic constraint: accepts 2.8.x patch releases only
26 |     }
27 |     kubernetes = {
28 |       source = "hashicorp/kubernetes"
29 |     }
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/modules/kuberay-cluster/kuberay_image/Dockerfile:
--------------------------------------------------------------------------------
1 | # Ray image: the upstream Ray 2.9.3 (Python 3.10, GPU) image plus the packages listed in requirements.txt.
2 | FROM rayproject/ray:2.9.3-py310-gpu
3 | # Copy the requirements file from the build context and install it without keeping pip's cache.
4 | COPY requirements.txt ./requirements.txt
5 | RUN pip install --no-cache-dir -r ./requirements.txt
6 | 


--------------------------------------------------------------------------------
/modules/kuberay-cluster/kuberay_image/cloudbuild.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # to build, run `gcloud builds submit --config cloudbuild.yaml .` in directory
16 | steps:
17 | - name: 'gcr.io/cloud-builders/docker'
18 |   args: [ 'pull', 'docker.io/rayproject/ray:2.9.3-py310-gpu' ] # pre-pull the base image; same tag as the Dockerfile's FROM line
19 | - name: 'gcr.io/cloud-builders/docker'
20 |   args: [ 'build', '-t', '<Artifact registry repo>/<image name>', '.' ] # replace the placeholder with the real image path
21 | images: # must list the same image path that the build step tags
22 | - '<Artifact registry repo>/<image name>'


--------------------------------------------------------------------------------
/modules/kuberay-cluster/kuberay_image/requirements.txt:
--------------------------------------------------------------------------------
1 | langchain==0.1.9
2 | transformers==4.38.1
3 | sentence-transformers==2.5.1
4 | pyarrow # NOTE(review): the only requirement without a version pin - confirm this is intended
5 | datasets==2.18.0
6 | torch==2.0.1
7 | cloud-sql-python-connector[pg8000]==1.7.0
8 | SQLAlchemy==2.0.7
9 | huggingface_hub==0.21.3


--------------------------------------------------------------------------------
/modules/kuberay-cluster/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # Provider requirements for this module.
16 |   required_providers {
17 |     helm = {
18 |       source  = "hashicorp/helm"
19 |       version = "~> 2.8.0" # pessimistic constraint: accepts 2.8.x patch releases only
20 |     }
21 |     kubernetes = {
22 |       source = "hashicorp/kubernetes" # no version constraint is set here
23 |     }
24 |   }
25 | }
26 | 


--------------------------------------------------------------------------------
/modules/kuberay-monitoring/gmpvalues.yaml:
--------------------------------------------------------------------------------
 1 | podMonitoring: # list of pod-monitoring entries; a single entry named ray-monitoring is defined
 2 | - name: ray-monitoring
 3 |   selector:
 4 |     ray.io/is-ray-node: "yes" # quoted so the value stays a string instead of being read as a YAML 1.1 boolean
 5 |   port: metrics
 6 |   interval: 30s
 7 | 
 8 | gmp-frontend:
 9 |   enabled: true
10 | 


--------------------------------------------------------------------------------
/modules/kuberay-monitoring/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "grafana_uri" {
16 |   description = "Load balancer ingress IP read from data.kubernetes_service.example, or an empty string when var.enable_grafana_on_ray_dashboard is false or no ingress IP is available yet."
17 |   value       = var.enable_grafana_on_ray_dashboard ? try(data.kubernetes_service.example[0].status[0].load_balancer[0].ingress[0].ip, "") : "" # try() yields "" when status, load_balancer or ingress is null or empty
18 | }
19 | 


--------------------------------------------------------------------------------
/modules/kuberay-monitoring/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # provider requirements for the kuberay-monitoring module
16 |   required_providers {
17 |     helm = {
18 |       source  = "hashicorp/helm"
19 |       version = "~> 2.8.0" # pessimistic constraint: only 2.8.x patch releases are accepted
20 |     }
21 |     kubernetes = {
22 |       source  = "hashicorp/kubernetes"
23 |       version = "2.18.1" # exact pin: no other provider version satisfies this
24 |     }
25 |     time = {
26 |       source  = "hashicorp/time"
27 |       version = "0.11.1" # exact pin: no other provider version satisfies this
28 |     }
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/modules/kubernetes-namespace/main.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Helm release that installs the chart bundled under charts/namespace/ into var.namespace.
16 | resource "helm_release" "app-namespace" {
17 |   name             = "app-namespace"
18 |   chart            = "${path.module}/charts/namespace/" # local chart path, resolved relative to this module
19 |   namespace        = var.namespace
20 |   create_namespace = var.create_namespace # caller decides whether Helm may create the namespace
21 | }
22 | 


--------------------------------------------------------------------------------
/modules/kubernetes-namespace/outputs.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | output "namespace" {
16 |   description = "Kubernetes namespace passed to this module, echoed back for use by callers"
17 |   value       = var.namespace
18 | }


--------------------------------------------------------------------------------
/modules/kubernetes-namespace/variables.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | variable "namespace" { # used as the helm_release namespace and re-exported as the module's output
17 |   type        = string
18 |   description = "Kubernetes namespace where resources are deployed"
19 | }
20 | 
21 | variable "create_namespace" {
22 |   type        = bool
23 |   description = "Whether the Helm release should create the namespace when it does not already exist"
24 | }


--------------------------------------------------------------------------------
/modules/kubernetes-namespace/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | terraform { # provider requirements for the kubernetes-namespace module
16 |   required_providers {
17 |     helm = { # no version constraint is declared for this provider
18 |       source = "hashicorp/helm"
19 |     }
20 |   }
21 | }
22 | 


--------------------------------------------------------------------------------
/modules/prometheus-adapter/README.md:
--------------------------------------------------------------------------------
 1 | This module deploys a [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) and a [Prometheus frontend](https://github.com/GoogleCloudPlatform/prometheus-engine/blob/main/examples/frontend.yaml) to a cluster. See [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) repo for more details.
 2 | 
 3 | ## Installation via bash and helm
 4 | 
 5 | Ensure the following environment variables are set:
 6 |    - PROJECT_ID: GKE Project ID
 7 |    - (optional) PROMETHEUS_HELM_VALUES_FILE: Values file to pass when deploying `prometheus-community/prometheus-adapter` chart
 8 | 
 9 | ```
10 | curl https://raw.githubusercontent.com/GoogleCloudPlatform/prometheus-engine/v0.10.0/examples/frontend.yaml | envsubst | kubectl apply -f -
11 | 
12 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
13 | helm repo update
14 | 
15 | if [ -z "$PROMETHEUS_HELM_VALUES_FILE" ]; then
16 |     helm install example-release prometheus-community/prometheus-adapter
17 | else
18 |     helm install example-release prometheus-community/prometheus-adapter -f "$PROMETHEUS_HELM_VALUES_FILE"
19 | fi
20 | ```
21 | 


--------------------------------------------------------------------------------
/ray-on-gke/README.md:
--------------------------------------------------------------------------------
1 | # Running Ray on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the Ray on GKE Guide have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. For more information, please refer to the [Ray on GKE](https://gke-ai-labs.dev/docs/blueprints/ray-on-gke).
5 | 


--------------------------------------------------------------------------------
/ray-on-gke/examples/tfvars:
--------------------------------------------------------------------------------
1 | ../../applications/ray/tfvars_examples


--------------------------------------------------------------------------------
/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-cluster-on-gke.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-cluster-on-gke.png


--------------------------------------------------------------------------------
/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-head-resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-head-resources.png


--------------------------------------------------------------------------------
/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-worker-resources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-worker-resources.png


--------------------------------------------------------------------------------
/ray-on-gke/tpu/kuberay-tpu-webhook/README.md:
--------------------------------------------------------------------------------
1 | # Running KubeRay with TPUs on GKE 
2 | 
3 | >[!WARNING]
4 | >The files for the KubeRay TPU webhook have been moved to the [AI-on-GKE/kuberay-tpu-webhook](https://github.com/ai-on-gke/kuberay-tpu-webhook) repository. For more information on installing the webhook and running TPUs with KubeRay, please refer to [Ray on TPUs with GKE](https://gke-ai-labs.dev/docs/tutorials/ray-gke-tpus/).
5 | 


--------------------------------------------------------------------------------
/scripts/ci/wait_for_pods.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Wait until at least one pod exists in a Kubernetes namespace.
 4 | #
 5 | # Usage: wait_for_pods.sh <namespace> [timeout_seconds]
 6 | #   namespace        Namespace polled with `kubectl get pods`.
 7 | #   timeout_seconds  Maximum number of seconds to wait (default: 300).
 8 | #
 9 | # Exit status: 0 once a pod is listed, 1 on invalid arguments or timeout.
10 | 
11 | NAMESPACE=$1
12 | TIMEOUT=${2:-300}
13 | POLL_INTERVAL=30
14 | START_TIME=$(date +%s)
15 | 
16 | # Check if namespace is provided
17 | if [[ -z "$NAMESPACE" ]]; then
18 |   echo "Usage: $0 <namespace> [timeout_seconds]" >&2
19 |   exit 1
20 | fi
21 | 
22 | # An empty or non-numeric timeout would be evaluated as 0 (or fail) in the
23 | # -ge comparison below, so reject it up front.
24 | if ! [[ "$TIMEOUT" =~ ^[0-9]+$ ]]; then
25 |   echo "Error: timeout must be a non-negative integer number of seconds (got '$TIMEOUT')." >&2
26 |   exit 1
27 | fi
28 | # Force base 10 so values with leading zeros (e.g. 060) are not parsed as octal.
29 | TIMEOUT=$((10#$TIMEOUT))
30 | 
31 | echo "Waiting for any pod to exist in the namespace '$NAMESPACE' (timeout: ${TIMEOUT}s)..."
32 | 
33 | # Loop until a pod exists in the namespace or timeout occurs
34 | while true; do
35 |   # kubectl errors are silenced, so a failing call counts as "no pods yet".
36 |   POD_COUNT=$(kubectl get pods -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l)
37 | 
38 |   if [[ "$POD_COUNT" -gt 0 ]]; then
39 |     echo "Pod(s) found in the namespace '$NAMESPACE'."
40 |     break
41 |   fi
42 | 
43 |   CURRENT_TIME=$(date +%s)
44 |   ELAPSED_TIME=$((CURRENT_TIME - START_TIME))
45 | 
46 |   if [[ "$ELAPSED_TIME" -ge "$TIMEOUT" ]]; then
47 |     echo "Timeout reached after ${TIMEOUT} seconds. No pods found in the namespace '$NAMESPACE'."
48 |     exit 1
49 |   fi
50 | 
51 |   # Never sleep past the deadline: cap the pause at the time remaining.
52 |   REMAINING=$((TIMEOUT - ELAPSED_TIME))
53 |   SLEEP_FOR=$((REMAINING < POLL_INTERVAL ? REMAINING : POLL_INTERVAL))
54 | 
55 |   echo "No pods found yet in the namespace '$NAMESPACE'. Checking again in ${SLEEP_FOR} seconds..."
56 |   sleep "$SLEEP_FOR"
57 | done
58 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/continuous-image-puller/capabilities.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "pause"
 6 |     },
 7 |     "message": "container \"pause\" in DaemonSet \"continuous-image-puller\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details",
 8 |     "policyName": "capabilities",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "DaemonSet",
12 |       "name": "continuous-image-puller",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/continuous-image-puller/readonlyrootfs.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "pause"
 6 |     },
 7 |     "message": "container \"pause\" in DaemonSet \"continuous-image-puller\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.",
 8 |     "policyName": "readonlyrootfs",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "DaemonSet",
12 |       "name": "continuous-image-puller",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/continuous-image-puller/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in DaemonSet \"continuous-image-puller\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "DaemonSet",
 8 |       "name": "continuous-image-puller",
 9 |       "namespace": ".*",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/capabilities.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub"
 6 |     },
 7 |     "message": "container \"hub\" in Deployment \"hub\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details",
 8 |     "policyName": "capabilities",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "hub",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/distroless.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202"
 7 |     },
 8 |     "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" built from non-distroless base image \"Debian GNU/Linux 11 (bullseye)\". See: go/gke-distroless for more details",
 9 |     "policyName": "distroless",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "hub",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/imagedigest.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202"
 7 |     },
 8 |     "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e",
 9 |     "policyName": "imagedigest",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "hub",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/imagefreshness.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202"
 7 |     },
 8 |     "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" that does not have a valid digest.",
 9 |     "policyName": "imagefreshness",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "hub",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/imagepath.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202"
 7 |     },
 8 |     "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" with an invalid path. See go/gke-shipshape#imagepath for valid image paths.",
 9 |     "policyName": "imagepath",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "hub",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/readonlyrootfs.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "hub"
 6 |     },
 7 |     "message": "container \"hub\" in Deployment \"hub\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.",
 8 |     "policyName": "readonlyrootfs",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "hub",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/hub/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in Deployment \"hub\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "Deployment",
 8 |       "name": "hub",
 9 |       "namespace": ".*",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/mistral-7b-instruct/allowprivilegeescalation.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "mistral-7b-instruct"
 6 |     },
 7 |     "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not set allowPrivilegeEscalation: false in its securityContext. See go/gke-shipshape#allowprivilegeescalation for more details",
 8 |     "policyName": "allowprivilegeescalation",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "mistral-7b-instruct",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/mistral-7b-instruct/capabilities.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "mistral-7b-instruct"
 6 |     },
 7 |     "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details",
 8 |     "policyName": "capabilities",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "mistral-7b-instruct",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/mistral-7b-instruct/readonlyrootfs.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "mistral-7b-instruct"
 6 |     },
 7 |     "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.",
 8 |     "policyName": "readonlyrootfs",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "mistral-7b-instruct",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/mistral-7b-instruct/rootless.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "mistral-7b-instruct"
 6 |     },
 7 |     "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" is running as root. Update the container to run as non-root. See go/gke-shipshape#rootless for more details",
 8 |     "policyName": "rootless",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "mistral-7b-instruct",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/mistral-7b-instruct/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in Deployment \"mistral-7b-instruct\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "Deployment",
 8 |       "name": "mistral-7b-instruct",
 9 |       "namespace": ".*",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/capabilities.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp"
 6 |     },
 7 |     "message": "container \"chp\" in Deployment \"proxy\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details",
 8 |     "policyName": "capabilities",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "proxy",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ]


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/distroless.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp",
 6 |       "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1"
 7 |     },
 8 |     "message": "image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" could not be found on gcr.io",
 9 |     "policyName": "distroless",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "proxy",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/imagedigest.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp",
 6 |       "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1"
 7 |     },
 8 |     "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e",
 9 |     "policyName": "imagedigest",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "proxy",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/imagefreshness.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp",
 6 |       "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1"
 7 |     },
 8 |     "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" that does not have a valid digest.",
 9 |     "policyName": "imagefreshness",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "proxy",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/imagepath.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1",
 6 |       "containerName": "chp"
 7 |     },
 8 |     "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with an invalid path. See go/gke-shipshape#imagepath for valid image paths.",
 9 |     "policyName": "imagepath",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "proxy",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/readonlyrootfs.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp"
 6 |     },
 7 |     "message": "container \"chp\" in Deployment \"proxy\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.",
 8 |     "policyName": "readonlyrootfs",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "proxy",
13 |       "namespace": ".*",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/sbom.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "chp",
 6 |       "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1"
 7 |     },
 8 |     "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with no digest specified. Unable to find digest from registry.",
 9 |     "policyName": "sbom",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "proxy",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/proxy/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in Deployment \"proxy\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "Deployment",
 8 |       "name": "proxy",
 9 |       "namespace": ".*",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/rag-frontend/distroless.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "rag-frontend",
 6 |       "image": "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/frontend@sha256:2b14a3a95f433cc394087ba0d6376d160d8080b62f485f1a119c52b8a6119368"
 7 |     },
 8 |     "message": "container \"rag-frontend\" in Deployment \"rag-frontend\" has an image \"us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/frontend@sha256:2b14a3a95f433cc394087ba0d6376d160d8080b62f485f1a119c52b8a6119368\" built from non-distroless base image \"Debian GNU/Linux 12 (bookworm)\". See: go/gke-distroless for more details",
 9 |     "policyName": "distroless",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "rag-frontend",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/rag-frontend/imagedigest.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "image": "gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0",
 6 |       "containerName": "cloud-sql-proxy"
 7 |     },
 8 |     "message": "container \"cloud-sql-proxy\" in Deployment \"rag-frontend\" has an image \"gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e",
 9 |     "policyName": "imagedigest",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "rag-frontend",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/rag-frontend/imagefreshness.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "cloud-sql-proxy",
 6 |       "image": "gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0"
 7 |     },
 8 |     "message": "container \"cloud-sql-proxy\" in Deployment \"rag-frontend\" has an image \"gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0\" that does not have a valid digest.",
 9 |     "policyName": "imagefreshness",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "rag-frontend",
14 |       "namespace": ".*",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/cluster/rag-frontend/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in Deployment \"rag-frontend\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "Deployment",
 8 |       "name": "rag-frontend",
 9 |       "namespace": ".*",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ] 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/iap/defaultnamespace.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "Ingress \"iap-ingress\" is in the default namespace, which is not allowed.",
 4 |     "policyName": "defaultnamespace",
 5 |     "resourceKey": {
 6 |       "group": "networking.k8s.io",
 7 |       "kind": "Ingress",
 8 |       "name": "iap-ingress",
 9 |       "version": "v1"
10 |     }
11 |   },
12 |   {
13 |     "message": "Secret \"iap-secret\" is in the default namespace, which is not allowed.",
14 |     "policyName": "defaultnamespace",
15 |     "resourceKey": {
16 |       "kind": "Secret",
17 |       "name": "iap-secret",
18 |       "version": "v1"
19 |     }
20 |   }
21 | ]
22 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook"
 6 |     },
 7 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set allowPrivilegeEscalation: false in its securityContext. See go/gke-shipshape#allowprivilegeescalation for more details",
 8 |     "policyName": "allowprivilegeescalation",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "kuberay-tpu-webhook",
13 |       "namespace": "ray-system",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ]
18 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook"
 6 |     },
 7 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details",
 8 |     "policyName": "capabilities",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "kuberay-tpu-webhook",
13 |       "namespace": "ray-system",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ]
18 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0"
 7 |     },
 8 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e",
 9 |     "policyName": "imagedigest",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "kuberay-tpu-webhook",
14 |       "namespace": "ray-system",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook",
 6 |       "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0"
 7 |     },
 8 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" that does not have a valid digest.",
 9 |     "policyName": "imagefreshness",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "kuberay-tpu-webhook",
14 |       "namespace": "ray-system",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0",
 6 |       "containerName": "kuberay-tpu-webhook"
 7 |     },
 8 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with an invalid path. See go/gke-shipshape#imagepath for valid image paths.",
 9 |     "policyName": "imagepath",
10 |     "resourceKey": {
11 |       "group": "apps",
12 |       "kind": "Deployment",
13 |       "name": "kuberay-tpu-webhook",
14 |       "namespace": "ray-system",
15 |       "version": "v1"
16 |     }
17 |   }
18 | ]
19 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook"
 6 |     },
 7 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.",
 8 |     "policyName": "readonlyrootfs",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "kuberay-tpu-webhook",
13 |       "namespace": "ray-system",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ]
18 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "details": {
 4 |       "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails",
 5 |       "containerName": "kuberay-tpu-webhook"
 6 |     },
 7 |     "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" is running as root. Update the container to run as non-root. See go/gke-shipshape#rootless for more details",
 8 |     "policyName": "rootless",
 9 |     "resourceKey": {
10 |       "group": "apps",
11 |       "kind": "Deployment",
12 |       "name": "kuberay-tpu-webhook",
13 |       "namespace": "ray-system",
14 |       "version": "v1"
15 |     }
16 |   }
17 | ]
18 | 


--------------------------------------------------------------------------------
/security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "message": "pod in Deployment \"kuberay-tpu-webhook\" must set securityContext.seccompProfile.type to value RuntimeDefault",
 4 |     "policyName": "seccompprofile",
 5 |     "resourceKey": {
 6 |       "group": "apps",
 7 |       "kind": "Deployment",
 8 |       "name": "kuberay-tpu-webhook",
 9 |       "namespace": "ray-system",
10 |       "version": "v1"
11 |     }
12 |   }
13 | ]
14 | 


--------------------------------------------------------------------------------
/slurm-on-gke/README.md:
--------------------------------------------------------------------------------
1 | # Slurm on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the Slurm on GKE example have been moved to the [AI-on-GKE/slurm-on-gke](https://github.com/ai-on-gke/slurm-on-gke) repository. For more information, please refer to the [Slurm on GKE](https://gke-ai-labs.dev/docs/blueprints/slurm-on-gke/) blueprint documentation.


--------------------------------------------------------------------------------
/tools/dcgm-on-gke/README.md:
--------------------------------------------------------------------------------
1 | # DCGM on GKE
2 | 
3 | >[!WARNING]
4 | >This tool (DCGM on GKE) is now deprecated and is no longer being maintained.
5 | >
6 | >The files for this tool have been removed from this repository and will not be migrated to the new AI-on-GKE GitHub organization.
7 | 


--------------------------------------------------------------------------------
/tools/gke-disk-image-builder/README.md:
--------------------------------------------------------------------------------
1 | # GKE Disk Image Builder
2 | 
3 | >[!WARNING]
4 | >The files for the GKE Disk Image Builder have been moved to the [AI-on-GKE/tools](https://github.com/ai-on-gke/tools/tree/main/gke-disk-image-builder) repository.
5 | 


--------------------------------------------------------------------------------
/tools/saxml-on-gke/README.md:
--------------------------------------------------------------------------------
1 | # SaxML on GKE
2 | 
3 | >[!WARNING]
4 | >This tool (SaxML on GKE) is now deprecated and is no longer being maintained.
5 | >
6 | >The files for this tool have been removed from this repository and will not be migrated to the new AI-on-GKE GitHub organization.
7 | 


--------------------------------------------------------------------------------
/tpu-provisioner/.dockerignore:
--------------------------------------------------------------------------------
1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
2 | # Ignore build and test binaries.
3 | bin/
4 | testbin/
5 | 


--------------------------------------------------------------------------------
/tpu-provisioner/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Binaries for programs and plugins
 3 | *.exe
 4 | *.exe~
 5 | *.dll
 6 | *.so
 7 | *.dylib
 8 | bin
 9 | testbin/*
10 | Dockerfile.cross
11 | 
12 | # Test binary, build with `go test -c`
13 | *.test
14 | 
15 | # Output of the go coverage tool, specifically when used with LiteIDE
16 | *.out
17 | 
18 | # Kubernetes Generated files - skip generated files, except for vendored files
19 | 
20 | !vendor/**/zz_generated.*
21 | 
22 | # editor and IDE paraphernalia
23 | .idea
24 | *.swp
25 | *.swo
26 | *~
27 | 


--------------------------------------------------------------------------------
/tpu-provisioner/PROJECT:
--------------------------------------------------------------------------------
 1 | # Code generated by tool. DO NOT EDIT.
 2 | # This file is used to track the info used to scaffold your project
 3 | # and allow the plugins to work properly.
 4 | # More info: https://book.kubebuilder.io/reference/project-config.html
 5 | domain: google.com
 6 | layout:
 7 | - go.kubebuilder.io/v4-alpha
 8 | projectName: tpu-provisioner
 9 | repo: github.com/GoogleCloudPlatform/ai-on-gke/tpu-provisioner
10 | resources: []
11 | version: "3"
12 | 


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/.gitignore:
--------------------------------------------------------------------------------
1 | # don't add certificates
2 | certificates/*.crt
3 | certificates/*.key
4 | 
5 | __pycache__/
6 | .pytest_cache/


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim-buster
2 | WORKDIR /webhook
3 | COPY requirements.txt /webhook
4 | COPY admission_controller.py /webhook
5 | RUN pip install --no-cache-dir --upgrade -r /webhook/requirements.txt
6 | CMD ["uvicorn", "admission_controller:app", "--host", "0.0.0.0", "--port", "5000","--ssl-keyfile=/certs/tls.key", "--ssl-certfile=/certs/tls.crt"]
7 | 


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/__init__.py


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/certificates/README.md:
--------------------------------------------------------------------------------
1 | Two files are required in this directory:
2 | 
3 | 1. `certificate.crt`
4 | 2. `private.key`
5 | 
6 | 
7 | These are used to configure TLS for network communication to/from the webhook.


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/requirements.txt


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/skaffold.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: skaffold/v4beta11
 2 | kind: Config
 3 | metadata:
 4 |   name: admission-controller
 5 | build:
 6 |   local: {}
 7 |   artifacts:
 8 |     - image: example.com/tpu-provisioner/admission-controller
 9 |       docker:
10 |         dockerfile: Dockerfile
11 | manifests:
12 |   rawYaml:
13 |     - manifests/manifest.yaml
14 | 


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/test/__init__.py


--------------------------------------------------------------------------------
/tpu-provisioner/admission_controller/test/e2e/manifests/test-nonjobset-job.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: test-nonjobset-job
 5 | spec:
 6 |   template:
 7 |     spec:
 8 |       containers:
 9 |       - name: sleeper
10 |         image: ubuntu
11 |         command: ["sleep",  "10000"]
12 |       restartPolicy: Never
13 |   backoffLimit: 0


--------------------------------------------------------------------------------
/tpu-provisioner/cloudbuild.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | steps:
16 |   - id: 'tpu provisioner tests'
17 |     name: 'golang:1.23'
18 |     dir: /workspace/tpu-provisioner
19 |     entrypoint: 'bash'
20 |     args:
21 |       - '-c'
22 |       - |
23 |         set -e
24 |         make test
25 |     allowFailure: false
26 |         
27 | options:
28 |   substitutionOption: 'ALLOW_LOOSE'
29 |   machineType: 'E2_HIGHCPU_8'
30 | timeout: 600s
31 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/default/manager_config_patch.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: apps/v1
 2 | kind: Deployment
 3 | metadata:
 4 |   name: controller-manager
 5 |   namespace: system
 6 | spec:
 7 |   template:
 8 |     spec:
 9 |       containers:
10 |         - name: manager
11 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/manager/configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 |   name: manager
5 | data: {}
6 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/manager/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 |   - manager.yaml
3 |   - configmap.yaml
4 | apiVersion: kustomize.config.k8s.io/v1beta1
5 | kind: Kustomization
6 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/prometheus/kustomization.yaml:
--------------------------------------------------------------------------------
1 | resources:
2 | - monitor.yaml
3 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/prometheus/monitor.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # Prometheus Monitor Service (Metrics)
 3 | apiVersion: monitoring.coreos.com/v1
 4 | kind: ServiceMonitor
 5 | metadata:
 6 |   labels:
 7 |     control-plane: controller-manager
 8 |     app.kubernetes.io/name: servicemonitor
 9 |     app.kubernetes.io/instance: controller-manager-metrics-monitor
10 |     app.kubernetes.io/component: metrics
11 |     app.kubernetes.io/created-by: tpu-provisioner
12 |     app.kubernetes.io/part-of: tpu-provisioner
13 |     app.kubernetes.io/managed-by: kustomize
14 |   name: controller-manager-metrics-monitor
15 |   namespace: system
16 | spec:
17 |   endpoints:
18 |     - path: /metrics
19 |       port: https
20 |       scheme: https
21 |       bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
22 |       tlsConfig:
23 |         insecureSkipVerify: true
24 |   selector:
25 |     matchLabels:
26 |       control-plane: controller-manager
27 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/auth_proxy_client_clusterrole.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: clusterrole
 6 |     app.kubernetes.io/instance: metrics-reader
 7 |     app.kubernetes.io/component: kube-rbac-proxy
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: metrics-reader
12 | rules:
13 | - nonResourceURLs:
14 |   - "/metrics"
15 |   verbs:
16 |   - get
17 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/auth_proxy_role.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRole
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: clusterrole
 6 |     app.kubernetes.io/instance: proxy-role
 7 |     app.kubernetes.io/component: kube-rbac-proxy
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: proxy-role
12 | rules:
13 | - apiGroups:
14 |   - authentication.k8s.io
15 |   resources:
16 |   - tokenreviews
17 |   verbs:
18 |   - create
19 | - apiGroups:
20 |   - authorization.k8s.io
21 |   resources:
22 |   - subjectaccessreviews
23 |   verbs:
24 |   - create
25 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/auth_proxy_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: clusterrolebinding
 6 |     app.kubernetes.io/instance: proxy-rolebinding
 7 |     app.kubernetes.io/component: kube-rbac-proxy
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: proxy-rolebinding
12 | roleRef:
13 |   apiGroup: rbac.authorization.k8s.io
14 |   kind: ClusterRole
15 |   name: proxy-role
16 | subjects:
17 | - kind: ServiceAccount
18 |   name: controller-manager
19 |   namespace: system
20 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/auth_proxy_service.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   labels:
 5 |     control-plane: controller-manager
 6 |     app.kubernetes.io/name: service
 7 |     app.kubernetes.io/instance: controller-manager-metrics-service
 8 |     app.kubernetes.io/component: kube-rbac-proxy
 9 |     app.kubernetes.io/created-by: tpu-provisioner
10 |     app.kubernetes.io/part-of: tpu-provisioner
11 |     app.kubernetes.io/managed-by: kustomize
12 |   name: controller-manager-metrics-service
13 |   namespace: system
14 | spec:
15 |   ports:
16 |   - name: https
17 |     port: 8443
18 |     protocol: TCP
19 |     targetPort: https
20 |   selector:
21 |     control-plane: controller-manager
22 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/kustomization.yaml:
--------------------------------------------------------------------------------
 1 | resources:
 2 |   # All RBAC will be applied under this service account in
 3 |   # the deployment namespace. You may comment out this resource
 4 |   # if your manager will use a service account that exists at
 5 |   # runtime. Be sure to update RoleBinding and ClusterRoleBinding
 6 |   # subjects if changing service account names.
 7 |   - service_account.yaml
 8 |   - role.yaml
 9 |   - role_binding.yaml
10 |   - leader_election_role.yaml
11 |   - leader_election_role_binding.yaml
12 |   # Comment out the following 4 lines if you want to disable
13 |   # the auth proxy (https://github.com/brancz/kube-rbac-proxy)
14 |   # which protects your /metrics endpoint.
15 |   - auth_proxy_service.yaml
16 |   - auth_proxy_role.yaml
17 |   - auth_proxy_role_binding.yaml
18 |   - auth_proxy_client_clusterrole.yaml
19 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/leader_election_role.yaml:
--------------------------------------------------------------------------------
 1 | # permissions to do leader election.
 2 | apiVersion: rbac.authorization.k8s.io/v1
 3 | kind: Role
 4 | metadata:
 5 |   labels:
 6 |     app.kubernetes.io/name: role
 7 |     app.kubernetes.io/instance: leader-election-role
 8 |     app.kubernetes.io/component: rbac
 9 |     app.kubernetes.io/created-by: tpu-provisioner
10 |     app.kubernetes.io/part-of: tpu-provisioner
11 |     app.kubernetes.io/managed-by: kustomize
12 |   name: leader-election-role
13 | rules:
14 | - apiGroups:
15 |   - ""
16 |   resources:
17 |   - configmaps
18 |   verbs:
19 |   - get
20 |   - list
21 |   - watch
22 |   - create
23 |   - update
24 |   - patch
25 |   - delete
26 | - apiGroups:
27 |   - coordination.k8s.io
28 |   resources:
29 |   - leases
30 |   verbs:
31 |   - get
32 |   - list
33 |   - watch
34 |   - create
35 |   - update
36 |   - patch
37 |   - delete
38 | - apiGroups:
39 |   - ""
40 |   resources:
41 |   - events
42 |   verbs:
43 |   - create
44 |   - patch
45 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/leader_election_role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: RoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: rolebinding
 6 |     app.kubernetes.io/instance: leader-election-rolebinding
 7 |     app.kubernetes.io/component: rbac
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: leader-election-rolebinding
12 | roleRef:
13 |   apiGroup: rbac.authorization.k8s.io
14 |   kind: Role
15 |   name: leader-election-role
16 | subjects:
17 | - kind: ServiceAccount
18 |   name: controller-manager
19 |   namespace: system
20 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/role_binding.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: rbac.authorization.k8s.io/v1
 2 | kind: ClusterRoleBinding
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: clusterrolebinding
 6 |     app.kubernetes.io/instance: manager-rolebinding
 7 |     app.kubernetes.io/component: rbac
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: manager-rolebinding
12 | roleRef:
13 |   apiGroup: rbac.authorization.k8s.io
14 |   kind: ClusterRole
15 |   name: manager-role
16 | subjects:
17 | - kind: ServiceAccount
18 |   name: controller-manager
19 |   namespace: system
20 | 


--------------------------------------------------------------------------------
/tpu-provisioner/config/rbac/service_account.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: ServiceAccount
 3 | metadata:
 4 |   labels:
 5 |     app.kubernetes.io/name: serviceaccount
 6 |     app.kubernetes.io/instance: controller-manager-sa
 7 |     app.kubernetes.io/component: rbac
 8 |     app.kubernetes.io/created-by: tpu-provisioner
 9 |     app.kubernetes.io/part-of: tpu-provisioner
10 |     app.kubernetes.io/managed-by: kustomize
11 |   name: controller-manager
12 |   namespace: system
13 | 


--------------------------------------------------------------------------------
/tpu-provisioner/docs/cleanup.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/docs/cleanup.excalidraw.png


--------------------------------------------------------------------------------
/tpu-provisioner/docs/provisioning.excalidraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/docs/provisioning.excalidraw.png


--------------------------------------------------------------------------------
/tpu-provisioner/internal/auth/gcp/README.md:
--------------------------------------------------------------------------------
1 | # GCP Auth (v1.26+)
2 | 
3 | See: https://github.com/kubernetes/cloud-provider-gcp/tree/master/pkg/clientauthplugin


--------------------------------------------------------------------------------
/tpu-provisioner/internal/cloud/mock.go:
--------------------------------------------------------------------------------
 1 | package cloud
 2 | 
 3 | import (
 4 | 	corev1 "k8s.io/api/core/v1"
 5 | 	"sigs.k8s.io/controller-runtime/pkg/client"
 6 | )
 7 | 
 8 | var _ Provider = &Mock{}
 9 | 
10 | // Mock is a Provider whose methods perform no work. It is intended for local
11 | // development and debugging, where it helps to understand what the controller
12 | // would do without anything actually being done.
13 | type Mock struct{}
14 | 
15 | // NodePoolLabelKey returns the fixed label key used by the mock.
16 | //
17 | // TODO: Find a better mock node pool label key.
18 | func (*Mock) NodePoolLabelKey() string {
19 | 	return "kubernetes.io/os"
20 | }
21 | 
22 | // EnsureNodePoolForPod ignores its arguments and always returns nil.
23 | func (*Mock) EnsureNodePoolForPod(*corev1.Pod, string) error {
24 | 	return nil
25 | }
26 | 
27 | // DeleteNodePoolForNode ignores its arguments and always returns nil.
28 | func (*Mock) DeleteNodePoolForNode(*corev1.Node, string) error {
29 | 	return nil
30 | }
31 | 
32 | // DeleteNodePool ignores its arguments and always returns nil.
33 | func (*Mock) DeleteNodePool(string, client.Object, string) error {
34 | 	return nil
35 | }
36 | 
37 | // ListNodePools always returns a nil slice and a nil error.
38 | func (*Mock) ListNodePools() ([]NodePoolRef, error) {
39 | 	return nil, nil
40 | }
41 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/flyte/README.md:
--------------------------------------------------------------------------------
1 | # Running Flyte on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the Flyte in GKE cluster Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Flyte in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/flyte/).


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/README.md:
--------------------------------------------------------------------------------
1 | # E2E GenAI application with Langchain, Ray, Flask API backend, React frontend
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.
7 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/backend_ip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/backend_ip.png


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_app.png


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_ip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_ip.png


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/open_jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/open_jupyter.png


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/backend/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10-slim
 2 | 
 3 | # Relative paths in the instructions below resolve against /app
 4 | WORKDIR /app
 5 | 
 6 | # Copy only the dependency list first so the install layer below can be cached
 7 | COPY requirements.txt requirements.txt
 8 | 
 9 | # Install dependencies; --require-hashes makes pip reject any requirement that has no hash
10 | RUN pip install --require-hashes --no-cache-dir -r requirements.txt
11 | 
12 | # Copy the rest of the build context into /app
13 | COPY . .
14 | 
15 | # Declare port 5000 for Flask (EXPOSE is documentation only; it does not publish the port)
16 | EXPOSE 5000
17 | 
18 | # Run main.py
19 | CMD ["python", "main.py"]
20 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/backend/requirements.in:
--------------------------------------------------------------------------------
1 | ray==2.43.0
2 | ray[serve] 
3 | requests 
4 | transformers 
5 | langchain
6 | torch
7 | flask
8 | Flask-CORS
9 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/.gitignore:
--------------------------------------------------------------------------------
 1 | # Dependencies
 2 | /node_modules
 3 | 
 4 | # Production build output
 5 | /dist
 6 | 
 7 | # IDEs and editors
 8 | /.idea
 9 | .vscode/
10 | *.swp
11 | *.swo
12 | 
13 | # OS generated
14 | .DS_Store
15 | Thumbs.db
16 | 
17 | # TypeScript
18 | *.tsbuildinfo
19 | 
20 | # Log files
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | 
25 | # Temporary files
26 | *.tmp
27 | *.tmp.json
28 | 
29 | # Debug logs from ESLint, stylelint etc.
30 | *.log
31 | 
32 | # Environment variables
33 | .env
34 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Node.js 16 base image. NOTE(review): Node.js 16 is end-of-life; consider a supported LTS release.
 2 | FROM node:16-slim
 3 | 
 4 | # Relative paths in the instructions below resolve against /app
 5 | WORKDIR /app
 6 | 
 7 | # Copy the whole build context, then install dependencies, run the build script, and add the "serve" package
 8 | COPY . ./
 9 | RUN npm install && \
10 |     npm run build && \
11 |     npm install serve && \
12 |     npm cache clean --force
13 | 
14 | # Serve the "dist" directory in single-page mode (-s), listening on port 3000 (-l)
15 | CMD ["npx", "serve", "-s", "dist", "-l", "3000"]
16 | 
17 | # Declare the port used by the CMD above (documentation only; it does not publish the port)
18 | EXPOSE 3000
19 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "frontend",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1",
 8 |     "start": "webpack serve --open",
 9 |     "build": "webpack"
10 |   },
11 |   "keywords": [],
12 |   "devDependencies": {
13 |     "@types/faker": "^5.5.3",
14 |     "@types/react": "^18.2.23",
15 |     "@types/react-dom": "^18.2.8",
16 |     "css-loader": "^6.8.1",
17 |     "html-webpack-plugin": "^5.5.3",
18 |     "postcss-loader": "^7.3.3",
19 |     "react": "^18.2.0",
20 |     "react-dom": "^18.2.0",
21 |     "style-loader": "^3.3.3",
22 |     "ts-loader": "^9.4.4",
23 |     "typescript": "^5.2.2",
24 |     "webpack": "^5.88.2",
25 |     "webpack-cli": "^5.1.4",
26 |     "webpack-dev-server": "^4.15.1"
27 |   },
28 |   "dependencies": {
29 |     "bootstrap": "^5.3.2",
30 |     "faker": "^5.5.3",
31 |     "reactstrap": "^9.2.0"
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/src/index.html:
--------------------------------------------------------------------------------
 1 | <!-- Copyright 2023 Google LLC
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |      http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License. -->
14 | 
15 | <!DOCTYPE html>
16 | <html lang="en">
17 | <head>
18 |     <meta charset="UTF-8">
19 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
20 |     <title>React App</title>
21 | </head>
22 | <body>
23 |     <div id="root"></div>
24 |     <!-- The bundled JS will be auto-injected here by html-webpack-plugin -->
25 | </body>
26 | </html>
27 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "compilerOptions": {
 3 |         "outDir": "./dist/",
 4 |         "sourceMap": true,
 5 |         "noImplicitAny": true,
 6 |         "module": "commonjs",
 7 |         "target": "es6",
 8 |         "jsx": "react",
 9 |         "esModuleInterop": true,
10 |         "allowSyntheticDefaultImports": true,
11 |         "moduleResolution": "node",
12 |         "typeRoots": ["./node_modules/@types", "./types"]
13 |     },
14 |     "include": [
15 |         "./src/**/*"
16 |     ]
17 | }


--------------------------------------------------------------------------------
/tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/README.md:
--------------------------------------------------------------------------------
1 | # Finetuning Gemma 3-1B-it on L4
2 | 
3 | >[!WARNING]
4 | >The files for the Finetuning Gemma 3-1B-it on L4 guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Finetuning Gemma 3-1B-it on L4](https://gke-ai-labs.dev/docs/tutorials/finetuning-gemma-3-1b-it-on-l4/).


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/a100-jax/README.md:
--------------------------------------------------------------------------------
1 | # JAX 'Hello World' on GKE + A100-80GB
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/online-serving-single-gpu/README.md:
--------------------------------------------------------------------------------
1 | # Serve a model with a GPU on GKE Autopilot
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/README.md:
--------------------------------------------------------------------------------
1 | # Train a model with GPUs on GKE Standard mode
2 | 
3 | Please follow the Quick Start at https://cloud.google.com/kubernetes-engine/docs/quickstarts/train-model-gpus-standard
4 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/0.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/1.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/2.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/3.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/4.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/5.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/6.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/7.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/8.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/9.png


--------------------------------------------------------------------------------
/tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow-datasets


--------------------------------------------------------------------------------
/tutorials-and-examples/hf-tgi/README.md:
--------------------------------------------------------------------------------
1 | # Hugging Face Text Generation Inference (TGI)
2 | 
3 | >[!WARNING]
4 | >The files for the Hugging Face TGI example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Hugging Face TGI tutorial](https://gke-ai-labs.dev/docs/tutorials/hf-tgi/).


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/checkpoints/README.md:
--------------------------------------------------------------------------------
1 | # Creating Inference Checkpoints
2 | 
3 | >[!WARNING]
4 | >The files for the Creating Inference Checkpoints on GKE example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Creating Inference Checkpoints](https://gke-ai-labs.dev/docs/tutorials/inference-servers/checkpoints/).


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/README.md:
--------------------------------------------------------------------------------
1 | # Serve an LLM using TPUs on GKE with JetStream
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained. Methods for deploying LLMs on GKE with TPUs may have changed.
5 | >
6 | >Please refer to the **official Google Cloud documentation** for the latest practices:
7 | >[Serve LLMs on GKE with TPUs using JetStream and PyTorch](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-llm-tpu-jetstream-pytorch)   
8 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/http-server/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Image for the JetStream HTTP server (http_server.py served by uvicorn).
 2 | # Use Ubuntu 22.04 from Docker Hub.
 3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04
 4 | FROM ubuntu:22.04
 5 | 
 6 | ENV DEBIAN_FRONTEND=noninteractive
 7 | # Git ref of google/JetStream that is checked out and installed below.
 8 | ENV JETSTREAM_VERSION=v0.2.2
 9 | 
10 | # apt-get is used instead of apt because apt does not offer a stable interface
11 | # for scripts. The package lists are removed in the same layer so they do not
12 | # end up in the image.
13 | RUN apt-get -y update && apt-get install -y --no-install-recommends \
14 |     ca-certificates \
15 |     git \
16 |     python3.10 \
17 |     python3-pip && \
18 |     rm -rf /var/lib/apt/lists/*
19 | 
20 | # Register python3.10 as an alternative for /usr/bin/python3.
21 | RUN update-alternatives --install \
22 |     /usr/bin/python3 python3 /usr/bin/python3.10 1
23 | 
24 | # No WORKDIR is set yet, so the clone lands in /JetStream. Install it in
25 | # editable mode from the pinned ref.
26 | RUN git clone https://github.com/google/JetStream.git && \
27 |     cd /JetStream && \
28 |     git checkout ${JETSTREAM_VERSION} && \
29 |     pip3 install --no-cache-dir -e .
30 | 
31 | # uvicorn runs the app (see CMD); fastapi and pydantic are installed with it
32 | # in a single layer.
33 | RUN pip3 install --no-cache-dir uvicorn fastapi pydantic
34 | ENV PYTHONDONTWRITEBYTECODE=1
35 | 
36 | COPY http_server.py /httpserver/
37 | WORKDIR /httpserver
38 | 
39 | CMD ["uvicorn", "http_server:app", "--host=0.0.0.0", "--port=8000"]


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/maxengine-server/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Image for the MaxText MaxEngine server.
 2 | # Use Ubuntu 22.04 from Docker Hub.
 3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04
 4 | FROM ubuntu:22.04
 5 | 
 6 | ENV DEBIAN_FRONTEND=noninteractive
 7 | # Git ref of google/maxtext that is checked out below.
 8 | ENV MAXTEXT_VERSION=jetstream-v0.2.2
 9 | 
10 | RUN apt -y update && apt install -y --no-install-recommends \
11 |     ca-certificates \
12 |     git \
13 |     python3.10 \
14 |     python3-pip
15 | 
16 | # Register python3.10 as an alternative for /usr/bin/python3.
17 | RUN update-alternatives --install \
18 |     /usr/bin/python3 python3 /usr/bin/python3.10 1
19 | 
20 | # No WORKDIR is set, so the clone lands in /maxtext (the entrypoint script cd's there).
21 | RUN git clone https://github.com/google/maxtext.git
22 | 
23 | # Pin the checkout to MAXTEXT_VERSION and run the repository's own setup script.
24 | RUN cd maxtext/ && \
25 | git checkout ${MAXTEXT_VERSION} && \
26 | bash setup.sh
27 | 
28 | COPY maxengine_server_entrypoint.sh /usr/bin/
29 | 
30 | # Make sure the entrypoint is executable regardless of its mode in the build context.
31 | RUN chmod +x /usr/bin/maxengine_server_entrypoint.sh
32 | 
33 | ENTRYPOINT ["/usr/bin/maxengine_server_entrypoint.sh"]
34 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/maxengine-server/maxengine_server_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Entrypoint: runs MaxText/maxengine_server.py with MaxText/configs/base.yml,
3 | # forwarding all container arguments to it.
4 | 
5 | # Abort if the MaxText checkout is missing rather than run from the wrong directory.
6 | cd /maxtext || exit 1
7 | # Quoted "$@" keeps arguments with spaces or glob characters intact; exec replaces the shell with the server process.
8 | exec python3 MaxText/maxengine_server.py \
9 | MaxText/configs/base.yml "$@"


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/checkpoint-job.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: data-loader-7b
 5 | spec:
 6 |   ttlSecondsAfterFinished: 30
 7 |   template:
 8 |     spec:
 9 |       restartPolicy: Never
10 |       containers:
11 |       - name: inference-checkpoint
12 |         image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.2
13 |         args:
14 |         - -b=BUCKET_NAME
15 |         - -m=google/gemma/maxtext/7b-it/2
16 |         volumeMounts:
17 |         - mountPath: "/kaggle/"
18 |           name: kaggle-credentials
19 |           readOnly: true
20 |         resources:
21 |           requests:
22 |             google.com/tpu: 8
23 |           limits:
24 |             google.com/tpu: 8
25 |       nodeSelector:
26 |         cloud.google.com/gke-tpu-topology: 2x4
27 |         cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
28 |       volumes:
29 |       - name: kaggle-credentials
30 |         secret:
31 |           defaultMode: 0400
32 |           secretName: kaggle-secret


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Copyright 2024 Google LLC
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | # Instantiates the shared jetstream-maxtext-deployment module, passing this
18 | # configuration's input variables straight through.
19 | module "maxengine" {
20 |   # count is set on the module, so its single instance is addressed as module.maxengine[0].
21 |   count                         = 1
22 |   source                        = "../../../../../../modules/jetstream-maxtext-deployment"
23 |   cluster_name                  = var.cluster_name
24 |   project_id                    = var.project_id
25 |   maxengine_deployment_settings = var.maxengine_deployment_settings
26 |   hpa_config                    = var.hpa_config
27 | }


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars:
--------------------------------------------------------------------------------
 1 | # Sample variable values for the single-host JetStream MaxText Terraform configuration.
 2 | 
 3 | maxengine_deployment_settings = {
 4 |   metrics = {
 5 |     server = {
 6 |       port            = 9100
 7 |       scrape_interval = 10
 8 |     }
 9 |   }
10 | 
11 |   accelerator_selectors = {
12 |     topology    = "2x4"
13 |     accelerator = "tpu-v5-lite-podslice"
14 |     chip_count  = 8
15 |   }
16 | }
17 | 
18 | # Example: autoscaling on the jetstream_prefill_backlog_size metric. Uncomment the
19 | # block below and change it as desired; experiment to determine optimal values.
20 | 
21 | # hpa_config = {
22 | #   metrics_adapter = "prometheus-adapter"
23 | #   max_replicas    = 5
24 | #   min_replicas    = 1
25 | #   rules = [{
26 | #     target_query         = "jetstream_prefill_backlog_size"
27 | #     average_value_target = 5
28 | #   }]
29 | # }


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Providers this configuration requires. Only sources are declared; no version
16 | # constraints are set here.
17 | terraform {
18 |   required_providers {
19 |     google = {
20 |       source = "hashicorp/google"
21 |     }
22 |     kubernetes = {
23 |       source = "hashicorp/kubernetes"
24 |     }
25 |     # NOTE(review): versions_override.tf in this directory replaces this source with
26 |     # "gavinbunney/kubectl"; confirm whether "hashicorp/kubectl" is intentional here.
27 |     kubectl = {
28 |       source = "hashicorp/kubectl"
29 |     }
30 |     helm = {
31 |       source = "hashicorp/helm"
32 |     }
33 |   }
34 | }


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions_override.tf:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Override file: Terraform merges *_override.tf files over the rest of the
16 | # configuration, so this block replaces the source of the "kubectl" provider
17 | # declared in versions.tf.
18 | terraform {
19 |   required_providers {
20 |     kubectl = {
21 |       source = "gavinbunney/kubectl"
22 |     }
23 |   }
24 | }
25 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Image for the JetStream PyTorch server.
 2 | # Use Ubuntu 22.04 from Docker Hub.
 3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04
 4 | FROM ubuntu:22.04
 5 | 
 6 | ENV DEBIAN_FRONTEND=noninteractive
 7 | # Git ref of google/jetstream-pytorch that is checked out below.
 8 | ENV PYTORCH_JETSTREAM_VERSION=jetstream-v0.2.3
 9 | 
10 | RUN apt -y update && apt install -y --no-install-recommends \
11 |     ca-certificates \
12 |     git \
13 |     python3.10 \
14 |     python3-pip
15 | 
16 | RUN python3 -m pip install --upgrade pip
17 | 
18 | # Register python3.10 as an alternative for /usr/bin/python3.
19 | RUN update-alternatives --install \
20 |     /usr/bin/python3 python3 /usr/bin/python3.10 1
21 | 
22 | # Clone into /jetstream-pytorch (no WORKDIR is set), pin the checkout to
23 | # PYTORCH_JETSTREAM_VERSION and run the repository's own install script.
24 | RUN git clone https://github.com/google/jetstream-pytorch.git && \
25 | cd /jetstream-pytorch && \
26 | git checkout ${PYTORCH_JETSTREAM_VERSION} && \
27 | bash install_everything.sh
28 | 
29 | COPY jetstream_pytorch_server_entrypoint.sh /usr/bin/
30 | 
31 | # Make sure the entrypoint is executable regardless of its mode in the build context.
32 | RUN chmod +x /usr/bin/jetstream_pytorch_server_entrypoint.sh
33 | 
34 | ENTRYPOINT ["/usr/bin/jetstream_pytorch_server_entrypoint.sh"]


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/jetstream_pytorch_server_entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Entrypoint: runs the run_server module from /jetstream-pytorch, forwarding
3 | # all container arguments to it.
4 | 
5 | # Abort if the checkout is missing rather than run from the wrong directory.
6 | cd /jetstream-pytorch || exit 1
7 | # Quoted "$@" keeps arguments with spaces or glob characters intact; exec replaces the shell with the server process.
8 | exec python3 -m run_server "$@"


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/storage.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: PersistentVolume
 3 | metadata:
 4 |   name: checkpoint-pv
 5 | spec:
 6 |   capacity:
 7 |     storage: 100G
 8 |   accessModes:
 9 |     - ReadWriteOnce
10 |   gcePersistentDisk:
11 |     pdName: jetstream-pytorch-ckpt
12 |     fsType: ext4
13 | ---
14 | apiVersion: v1
15 | kind: PersistentVolumeClaim
16 | metadata:
17 |   name: checkpoint-pvc
18 | spec:
19 |   storageClassName: ""
20 |   volumeName: checkpoint-pv
21 |   accessModes:
22 |     - ReadWriteOnce
23 |   resources:
24 |     requests:
25 |       storage: 100G


--------------------------------------------------------------------------------
/tutorials-and-examples/inference-servers/maxdiffusion/README.md:
--------------------------------------------------------------------------------
1 | # High-performance diffusion model inference on GKE and TPU using MaxDiffusion
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained. Methods for deploying diffusion model inference on GKE and TPU using MaxDiffusion may have changed.
5 | >
6 | >Please refer to the **official Google Cloud documentation** for the latest practices:
7 | >[Serve Stable Diffusion XL (SDXL) using TPUs on GKE with MaxDiffusion](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-sdxl-tpu)   


--------------------------------------------------------------------------------
/tutorials-and-examples/kserve/README.md:
--------------------------------------------------------------------------------
1 | # KServe on GKE Autopilot
2 | 
3 | >[!WARNING]
4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [KServe on GKE Autopilot](https://gke-ai-labs.dev/docs/tutorials/inference-servers/kserve/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/langchain-chatbot/README.md:
--------------------------------------------------------------------------------
1 | # Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP
2 | 
3 | >[!WARNING]
4 | >The files for the Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP](https://gke-ai-labs.dev/docs/tutorials/langchain-chatbot/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/llamaindex/rag/README.md:
--------------------------------------------------------------------------------
1 | # LlamaIndex in GKE cluster
2 | 
3 | >[!WARNING]
4 | >The files for the LlamaIndex in GKE cluster guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [LlamaIndex in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/llamaindex/) tutorial.
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/metaflow/README.md:
--------------------------------------------------------------------------------
1 | # Fine-Tuning Gemma 2-9B on GKE using Metaflow and Argo Workflows
2 | 
3 | >[!WARNING]
4 | >The files for the Metaflow in GKE cluster Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Metaflow in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/metaflow/).


--------------------------------------------------------------------------------
/tutorials-and-examples/mlflow/finetune-gemma/README.md:
--------------------------------------------------------------------------------
1 | # Fine-tune gemma-2-9b and track as an experiment in MLFlow
2 | 
3 | >[!WARNING]
4 | >The files for the Fine-tune gemma-2-9b and track as an experiment in MLFlow Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Fine-tune gemma-2-9b and track as an experiment in MLFlow](https://gke-ai-labs.dev/docs/tutorials/mlflow/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/models-as-oci/README.md:
--------------------------------------------------------------------------------
1 | # Package and Deploy from Hugging Face to Artifact Registry and GKE
2 | 
3 | >[!WARNING]
4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Package and Deploy from Hugging Face to Artifact Registry and GKE](https://gke-ai-labs.dev/docs/tutorials/models-as-oci/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/nvidia-bionemo/README.md:
--------------------------------------------------------------------------------
1 | ### Pretraining and Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework 2.0
2 | 
3 | >[!WARNING]
4 | >The files for the Pretraining and Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/bionemo/README.md) repository. For more information, please refer to the [NVIDIA BioNeMo tutorial](https://gke-ai-labs.dev/docs/blueprints/bionemo/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/nvidia-nim/README.md:
--------------------------------------------------------------------------------
1 | # NVIDIA NIM on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the NVIDIA Inference Microservices (NIM) on GKE have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/nim/quickstart) repository. For more information, please refer to the [NVIDIA NIM & Blueprints on GKE tutorial](https://gke-ai-labs.dev/docs/blueprints/nims-on-gke/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/nvidia-nim/blueprints/README.md:
--------------------------------------------------------------------------------
1 | # NVIDIA NIM Blueprints on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the NIM Blueprints on GKE have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/nim/blueprints) repository. For more information, please refer to the [NVIDIA NIM on GKE tutorial](https://gke-ai-labs.dev/docs/blueprints/nims-on-gke/).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/skypilot/README.md:
--------------------------------------------------------------------------------
1 | # GKE cross region capacity chasing with SkyPilot
2 | >[!WARNING]
3 | >The files for the GKE cross region capacity chasing with SkyPilot Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [GKE cross region capacity chasing with SkyPilot](https://gke-ai-labs.dev/docs/tutorials/skypilot/cross-region-capacity-chasing/).


--------------------------------------------------------------------------------
/tutorials-and-examples/skypilot/dws-and-kueue/README.md:
--------------------------------------------------------------------------------
1 | # Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE
2 | 
3 | >[!WARNING]
4 | >The files for the Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE](https://gke-ai-labs.dev/docs/tutorials/skypilot/resource-management-using-kueue/).


--------------------------------------------------------------------------------
/tutorials-and-examples/storage/hyperdisk-ml/README.md:
--------------------------------------------------------------------------------
1 | ## Populate a Hyperdisk ML Disk from Google Cloud Storage
2 | 
3 | >[!WARNING]
4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Populate a Hyperdisk ML Disk from Google Cloud Storage](https://gke-ai-labs.dev/docs/tutorials/hyperdisk-ml/).


--------------------------------------------------------------------------------
/tutorials-and-examples/storage/parallelstore-backup-and-recovery/README.md:
--------------------------------------------------------------------------------
1 | # Data backup and recovery for Parallelstore
2 | 
3 | >[!WARNING]
4 | >The files for the Data backup and recovery for Parallelstore example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Data backup and recovery for Parallelstore tutorial](https://github.com/ai-on-gke/tutorials-and-examples/tree/main/storage/parallelstore-backup-and-recovery).
5 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/storage/parallelstore-backup-and-recovery/parallelstore-sa.yaml:
--------------------------------------------------------------------------------
1 | 
2 | # Service Account that has access to Parallelstore and GCS
3 | apiVersion: v1
4 | kind: ServiceAccount
5 | metadata:
6 |   name: parallelstore-sa
7 |   namespace: default
8 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/jax/bert/loadbalancer.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: Service  # exposes the pods matched by spec.selector below
17 | metadata:
18 |   labels:
19 |     run: tf-bert-service
20 |   name: tf-bert-service
21 | spec:
22 |   ports:
23 |   - name: grpc
24 |     port: 8500  # port exposed by the Service
25 |     protocol: TCP
26 |     targetPort: 8500  # port on the selected pods that receives the traffic
27 |   - name: http
28 |     port: 8501  # port exposed by the Service
29 |     protocol: TCP
30 |     targetPort: 8501  # port on the selected pods that receives the traffic
31 |   selector:
32 |     app: tf-bert-server  # traffic is routed to pods carrying this label
33 |   type: LoadBalancer  # asks the cluster's provider for an external load balancer
34 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/jax/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | numpy
3 | tensorflow
4 | pillow
5 | tensorflow-serving-api
6 | transformers
7 | diffusers
8 | flask


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/jax/stable-diffusion/README.md:
--------------------------------------------------------------------------------
1 | ## Serve (online inference) a model using a single TPU and GKE 
2 | 
3 | To better understand how TPUs work on GKE, please read the doc
4 | [TPUs in GKE introduction](https://cloud.google.com/tpu/docs/tpus-in-gke).
5 | 
6 | This directory contains files for [JAX Model inference and serving](https://cloud.google.com/tpu/docs/tpus-in-gke#jax-model). You can find step-by-step instructions in the quickstart guide.
7 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/jax/stable-diffusion/loadbalancer.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: Service  # exposes the pods matched by spec.selector below
17 | metadata:
18 |   labels:
19 |     run: tf-stable-diffusion-service
20 |   name: tf-stable-diffusion-service
21 | spec:
22 |   ports:
23 |   - name: grpc
24 |     port: 8500  # port exposed by the Service
25 |     protocol: TCP
26 |     targetPort: 8500  # port on the selected pods that receives the traffic
27 |   - name: http
28 |     port: 8501  # port exposed by the Service
29 |     protocol: TCP
30 |     targetPort: 8501  # port on the selected pods that receives the traffic
31 |   selector:
32 |     app: tf-stable-diffusion-server  # traffic is routed to pods carrying this label
33 |   type: LoadBalancer  # asks the cluster's provider for an external load balancer
34 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/pt/densenet161/loadbalancer.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service  # exposes the pods matched by spec.selector below
 3 | metadata:
 4 |   name: densenet161-service
 5 | spec:
 6 |   type: LoadBalancer  # asks the cluster's provider for an external load balancer
 7 |   ports:
 8 |   - name: densenet161-http-inference
 9 |     port: 8080  # port exposed by the Service
10 |     protocol: TCP
11 |     targetPort: 8080  # port on the selected pods that receives the traffic
12 |   - name: densenet161-http-management
13 |     port: 8081  # port exposed by the Service
14 |     protocol: TCP
15 |     targetPort: 8081  # port on the selected pods that receives the traffic
16 |   - name: densenet161-http-metrics
17 |     port: 8082  # port exposed by the Service
18 |     protocol: TCP
19 |     targetPort: 8082  # port on the selected pods that receives the traffic
20 |   selector:
21 |     app: densenet161-server  # traffic is routed to pods carrying this label
22 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/pt/densenet161/requirements.txt:
--------------------------------------------------------------------------------
1 | requests


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/banana.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/banana.jpeg


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/loadbalancer.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service  # exposes the pods matched by spec.selector below
 3 | metadata:
 4 |   name: resnet-service
 5 | spec:
 6 |   type: LoadBalancer  # asks the cluster's provider for an external load balancer
 7 |   ports:
 8 |   - name: resnet-grpc
 9 |     port: 8500  # port exposed by the Service
10 |     protocol: TCP
11 |     targetPort: 8500  # port on the selected pods that receives the traffic
12 |   - name: resnet-http
13 |     port: 8501  # port exposed by the Service
14 |     protocol: TCP
15 |     targetPort: 8501  # port on the selected pods that receives the traffic
16 |   selector:
17 |     app: resnet-server  # traffic is routed to pods carrying this label
18 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | numpy
3 | tensorflow
4 | pillow


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/diffusion/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8
 2 | 
 3 | # Clone the diffusers sources into an explicit, absolute location.
 4 | RUN git clone https://github.com/huggingface/diffusers.git /diffusers
 5 | 
 6 | # Absolute path so the working directory does not depend on the base image's
 7 | # default working directory.
 8 | WORKDIR /diffusers
 9 | 
10 | # Install JAX with TPU support from the libtpu release index.
11 | RUN pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
12 | # Install diffusers itself from the cloned sources.
13 | RUN pip install .
14 | RUN pip install tensorflow clu
15 | # Install (or upgrade to) the requirements of the Flax text-to-image example.
16 | RUN pip install -U -r examples/text_to_image/requirements_flax.txt


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/gpt/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm
 2 | 
 3 | # Refresh the package index and install in a single layer: a separately cached
 4 | # `apt-get update` layer can go stale and make later `apt-get install` steps fail.
 5 | RUN apt-get update -y && \
 6 |     apt-get install -y libomp5 numactl libopenblas-dev && \
 7 |     rm -rf /var/lib/apt/lists/*
 8 | 
 9 | RUN pip3 install mkl mkl-include
10 | RUN pip3 install tf-nightly tb-nightly tbp-nightly
11 | RUN pip3 install numpy
12 | 
13 | # Create a .so.1 symlink that points at libmkl_intel_ilp64.so.2.
14 | RUN ln -s /usr/local/lib/libmkl_intel_ilp64.so.2 /usr/local/lib/libmkl_intel_ilp64.so.1
15 | 
16 | # Replace any existing transformers directory with a fresh clone.
17 | RUN rm -rf transformers
18 | RUN git clone https://github.com/huggingface/transformers.git
19 | 
20 | WORKDIR transformers
21 | 
22 | # Check out a fixed commit, then install transformers in editable mode.
23 | RUN git checkout -q ebdb185befaa821304d461ed6aa20a17e4dc3aa2
24 | RUN pip3 install -e .
25 | RUN pip3 install datasets
26 | RUN pip3 install evaluate
27 | RUN pip3 install scikit-learn
28 | 
29 | # Copy the build context into the current working directory (the transformers checkout).
30 | COPY . .


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/gpt/fsdp_config.json:
--------------------------------------------------------------------------------
1 | {"fsdp_transformer_layer_cls_to_wrap":["GPT2Block", "GPT2MLP", "GPT2Attention"],
2 |  "xla":true,
3 |  "xla_fsdp_settings":{"compute_dtype":"bfloat16",
4 |   "shard_param_on_dim_0":true,
5 |   "pin_layout_in_collective_ops":true
6 |  },
7 |  "xla_fsdp_grad_ckpt":true
8 | }
9 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/gpt/my_config_2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "activation_function": "gelu_new",
 3 |   "architectures": [
 4 |     "GPT2LMHeadModel"
 5 |   ],
 6 |   "attn_pdrop": 0.1,
 7 |   "bos_token_id": 50256,
 8 |   "embd_pdrop": 0.1,
 9 |   "eos_token_id": 50256,
10 |   "initializer_range": 0.02,
11 |   "layer_norm_epsilon": 1e-05,
12 |   "model_type": "gpt2",
13 |   "n_embd": 3072,
14 |   "n_head": 24,
15 |   "n_layer": 18,
16 |   "n_inner": 12288,
17 |   "n_positions": 1024,
18 |   "resid_pdrop": 0.1,
19 |   "summary_activation": null,
20 |   "summary_first_dropout": 0.1,
21 |   "summary_proj_to_labels": true,
22 |   "summary_type": "cls_index",
23 |   "summary_use_proj": true,
24 |   "task_specific_params": {
25 |     "text-generation": {
26 |       "do_sample": true,
27 |       "max_length": 50
28 |     }
29 |   },
30 |   "vocab_size": 50257
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/README.md:
--------------------------------------------------------------------------------
1 | # Deploy TPUs on GKE
2 | 
3 | To deploy TPU workloads on GKE, see the following pages:
4 | 
5 | * [Deploy TPU workloads on GKE Autopilot mode](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus-autopilot)
6 | * [Deploy TPU workloads on GKE Standard mode](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus)
7 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/0.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/1.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/2.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/3.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/4.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/5.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/6.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/7.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/8.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/9.png


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/src/gke-config/standard-tensorflow-bash-v5e.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: test-tensorflow-pod
 5 |   annotations:
 6 |     gke-gcsfuse/volumes: "true"  # NOTE(review): presumably opts the pod into the GCS FUSE CSI sidecar — confirm against GKE docs
 7 | spec:
 8 |   nodeSelector:
 9 |     cloud.google.com/gke-tpu-topology: 2x2  # target topology
10 |     cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
11 |   containers:
12 |   - name: tensorflow
13 |     image: tensorflow/tensorflow:2.14.0
14 |     securityContext:
15 |       privileged: true  # container runs in privileged mode
16 |     command: ["/bin/bash", "-c", "--"]
17 |     args: ["while true; do sleep infinity; done;"]  # keeps the container running indefinitely
18 |     resources:
19 |       requests:
20 |         google.com/tpu: "4"  # TPU chip request
21 |       limits:
22 |         google.com/tpu: "4"  # TPU chip limit (same value as the request)
23 |     volumeMounts:
24 |     - name: gcs-fuse-csi-vol
25 |       mountPath: /data  # the CSI volume defined below is mounted here
26 |       readOnly: false
27 |   volumes:
28 |   - name: gcs-fuse-csi-vol
29 |     csi:
30 |       driver: gcsfuse.csi.storage.gke.io
31 |       readOnly: false
32 |       volumeAttributes:
33 |         bucketName: $BUCKET_NAME  # placeholder; presumably substituted (e.g. with envsubst) before the manifest is applied — verify
34 |         mountOptions: "implicit-dirs"
35 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/src/tensorflow-mnist-example/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow-datasets


--------------------------------------------------------------------------------
/tutorials-and-examples/vector-databases/readme.md:
--------------------------------------------------------------------------------
1 | # Vector Database Repo
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.


--------------------------------------------------------------------------------
/tutorials-and-examples/workflow-orchestration/dws-examples/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Workload Scheduler examples
2 | 
3 | 
4 | >[!WARNING]
5 | >The files for the Kueue with DWS and GKE autopilot example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Kueue with DWS and GKE autopilot tutorial](https://gke-ai-labs.dev/docs/tutorials/workflow-orchestration/dws/).
6 | 


--------------------------------------------------------------------------------
/tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md:
--------------------------------------------------------------------------------
1 | # Multikueue-dws-integration
2 | 
3 | >[!WARNING]
4 | >The files for the Multikueue with DWS and GKE autopilot example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Multikueue with DWS and GKE autopilot tutorial](https://gke-ai-labs.dev/docs/tutorials/workflow-orchestration/multikueue-dws).


--------------------------------------------------------------------------------
/tutorials-and-examples/workflow-orchestration/indexed-job/README.md:
--------------------------------------------------------------------------------
1 | # Running distributed ML training workloads on GKE using Indexed Jobs
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.


--------------------------------------------------------------------------------
/tutorials-and-examples/workflow-orchestration/jobset/pytorch/README.md:
--------------------------------------------------------------------------------
1 | # Running distributed ML training workloads on GKE using the JobSet API
2 | 
3 | >[!WARNING]
4 | >This guide and associated code are **deprecated** and no longer maintained.
5 | >
6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions.


--------------------------------------------------------------------------------