├── .github ├── pull_request_template.md └── workflows │ ├── close.yaml │ ├── label-pr.yaml │ ├── obsolete.yaml │ └── stale.yaml ├── .gitignore ├── .gitmodules ├── CODEOWNERS ├── LICENSE ├── Makefile ├── README.md ├── applications ├── jupyter │ └── README.md └── rag │ └── README.md ├── benchmarks ├── 65k-cpu-nodes-simulated-ai-benchmark.md ├── README.md ├── accelerator-based-ai-benchmark.md ├── benchmark │ ├── README.md │ ├── dataset │ │ ├── README.md │ │ └── ShareGPT_v3_unflitered_cleaned_split │ │ │ ├── README.md │ │ │ ├── requirements.txt │ │ │ └── upload_sharegpt.py │ └── tools │ │ ├── CL2-benchmark │ │ ├── config.yaml │ │ ├── headless-service.yaml │ │ ├── modules │ │ │ ├── measurements.yaml │ │ │ ├── scheduling-throughput.yaml │ │ │ └── statefulsets.yaml │ │ ├── priorityclass.yaml │ │ └── statefulset.yaml │ │ ├── README.md │ │ ├── dlio │ │ ├── README.md │ │ ├── main.tf │ │ ├── modules │ │ │ ├── dlio │ │ │ │ ├── job.tf │ │ │ │ ├── podspec.tpl │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ ├── parallelstore_storage │ │ │ │ ├── dataloader_job.tpl │ │ │ │ ├── ps_pv.tpl │ │ │ │ ├── ps_pv_pvc.tf │ │ │ │ ├── ps_pvc.tpl │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ │ └── storage │ │ │ │ ├── gcs_pv_pvc.tf │ │ │ │ ├── pv_podspec.tpl │ │ │ │ ├── pvc_podspec.tpl │ │ │ │ ├── variables.tf │ │ │ │ └── versions.tf │ │ ├── parser.py │ │ ├── variables.tf │ │ └── versions.tf │ │ ├── locust-load-inference │ │ ├── README.md │ │ ├── build.tf │ │ ├── locust-custom-exporter │ │ │ ├── Dockerfile │ │ │ ├── Makefile │ │ │ ├── Makefile.common │ │ │ ├── README.md │ │ │ ├── go.mod │ │ │ ├── go.sum │ │ │ └── main.go │ │ ├── locust-docker │ │ │ ├── Dockerfile │ │ │ └── locust-tasks │ │ │ │ ├── custom_metric_aggregator.py │ │ │ │ ├── load_data.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── run.sh │ │ │ │ └── tasks.py │ │ ├── locust-run.tf │ │ ├── locust-runner │ │ │ ├── Dockerfile │ │ │ ├── app │ │ │ │ ├── __init__.py │ │ │ │ ├── data_model.py │ │ │ │ └── main.py │ │ │ ├── 
metrics.yaml │ │ │ └── requirements.txt │ │ ├── main.tf │ │ ├── manifest-templates │ │ │ ├── locust-master-controller.yaml.tpl │ │ │ ├── locust-master-service.yaml.tpl │ │ │ ├── locust-worker-controller.yaml.tpl │ │ │ └── pod-monitoring.yaml.tpl │ │ ├── providers.tf │ │ ├── runner-manifest-template │ │ │ ├── locust-runner-service.yaml.tpl │ │ │ └── locust-runner.yaml.tftpl │ │ ├── sample-dashboards │ │ │ └── tgi-dashboard.yaml │ │ ├── sample-tfvars │ │ │ ├── jetstream-sample.tfvars │ │ │ └── tgi-sample.tfvars │ │ └── variables.tf │ │ ├── model-load-benchmark │ │ ├── README.md │ │ ├── base-config.yaml │ │ ├── benchmarker.ini │ │ ├── config │ │ │ ├── config.go │ │ │ ├── config_test.go │ │ │ └── utils.go │ │ ├── deployment │ │ │ ├── consts.go │ │ │ └── deployment.go │ │ ├── example-pod.yaml │ │ ├── go.mod │ │ ├── go.sum │ │ ├── k8sclient │ │ │ └── k8sclient.go │ │ ├── main.go │ │ ├── plot.py │ │ ├── rbac.yaml │ │ ├── requirements.txt │ │ ├── results │ │ │ ├── case_0.yaml │ │ │ ├── case_1.yaml │ │ │ ├── case_10.yaml │ │ │ ├── case_11.yaml │ │ │ ├── case_12.yaml │ │ │ ├── case_13.yaml │ │ │ ├── case_14.yaml │ │ │ ├── case_15.yaml │ │ │ ├── case_16.yaml │ │ │ ├── case_17.yaml │ │ │ ├── case_18.yaml │ │ │ ├── case_19.yaml │ │ │ ├── case_2.yaml │ │ │ ├── case_20.yaml │ │ │ ├── case_21.yaml │ │ │ ├── case_22.yaml │ │ │ ├── case_23.yaml │ │ │ ├── case_3.yaml │ │ │ ├── case_4.yaml │ │ │ ├── case_5.yaml │ │ │ ├── case_6.yaml │ │ │ ├── case_7.yaml │ │ │ ├── case_8.yaml │ │ │ ├── case_9.yaml │ │ │ ├── elapsed_time_vs_cpu_request.png │ │ │ ├── elapsed_time_vs_download_chunk_size_mb.png │ │ │ ├── elapsed_time_vs_ephemeral_storage_request.png │ │ │ ├── elapsed_time_vs_max_parallel_downloads.png │ │ │ ├── elapsed_time_vs_memory_request.png │ │ │ └── elapsed_time_vs_parallel_downloads_per_file.png │ │ ├── runner │ │ │ └── runner.go │ │ ├── suite-generator │ │ │ ├── generator.go │ │ │ └── generator_test.go │ │ └── volumeAttributes.yaml │ │ └── profile-generator │ │ ├── README.md │ │ 
├── build.tf │ │ ├── container │ │ ├── Dockerfile │ │ ├── benchmark_serving.py │ │ ├── latency_throughput_curve.sh │ │ └── requirements.txt │ │ ├── main.tf │ │ ├── modules │ │ └── latency-profile │ │ │ ├── main.tf │ │ │ ├── manifest-templates │ │ │ ├── latency-profile-generator-podmonitoring.yaml.tpl │ │ │ └── latency-profile-generator.yaml.tpl │ │ │ ├── sample.tfvars │ │ │ └── variables.tf │ │ ├── sample.tfvars │ │ └── variables.tf ├── inference-server │ ├── README.md │ ├── jetstream │ │ ├── README.md │ │ ├── jetstream.yaml │ │ └── model-conversion │ │ │ └── kaggle_converter.yaml │ ├── templates │ │ └── secret-templates │ │ │ └── secret-provider.tftpl │ ├── text-generation-inference │ │ ├── README.md │ │ ├── autoscaling.md │ │ ├── hpa-templates │ │ │ ├── dcgm-podmonitoring.yaml.tftpl │ │ │ ├── hpa.cpu.yaml.tftpl │ │ │ └── hpa.tgi.custom_metric.yaml.tftpl │ │ ├── main.tf │ │ ├── manifest-templates │ │ │ ├── text-generation-inference-svc.tftpl │ │ │ └── text-generation-inference.tftpl │ │ ├── monitoring-templates │ │ │ └── tgi-podmonitoring.yaml.tftpl │ │ ├── providers.tf │ │ ├── sample-terraform.tfvars │ │ └── variables.tf │ ├── triton │ │ ├── README.md │ │ ├── main.tf │ │ ├── manifest-templates │ │ │ ├── triton-tensorrtllm-inference-docker.tftpl │ │ │ └── triton-tensorrtllm-inference-gs.tftpl │ │ ├── providers.tf │ │ ├── sample-terraform.tfvars │ │ └── variables.tf │ └── vllm │ │ ├── README.md │ │ ├── hpa-templates │ │ └── hpa.vllm.custom_metric.yaml.tftpl │ │ ├── main.tf │ │ ├── manifest-templates │ │ ├── vllm-service.tftpl │ │ └── vllm.tftpl │ │ ├── monitoring-templates │ │ └── vllm-podmonitoring.yaml.tftpl │ │ ├── providers.tf │ │ ├── sample-terraform.tfvars │ │ └── variables.tf ├── infra │ ├── 65k-cpu-cluster │ │ ├── main.tf │ │ ├── provider.tf │ │ ├── sample-tfvars │ │ │ └── 65k-sample.tfvars │ │ └── variables.tf │ └── accelerator-cluster │ │ ├── README.md │ │ ├── stage-1 │ │ ├── README.md │ │ ├── main.tf │ │ ├── modules │ │ │ └── gke-infra │ │ │ │ ├── 
README.md │ │ │ │ ├── cluster.tf │ │ │ │ ├── filestore.tf │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ ├── outputs.tf │ │ ├── sample-tfvars │ │ │ ├── gpu-sample.tfvars │ │ │ └── jetstream-sample.tfvars │ │ └── variables.tf │ │ └── stage-2 │ │ ├── README.md │ │ ├── main.tf │ │ ├── modules │ │ └── gke-setup │ │ │ ├── main.tf │ │ │ ├── modules │ │ │ ├── gcs-fuse │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── nvidia-dcgm │ │ │ │ ├── main.tf │ │ │ │ ├── manifest-templates │ │ │ │ │ ├── 01-ds-dcgm.yaml │ │ │ │ │ ├── 02-ds-exporter.yaml │ │ │ │ │ ├── 03-cm-dcgm.yaml │ │ │ │ │ └── pod-monitoring.yaml │ │ │ │ └── variables.tf │ │ │ ├── output-benchmark │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── secret-manager │ │ │ │ ├── csi-driver-gcp-plugin │ │ │ │ │ └── provider-gcp-plugin.yaml │ │ │ │ ├── csi-driver │ │ │ │ │ ├── csidriver.yaml │ │ │ │ │ ├── rbac-secretproviderclass.yaml │ │ │ │ │ ├── rbac-secretprovidersyncing.yaml │ │ │ │ │ ├── secrets-store-csi-driver.yaml │ │ │ │ │ ├── secrets-store.csi.x-k8s.io_secretproviderclasses.yaml │ │ │ │ │ └── secrets-store.csi.x-k8s.io_secretproviderclasspodstatuses.yaml │ │ │ │ ├── main.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ └── workload-identity │ │ │ │ ├── gcp.tf │ │ │ │ ├── kubernetes.tf │ │ │ │ ├── outputs.tf │ │ │ │ └── variables.tf │ │ │ ├── outputs.tf │ │ │ ├── providers.tf │ │ │ └── variables.tf │ │ ├── outputs.tf │ │ ├── sample-tfvars │ │ ├── gpu-sample.tfvars │ │ └── jetstream-sample.tfvars │ │ └── variables.tf └── orchestration │ ├── README.md │ ├── config │ ├── stage-1.tfvars │ ├── stage-2.tfvars │ └── text-generation-inference.tfvars │ ├── templates │ ├── stage-2.auto.tfvars.tpl │ └── text-generation-inference.auto.tfvars.tpl │ ├── text-generation-inference-apply.sh │ └── text-generation-inference-destroy.sh ├── best-practices ├── README.md ├── gke-batch-refarch │ └── README.md ├── hotswap.md ├── ml-platform │ └── 
README.md └── startup-latency.md ├── charts ├── gmp-engine │ ├── Chart.yaml │ ├── charts │ │ └── gmp-frontend │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── deployment.yaml │ │ │ └── service.yaml │ │ │ └── values.yaml │ ├── templates │ │ └── podmonitoring.yaml │ └── values.yaml ├── nvidia-dra-driver-gpu │ ├── .helmignore │ ├── Chart.yaml │ ├── LICENSE │ ├── NOTICE │ ├── crds │ │ └── resource.nvidia.com_computedomains.yaml │ ├── templates │ │ ├── _helpers.tpl │ │ ├── clusterrole.yaml │ │ ├── clusterrolebinding.yaml │ │ ├── controller.yaml │ │ ├── deviceclass-compute-domain-daemon.yaml │ │ ├── deviceclass-compute-domain-default-channel.yaml │ │ ├── deviceclass-gpu.yaml │ │ ├── deviceclass-mig.yaml │ │ ├── kubeletplugin.yaml │ │ ├── openshiftprivilegedrolebinging.yaml │ │ ├── serviceaccount.yaml │ │ ├── validatingadmissionpolicy.yaml │ │ ├── validatingadmissionpolicybinding.yaml │ │ └── validation.yaml │ └── values.yaml └── tpu-dra-driver │ ├── Chart.yaml │ ├── README.md │ ├── install-tpu-dra-driver.sh │ ├── templates │ ├── _helpers.tpl │ ├── clusterrole.yaml │ ├── clusterrolebinding.yaml │ ├── deviceclass.yaml │ ├── kubeletplugin.yaml │ ├── serviceaccount.yaml │ ├── validatingadmissionpolicy.yaml │ ├── validatingadmissionpolicybinding.yaml │ └── validation.yml │ └── values.yaml ├── cloudbuild_cleanup.yaml ├── contributing.md ├── gke-batch-refarch └── README.md ├── infrastructure ├── README.md ├── backend.tf ├── main.tf ├── outputs.tf ├── platform.tfvars ├── tfvars_examples │ ├── autopilot-gke-with-existing-network.platform.tfvars │ ├── autopilot-gke-with-new-network.platform.tfvars │ ├── platform.complete.tfvars │ ├── standard-gke-with-exisiting-network.platform.tfvars │ └── standard-gke-with-new-network.platform.tfvars ├── tfvars_tests │ └── standard-gke-public.platform.tfvars ├── variables.tf └── versions.tf ├── jupyter-on-gke ├── modules ├── cloudsql │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── 
custom-metrics-stackdriver-adapter │ ├── README.md │ ├── main.tf │ ├── templates │ │ ├── apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl │ │ ├── apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl │ │ ├── apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl │ │ ├── clusterrole_custom-metrics-resource-reader.yaml.tftpl │ │ ├── clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl │ │ ├── clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl │ │ ├── clusterrolebinding_external-metrics-reader.yaml.tftpl │ │ ├── deployment_custom-metrics-stackdriver-adapter.yaml.tftpl │ │ ├── rolebinding_custom-metrics-auth-reader.yaml.tftpl │ │ └── service_custom-metrics-stackdriver-adapter.yaml.tftpl │ └── variables.tf ├── gcp-network │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── gcs │ ├── README.md │ ├── main.tf │ ├── variables.tf │ └── versions.tf ├── gke-autopilot-private-cluster │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── gke-autopilot-public-cluster │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── gke-standard-private-cluster │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── gke-standard-public-cluster │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ └── variables.tf ├── iap │ ├── charts │ │ └── iap │ │ │ ├── Chart.yaml │ │ │ ├── templates │ │ │ ├── backend-config.yaml │ │ │ ├── iap-secret.yaml │ │ │ ├── managed-cert.yaml │ │ │ └── static-ingress.yaml │ │ │ └── values.yaml │ ├── iap.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── inference-service │ ├── README.md │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── jetstream-maxtext-deployment │ ├── README.md │ ├── main.tf │ ├── templates │ │ ├── custom-metrics-stackdriver-adapter │ │ │ └── hpa.jetstream.yaml.tftpl │ │ ├── deployment.yaml.tftpl │ │ ├── podmonitoring-tpu.yaml.tftpl │ │ ├── podmonitoring.yaml.tftpl │ │ ├── prometheus-adapter │ │ │ 
├── hpa.jetstream.yaml.tftpl │ │ │ └── values.yaml.tftpl │ │ └── service.yaml.tftpl │ └── variables.tf ├── jupyter │ ├── authentication │ │ ├── README.MD │ │ ├── authenticator │ │ │ ├── gcpiapjwtauthenticator │ │ │ │ ├── __init__.py │ │ │ │ ├── gcpiapjwtauthenticator.py │ │ │ │ └── gcpiapjwtauthenticator_test.py │ │ │ └── setup.py │ │ └── docker_image │ │ │ ├── Dockerfile │ │ │ └── cloudbuild.yaml │ ├── images │ │ ├── IAP_screenshot.png │ │ ├── brand_screenshot.png │ │ ├── gcs_bucket.png │ │ ├── iap_enable_api_screenshot.png │ │ ├── image.png │ │ └── oauth_consent_screenshot.png │ ├── jupyter_config │ │ ├── config-selfauth-autopilot.yaml │ │ └── config-selfauth.yaml │ ├── jupyter_image │ │ └── notebook_image │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ ├── cloudbuild.yaml │ │ │ └── requirements.txt │ ├── main.tf │ ├── outputs.tf │ ├── tests │ │ ├── change_jupyter_config.py │ │ └── test_hub.py │ ├── variables.tf │ └── versions.tf ├── kuberay-cluster │ ├── kuberay_image │ │ ├── Dockerfile │ │ ├── cloudbuild.yaml │ │ └── requirements.txt │ ├── main.tf │ ├── outputs.tf │ ├── values.yaml │ ├── variables.tf │ └── versions.tf ├── kuberay-monitoring │ ├── gmpvalues.yaml │ ├── grafana │ │ └── values.yaml │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── kubernetes-namespace │ ├── charts │ │ └── namespace │ │ │ └── Chart.yaml │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf └── prometheus-adapter │ ├── README.md │ ├── main.tf │ └── variables.tf ├── ray-on-gke ├── README.md ├── examples │ ├── notebooks │ │ ├── gpt-j-online.ipynb │ │ ├── jax-tpu.ipynb │ │ ├── ray-dist-mnist.ipynb │ │ ├── ray-fine-tune-hugging-face.ipynb │ │ ├── ray_basic.ipynb │ │ ├── ray_mnist.ipynb │ │ ├── raytrain-stablediffusion.ipynb │ │ ├── stable-diffusion-tpu.ipynb │ │ └── stable_diffusion.ipynb │ └── tfvars ├── guides │ ├── observability │ │ └── README.md │ └── raytrain-with-gcsfusecsi │ │ ├── README.md │ │ ├── images │ │ ├── ray-cluster-on-gke.png │ │ ├── 
ray-head-resources.png │ │ └── ray-worker-resources.png │ │ └── jupyter-spec.yaml └── tpu │ └── kuberay-tpu-webhook │ └── README.md ├── scripts ├── ci │ └── wait_for_pods.sh └── network-setup │ ├── v6e-increase-rmem.yaml │ └── v6e-network-optimization.yaml ├── security_test ├── README.md ├── allowlist │ └── category │ │ ├── cluster │ │ ├── continuous-image-puller │ │ │ ├── capabilities.json │ │ │ ├── distroless.json │ │ │ ├── imagedigest.json │ │ │ ├── imagefreshness.json │ │ │ ├── imagepath.json │ │ │ ├── readonlyrootfs.json │ │ │ ├── sbom.json │ │ │ └── seccompprofile.json │ │ ├── hub │ │ │ ├── capabilities.json │ │ │ ├── distroless.json │ │ │ ├── imagedigest.json │ │ │ ├── imagefreshness.json │ │ │ ├── imagepath.json │ │ │ ├── rbac.json │ │ │ ├── readonlyrootfs.json │ │ │ └── seccompprofile.json │ │ ├── kuberay-operator-leader-election │ │ │ └── rbac.json │ │ ├── kuberay-operator │ │ │ └── rbac.json │ │ ├── mistral-7b-instruct │ │ │ ├── allowprivilegeescalation.json │ │ │ ├── capabilities.json │ │ │ ├── distroless.json │ │ │ ├── imagedigest.json │ │ │ ├── imagefreshness.json │ │ │ ├── imagepath.json │ │ │ ├── readonlyrootfs.json │ │ │ ├── rootless.json │ │ │ ├── sbom.json │ │ │ └── seccompprofile.json │ │ ├── proxy │ │ │ ├── capabilities.json │ │ │ ├── distroless.json │ │ │ ├── imagedigest.json │ │ │ ├── imagefreshness.json │ │ │ ├── imagepath.json │ │ │ ├── readonlyrootfs.json │ │ │ ├── sbom.json │ │ │ └── seccompprofile.json │ │ ├── rag-frontend │ │ │ ├── allowprivilegeescalation.json │ │ │ ├── capabilities.json │ │ │ ├── distroless.json │ │ │ ├── imagedigest.json │ │ │ ├── imagefreshness.json │ │ │ ├── imagepath.json │ │ │ ├── readonlyrootfs.json │ │ │ ├── rootless.json │ │ │ ├── sbom.json │ │ │ └── seccompprofile.json │ │ ├── ray-cluster-kuberay │ │ │ └── rbac.json │ │ ├── rayjob-editor-role │ │ │ └── rbac.json │ │ ├── rayjob-viewer-role │ │ │ └── rbac.json │ │ ├── rayservice-editor-role │ │ │ └── rbac.json │ │ └── rayservice-viewer-role │ │ │ └── rbac.json 
│ │ └── helm │ │ ├── iap │ │ └── defaultnamespace.json │ │ └── kuberay-tpu-webhook │ │ ├── allowprivilegeescalation.json │ │ ├── capabilities.json │ │ ├── imagedigest.json │ │ ├── imagefreshness.json │ │ ├── imagepath.json │ │ ├── readonlyrootfs.json │ │ ├── rootless.json │ │ └── seccompprofile.json └── config.yaml ├── slurm-on-gke └── README.md ├── tools ├── README.md ├── dcgm-on-gke │ └── README.md ├── gke-disk-image-builder │ └── README.md └── saxml-on-gke │ └── README.md ├── tpu-provisioner ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── PROJECT ├── README.md ├── admission_controller │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── __init__.py │ ├── admission_controller.py │ ├── certificates │ │ └── README.md │ ├── manifests │ │ └── manifest.yaml │ ├── requirements.txt │ ├── skaffold.yaml │ └── test │ │ ├── __init__.py │ │ ├── admission_controller_test.py │ │ └── e2e │ │ ├── manifests │ │ ├── test-disabled-provisioning.yaml │ │ ├── test-location-hint-no-reservation.yaml │ │ ├── test-location-hint-with-reservation.yaml │ │ └── test-nonjobset-job.yaml │ │ └── test.sh ├── cloudbuild.yaml ├── cmd │ └── main.go ├── config │ ├── default │ │ ├── kustomization.yaml │ │ ├── manager_auth_proxy_patch.yaml │ │ └── manager_config_patch.yaml │ ├── manager │ │ ├── configmap.yaml │ │ ├── kustomization.yaml │ │ └── manager.yaml │ ├── prometheus │ │ ├── kustomization.yaml │ │ └── monitor.yaml │ └── rbac │ │ ├── auth_proxy_client_clusterrole.yaml │ │ ├── auth_proxy_role.yaml │ │ ├── auth_proxy_role_binding.yaml │ │ ├── auth_proxy_service.yaml │ │ ├── kustomization.yaml │ │ ├── leader_election_role.yaml │ │ ├── leader_election_role_binding.yaml │ │ ├── role.yaml │ │ ├── role_binding.yaml │ │ └── service_account.yaml ├── docs │ ├── cleanup.excalidraw.png │ └── provisioning.excalidraw.png ├── examples │ └── jobset.yaml ├── go.mod ├── go.sum ├── internal │ ├── auth │ │ └── gcp │ │ │ ├── README.md │ │ │ ├── gcp.go │ │ │ └── gcp_test.go │ ├── cloud │ │ ├── 
common.go │ │ ├── gke.go │ │ ├── gke_context.go │ │ ├── gke_service.go │ │ ├── gke_test.go │ │ └── mock.go │ └── controller │ │ ├── creation_controller.go │ │ ├── deletion_controller.go │ │ ├── nodepool_garbage_collector.go │ │ └── pod_utils.go └── test │ ├── crds │ └── jobset-v0.5.0.yaml │ └── integration │ └── controller │ ├── creation_controller_test.go │ ├── deletion_controller_test.go │ ├── mock_provider.go │ └── suite_test.go └── tutorials-and-examples ├── cloudshell-tutorial.md ├── flyte └── README.md ├── genAI-LLM ├── e2e-genai-langchain-app │ ├── README.md │ ├── backend_ip.png │ ├── e2e-genai-langchain.ipynb │ ├── frontend_app.png │ ├── frontend_ip.png │ ├── open_jupyter.png │ └── src │ │ ├── backend │ │ ├── Dockerfile │ │ ├── deploy.yaml │ │ ├── main.py │ │ ├── model.py │ │ ├── requirements.in │ │ └── requirements.txt │ │ └── frontend │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── deploy.yaml │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── src │ │ ├── index.html │ │ └── index.tsx │ │ ├── tsconfig.json │ │ └── webpack.config.js └── finetuning-gemma-2b-on-l4 │ └── README.md ├── gpu-examples ├── a100-jax │ └── README.md ├── online-serving-single-gpu │ └── README.md └── training-single-gpu │ ├── README.md │ ├── data │ └── mnist_predict │ │ ├── 0.png │ │ ├── 1.png │ │ ├── 2.png │ │ ├── 3.png │ │ ├── 4.png │ │ ├── 5.png │ │ ├── 6.png │ │ ├── 7.png │ │ ├── 8.png │ │ └── 9.png │ └── src │ ├── gke-config │ ├── standard-tensorflow-bash.yaml │ ├── standard-tf-mnist-batch-predict.yaml │ └── standard-tf-mnist-train.yaml │ └── tensorflow-mnist-example │ ├── requirements.txt │ ├── tensorflow_mnist_batch_predict.py │ └── tensorflow_mnist_train_distributed.py ├── hf-tgi └── README.md ├── inference-servers ├── checkpoints │ └── README.md ├── jetstream │ ├── README.md │ ├── http-server │ │ ├── Dockerfile │ │ └── http_server.py │ ├── maxtext │ │ ├── maxengine-server │ │ │ ├── Dockerfile │ │ │ └── maxengine_server_entrypoint.sh │ │ └── single-host-inference │ │ │ ├── 
README.md │ │ │ ├── checkpoint-job.yaml │ │ │ ├── kubectl │ │ │ └── deployment.yaml │ │ │ └── terraform │ │ │ ├── main.tf │ │ │ ├── providers.tf │ │ │ ├── sample-terraform.tfvars │ │ │ ├── variables.tf │ │ │ ├── versions.tf │ │ │ └── versions_override.tf │ └── pytorch │ │ ├── jetstream-pytorch-server │ │ ├── Dockerfile │ │ └── jetstream_pytorch_server_entrypoint.sh │ │ └── single-host-inference │ │ ├── README.md │ │ ├── checkpoint-job.yaml │ │ ├── deployment.yaml │ │ ├── pd-deployment.yaml │ │ └── storage.yaml └── maxdiffusion │ └── README.md ├── kserve └── README.md ├── langchain-chatbot └── README.md ├── llamaindex └── rag │ └── README.md ├── metaflow └── README.md ├── mlflow └── finetune-gemma │ └── README.md ├── models-as-oci └── README.md ├── nvidia-bionemo └── README.md ├── nvidia-nim ├── README.md └── blueprints │ └── README.md ├── skypilot ├── README.md └── dws-and-kueue │ └── README.md ├── storage ├── hyperdisk-ml │ └── README.md └── parallelstore-backup-and-recovery │ ├── README.md │ ├── parallelstore-sa.yaml │ └── ps-to-gcs-backup.yaml ├── tpu-examples ├── single-host-inference │ ├── jax │ │ ├── bert │ │ │ ├── bert_request.py │ │ │ ├── export_bert_model.py │ │ │ ├── install-bert.yaml │ │ │ ├── loadbalancer.yaml │ │ │ └── serve-bert.yaml │ │ ├── requirements.txt │ │ └── stable-diffusion │ │ │ ├── README.md │ │ │ ├── app.py │ │ │ ├── export_stable_diffusion_model.py │ │ │ ├── install-stable-diffusion.yaml │ │ │ ├── loadbalancer.yaml │ │ │ ├── serve-stable-diffusion-tpu-v4.yaml │ │ │ ├── serve-stable-diffusion-v5e.yaml │ │ │ ├── serve-stable-diffusion.yaml │ │ │ └── stable_diffusion_request.py │ ├── pt │ │ └── densenet161 │ │ │ ├── deployment.yml │ │ │ ├── loadbalancer.yml │ │ │ ├── model-archive.yml │ │ │ ├── request.py │ │ │ └── requirements.txt │ ├── pvc-pv.yaml │ └── tf │ │ └── resnet50 │ │ ├── banana.jpeg │ │ ├── deployment.yml │ │ ├── export_resnet_model.py │ │ ├── loadbalancer.yml │ │ ├── model-conversion.yml │ │ ├── request.py │ │ └── 
requirements.txt └── training │ ├── diffusion │ └── Dockerfile │ ├── gpt │ ├── Dockerfile │ ├── fsdp_config.json │ └── my_config_2.json │ └── mnist-single-tpu │ ├── README.md │ ├── data │ └── mnist_predict │ │ ├── 0.png │ │ ├── 1.png │ │ ├── 2.png │ │ ├── 3.png │ │ ├── 4.png │ │ ├── 5.png │ │ ├── 6.png │ │ ├── 7.png │ │ ├── 8.png │ │ └── 9.png │ └── src │ ├── gke-config │ ├── standard-tensorflow-bash-v4.yaml │ ├── standard-tensorflow-bash-v5e.yaml │ ├── standard-tf-mnist-batch-predict-v4.yaml │ ├── standard-tf-mnist-batch-predict-v5e.yaml │ ├── standard-tf-mnist-train-v4.yaml │ └── standard-tf-mnist-train-v5e.yaml │ └── tensorflow-mnist-example │ ├── requirements.txt │ ├── tensorflow_mnist_batch_predict.py │ └── tensorflow_mnist_train_distributed.py ├── vector-databases └── readme.md └── workflow-orchestration ├── dws-examples └── README.md ├── dws-multiclusters-example └── README.md ├── indexed-job └── README.md └── jobset └── pytorch └── README.md /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | **What type of PR is this?** 9 | > Uncomment only one ` /kind <>` line, press enter to put that in a new line, and remove leading whitespace from that line: 10 | > 11 | > /kind breaking 12 | > /kind bug 13 | > /kind cleanup 14 | > /kind documentation 15 | > /kind enhancement 16 | 17 | **What this PR does / Why we need it**: 18 | 19 | **Which issue(s) this PR fixes**: 20 | 24 | Closes # 25 | 26 | **Special notes for your reviewer**: 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ## Archives 16 | **/*.tar 17 | **/*.tar.gz 18 | **/*.zip 19 | 20 | # Directories 21 | bin/ 22 | deploy/ 23 | 24 | # IDEs 25 | .idea/ 26 | .vscode/ 27 | 28 | # Python 29 | __pycache__/ 30 | 31 | # Terraform 32 | default.tfstate 33 | default.tfstate.backup 34 | .terraform* 35 | terraform.tfstate* 36 | terraform.tfvars 37 | tfplan 38 | .vscode/ 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "best-practices/accelerated-platforms"] 2 | path = best-practices/accelerated-platforms 3 | url = https://github.com/GoogleCloudPlatform/accelerated-platforms.git 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # @name owns any files in the /benchmarks/ 2 | # directory at the root of the repository and any of its 3 | # subdirectories. 4 | /benchmarks/ @achandrasekar @ahg-g @annapendleton @Bslabe123 @jjk-g 5 | /tpu-provisioner/ @echiugoog @nstogner 6 | -------------------------------------------------------------------------------- /applications/jupyter/README.md: -------------------------------------------------------------------------------- 1 | # JupyterHub on GKE 2 | 3 | >[!WARNING] 4 | >The files for the JupyterHub on GKE example have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. 
For more information, please refer to the [JupyterHub on GKE](https://gke-ai-labs.dev/docs/blueprints/jupyter-on-gke/). -------------------------------------------------------------------------------- /applications/rag/README.md: -------------------------------------------------------------------------------- 1 | # RAG on GKE 2 | 3 | >[!WARNING] 4 | >The files for the RAG on GKE example have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. For more information, please refer to the [RAG on GKE](https://gke-ai-labs.dev/docs/blueprints/rag-on-gke/). -------------------------------------------------------------------------------- /benchmarks/benchmark/README.md: -------------------------------------------------------------------------------- 1 | >[!WARNING] 2 | >This guide and associated code are **deprecated** and no longer maintained. 3 | > 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 5 | 6 | This directory contains the benchmark datasets and tools 7 | used to run benchmarks. 8 | 9 | To prepare benchmark datasets, see the `dataset` directory. 10 | 11 | To run a benchmarking tool, see the available tools in the `tools` directory. -------------------------------------------------------------------------------- /benchmarks/benchmark/dataset/README.md: -------------------------------------------------------------------------------- 1 | >[!WARNING] 2 | >This guide and associated code are **deprecated** and no longer maintained. 3 | > 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 5 | 6 | This directory contains datasets for various models that are used in 7 | the benchmark runs. 
-------------------------------------------------------------------------------- /benchmarks/benchmark/dataset/ShareGPT_v3_unflitered_cleaned_split/requirements.txt: -------------------------------------------------------------------------------- 1 | wget 2 | google-cloud-storage 3 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/CL2-benchmark/headless-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: headless-service 5 | spec: 6 | clusterIP: None 7 | selector: 8 | svc-headless: headless 9 | ports: 10 | - port: 80 11 | targetPort: 80 12 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/CL2-benchmark/modules/scheduling-throughput.yaml: -------------------------------------------------------------------------------- 1 | {{$query := `sum(irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[1m]))[%v:5s]`}} 2 | 3 | steps: 4 | - name: "{{.action}}ing scheduling throughput measurement" 5 | measurements: 6 | - Identifier: SchedulingThroughput_{{.basename}} 7 | Method: GenericPrometheusQuery 8 | Params: 9 | action: {{.action}} 10 | enableViolations: true 11 | metricName: {{.basename}} 12 | metricVersion: v1 13 | unit: 1/s 14 | queries: 15 | - name: Max 16 | query: max_over_time({{$query}}) 17 | - name: Avg 18 | query: avg_over_time({{$query}}) 19 | - name: Perc99 20 | query: quantile_over_time(0.99, {{$query}}) 21 | {{if .threshold}} 22 | threshold: {{.threshold}} 23 | lowerBound: true 24 | {{end}} 25 | - name: Perc90 26 | query: quantile_over_time(0.90, {{$query}}) 27 | - name: Perc50 28 | query: quantile_over_time(0.50, {{$query}}) 29 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/CL2-benchmark/priorityclass.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: scheduling.k8s.io/v1 2 | kind: PriorityClass 3 | metadata: 4 | name: {{.Name}} 5 | value: {{.Value}} 6 | globalDefault: false 7 | description: "Priority class for user workloads" 8 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/README.md: -------------------------------------------------------------------------------- 1 | >[!WARNING] 2 | >This guide and associated code are **deprecated** and no longer maintained. 3 | > 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 5 | 6 | This directory contains the benchmark tools for measuring performance across multiple inferencing frameworks. -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/dlio/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | helm = { 18 | source = "hashicorp/helm" 19 | version = "~> 2.8.0" 20 | } 21 | kubernetes = { 22 | source = "hashicorp/kubernetes" 23 | version = "2.18.1" 24 | } 25 | kubectl = { 26 | source = "alekc/kubectl" 27 | version = "2.0.1" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pv.tpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: ${pv_name} 5 | spec: 6 | storageClassName: "" 7 | capacity: 8 | storage: 12Ti 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | volumeMode: Filesystem 13 | csi: 14 | driver: parallelstore.csi.storage.gke.io 15 | volumeHandle: ${project}/${ps_location}/${ps_instance_name}/default-pool/default-container 16 | volumeAttributes: 17 | ip: "${ps_ip_address_1}, ${ps_ip_address_2}, ${ps_ip_address_3}" 18 | network: ${ps_network_name} -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/ps_pvc.tpl: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: ${pvc_name} 5 | namespace: ${namespace} 6 | spec: 7 | accessModes: 8 | - ReadWriteMany 9 | storageClassName: ${storageclass} 10 | volumeName: ${pv_name} 11 | resources: 12 | requests: 13 | storage: 12000Gi -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance 
with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | helm = { 18 | source = "hashicorp/helm" 19 | version = "~> 2.8.0" 20 | } 21 | kubernetes = { 22 | source = "hashicorp/kubernetes" 23 | version = "2.18.1" 24 | } 25 | kubectl = { 26 | source = "alekc/kubectl" 27 | version = "2.0.1" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/storage/pv_podspec.tpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: ${pv_name} 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | capacity: 9 | # This is a placeholder, can be any number. 
It needs to match the PVC resources.requests.storage field 10 | storage: 1Gi 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: dummy-storage-class 13 | mountOptions: 14 | - stat-cache-capacity=${gcsfuse_stat_cache_capacity} 15 | - stat-cache-ttl=${gcsfuse_stat_cache_ttl} 16 | - type-cache-ttl=${gcsfuse_type_cache_ttl} 17 | claimRef: 18 | namespace: ${namespace} 19 | name: ${pvc_name} 20 | csi: 21 | driver: gcsfuse.csi.storage.gke.io 22 | volumeHandle: ${gcs_bucket} # unique bucket name -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/storage/pvc_podspec.tpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: ${pvc_name} 5 | namespace: ${namespace} 6 | spec: 7 | accessModes: 8 | - ReadWriteMany 9 | resources: 10 | requests: 11 | storage: 1Gi 12 | volumeName: ${pv_name} 13 | storageClassName: dummy-storage-class -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/modules/storage/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | helm = { 18 | source = "hashicorp/helm" 19 | version = "~> 2.8.0" 20 | } 21 | kubernetes = { 22 | source = "hashicorp/kubernetes" 23 | version = "2.18.1" 24 | } 25 | kubectl = { 26 | source = "alekc/kubectl" 27 | version = "2.0.1" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/dlio/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | helm = { 21 | source = "hashicorp/helm" 22 | version = "~> 2.8.0" 23 | } 24 | kubernetes = { 25 | source = "hashicorp/kubernetes" 26 | version = "2.18.1" 27 | } 28 | kubectl = { 29 | source = "alekc/kubectl" 30 | version = "2.0.1" 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/build.tf: -------------------------------------------------------------------------------- 1 | resource "null_resource" "build_and_push_image" { 2 | 3 | depends_on = [resource.google_project_service.cloudbuild] 4 | provisioner "local-exec" { 5 | working_dir = path.module 6 | command = "gcloud builds submit --tag ${var.artifact_registry}/locust-tasks:latest locust-docker" 7 | } 8 | } 9 | 10 | resource "null_resource" "build_and_push_runner_image" { 11 | 12 | provisioner "local-exec" { 13 | working_dir = path.module 14 | command = "gcloud builds submit --tag ${var.artifact_registry}/locust-runner:latest locust-runner" 15 | } 16 | } 17 | 18 | resource "null_resource" "build_and_push_exporter_image" { 19 | 20 | provisioner "local-exec" { 21 | working_dir = path.module 22 | command = "gcloud builds submit --tag ${var.artifact_registry}/locust-custom-exporter:latest locust-custom-exporter" 23 | } 24 | } -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20 2 | 3 | # Set destination for COPY 4 | WORKDIR /app 5 | 6 | # Download Go modules 7 | COPY go.mod go.sum ./ 8 | RUN go mod download 9 | 10 | # Copy the source code. 
Note the slash at the end, as explained in 11 | # https://docs.docker.com/reference/dockerfile/#copy 12 | COPY *.go ./ 13 | 14 | # Build 15 | RUN CGO_ENABLED=0 GOOS=linux go build -o /locust_exporter 16 | 17 | EXPOSE 8080 18 | 19 | # Run 20 | CMD ["/locust_exporter"] 21 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The Prometheus Authors 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | include Makefile.common 15 | 16 | ## This is a copy! 
17 | ## https://github.com/prometheus/prometheus/blob/main/Makefile.common 18 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/ContainerSolutions/locust_exporter 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/prometheus/client_golang v1.11.1 7 | github.com/prometheus/common v0.26.0 8 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 9 | ) 10 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt: -------------------------------------------------------------------------------- 1 | Brotli==1.0.9 2 | certifi==2023.7.22 3 | chardet==4.0.0 4 | charset-normalizer==2.0.12 5 | click==8.1.2 6 | ConfigArgParse==1.5.5 7 | Flask==2.2.5 8 | Flask-BasicAuth==0.2.0 9 | Flask-Cors==4.0.2 10 | gevent==23.9.0 11 | geventhttpclient==2.0.11 12 | greenlet==2.0.0 13 | google-cloud-storage 14 | idna==3.7 15 | importlib-metadata==4.11.3 16 | itsdangerous==2.1.2 17 | Jinja2==3.1.6 18 | locust==2.20.1 19 | MarkupSafe==2.1.1 20 | msgpack==1.0.3 21 | msgpack-python==0.5.6 22 | psutil==5.9.1 23 | pyzmq==25.0.0 24 | requests==2.31.0 25 | roundrobin==0.0.2 26 | six==1.16.0 27 | transformers==4.48.0 28 | typing_extensions==4.1.1 29 | urllib3==1.26.18 30 | Werkzeug==3.0.3 31 | zipp==3.8.0 32 | zope.event==4.5.0 33 | zope.interface==5.4.0 34 | TensorFlow >= 2.0 35 | google-jetstream==0.2.0 36 | grpcio==1.62.2 37 | grpc-interceptor==0.15.4 38 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-runner/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google Inc. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM python:3.9 16 | 17 | WORKDIR /code 18 | 19 | COPY ./requirements.txt /code/requirements.txt 20 | COPY ./metrics.yaml /code/metrics.yaml 21 | 22 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 23 | 24 | COPY ./app /code/app 25 | 26 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-runner/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/locust-load-inference/locust-runner/app/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/locust-runner/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | requests 3 | uvicorn 4 | google-cloud-monitoring 5 | google-cloud-storage 6 | pathlib 7 | PyYAML -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/manifest-templates/pod-monitoring.yaml.tpl: -------------------------------------------------------------------------------- 1 | apiVersion: 
monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: locust-scrapper 5 | namespace: ${namespace} 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: locust-master 10 | endpoints: 11 | - port: 8080 12 | interval: 5s 13 | - port: 9646 14 | interval: 5s -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/runner-manifest-template/locust-runner-service.yaml.tpl: -------------------------------------------------------------------------------- 1 | kind: Service 2 | apiVersion: v1 3 | metadata: 4 | name: locust-runner-api 5 | namespace: ${namespace} 6 | annotations: 7 | networking.gke.io/load-balancer-type: "External" 8 | labels: 9 | app: locust-runner 10 | spec: 11 | ports: 12 | - port: 8000 13 | targetPort: 8000 14 | protocol: TCP 15 | %{ for runner_endpoint_ip in runner_endpoint_ip_list ~} 16 | loadBalancerIP: ${runner_endpoint_ip} 17 | %{ endfor ~} 18 | selector: 19 | app: locust-runner 20 | type: LoadBalancer 21 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/locust-load-inference/runner-manifest-template/locust-runner.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: locust-runner 5 | namespace: ${namespace} 6 | labels: 7 | app: locust-runner 8 | examples.ai.gke.io/source: ai-on-gke-benchmarks 9 | spec: 10 | serviceAccountName: ${ksa} 11 | containers: 12 | - name: locust-runner 13 | image: ${artifact_registry}/locust-runner:latest 14 | env: 15 | - name: PROJECT_ID 16 | value: ${project_id} 17 | - name: BUCKET 18 | value: ${bucket} 19 | - name: DURATION 20 | value: ${duration} 21 | - name: USERS 22 | value: ${users} 23 | - name: RATE 24 | value: ${rate} 25 | - name: NAMESPACE 26 | value: ${namespace} -------------------------------------------------------------------------------- 
/benchmarks/benchmark/tools/model-load-benchmark/benchmarker.ini: -------------------------------------------------------------------------------- 1 | [default] 2 | MODEL_LOAD_BENCHMARK_CONFIG = base-config.yaml 3 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/config/utils.go: -------------------------------------------------------------------------------- 1 | package config 2 | 3 | import ( 4 | "strconv" 5 | "strings" 6 | ) 7 | 8 | // parseValueUnit splits value after its last digit: the part up to and including that digit is parsed with strconv.Atoi and the trailing non-digit remainder is returned as the unit, e.g. "10Gi" yields (10, "Gi"). The Atoi error is discarded, so an unparsable numeric part such as "1.5" yields 0. 9 | func parseValueUnit(value string) (int, string) { 10 | numStr := strings.TrimRightFunc(value, func(r rune) bool { 11 | return r < '0' || r > '9' 12 | }) 13 | unit := strings.TrimPrefix(value, numStr) 14 | num, _ := strconv.Atoi(numStr) 15 | return num, unit 16 | } 17 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: pod-creator-binding 5 | subjects: 6 | - kind: User 7 | name: gke_kunjanp-gke-dev-2_us-west4_gpu-dev-cluster 8 | apiGroup: rbac.authorization.k8s.io 9 | roleRef: 10 | kind: ClusterRole 11 | name: cluster-admin 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | pyyaml 3 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_cpu_request.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_cpu_request.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_download_chunk_size_mb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_download_chunk_size_mb.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_ephemeral_storage_request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_ephemeral_storage_request.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_max_parallel_downloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_max_parallel_downloads.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_memory_request.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_memory_request.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_parallel_downloads_per_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/benchmarks/benchmark/tools/model-load-benchmark/results/elapsed_time_vs_parallel_downloads_per_file.png -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/model-load-benchmark/volumeAttributes.yaml: -------------------------------------------------------------------------------- 1 | volumeAttributes: 2 | bucketName: BUCKET_NAME 3 | mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:4,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:3" 4 | fileCacheCapacity: "-1" 5 | fileCacheForRangeRead: "true" 6 | metadataStatCacheCapacity: "-1" 7 | metadataTypeCacheCapacity: "-1" 8 | metadataCacheTTLSeconds: "600" 9 | -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/profile-generator/build.tf: -------------------------------------------------------------------------------- 1 | resource "null_resource" "build_and_push_image" { 2 | count = var.build_latency_profile_generator_image ? 
1 : 0 3 | depends_on = [resource.google_project_service.cloudbuild] 4 | provisioner "local-exec" { 5 | working_dir = path.module 6 | command = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container" 7 | } 8 | } -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/profile-generator/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.20-slim-bookworm as dev 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y python3-pip git vim curl wget 5 | RUN pip3 install --upgrade pip 6 | WORKDIR /workspace 7 | 8 | # install build and runtime dependencies 9 | COPY requirements.txt requirements.txt 10 | RUN pip install -r requirements.txt 11 | 12 | RUN pip install -U "huggingface_hub[cli]" 13 | 14 | RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 15 | 16 | COPY benchmark_serving.py benchmark_serving.py 17 | COPY latency_throughput_curve.sh latency_throughput_curve.sh 18 | 19 | RUN chmod +x latency_throughput_curve.sh 20 | RUN chmod +x benchmark_serving.py -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/profile-generator/container/requirements.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | toml==0.10.2 4 | ruff==0.1.5 5 | 6 | # type checking 7 | mypy==0.991 8 | types-PyYAML 9 | types-requests 10 | types-setuptools 11 | 12 | # testing 13 | pytest 14 | pytest-forked 15 | pytest-asyncio 16 | httpx 17 | einops # required for MPT 18 | openai 19 | requests 20 | 21 | # run 22 | ninja # For faster builds. 23 | psutil 24 | ray >= 2.9 25 | sentencepiece # Required for LLaMA tokenizer. 
26 | numpy < 2.0 27 | torch == 2.6.0 28 | transformers >= 4.42.0 # Required for Qwen2 29 | xformers == 0.0.23 30 | fastapi 31 | uvicorn[standard] 32 | pydantic >= 2.0 # Required for OpenAI server. 33 | aioprometheus[starlette] 34 | pynvml == 11.5.0 35 | accelerate 36 | aiohttp 37 | google-auth 38 | google-cloud-storage >= 2.18.2 39 | prometheus_client >= 0.21.0 -------------------------------------------------------------------------------- /benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: "lpg-driver-podmonitoring" 5 | namespace: ${namespace} 6 | spec: 7 | selector: 8 | matchLabels: 9 | name: latency-profile-generator 10 | endpoints: 11 | - port: 9090 12 | interval: 15s 13 | -------------------------------------------------------------------------------- /benchmarks/inference-server/README.md: -------------------------------------------------------------------------------- 1 | >[!WARNING] 2 | >This guide and associated code are **deprecated** and no longer maintained. 3 | > 4 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 5 | 6 | This directory contains the inference-server-specific setup and the 7 | Terraform templates associated with them. 8 | 9 | The currently supported options are: 10 | - Text Generation Inference (aka TGI) 11 | - TensorRT-LLM on Triton Inference Server 12 | 13 | You may also choose to manually deploy your own inference server. 
14 | 15 | To deploy an inference server, cd into the respective directory and follow 16 | the instructions in the respective README.md. -------------------------------------------------------------------------------- /benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: data-loader-7b 5 | spec: 6 | ttlSecondsAfterFinished: 30 7 | template: 8 | spec: 9 | serviceAccountName: benchmark-sa 10 | restartPolicy: Never 11 | containers: 12 | - name: inference-checkpoint 13 | image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.0 14 | args: 15 | - -b=GEMMA_BUCKET_NAME 16 | - -m=google/gemma/maxtext/7b-it/2 17 | volumeMounts: 18 | - mountPath: "/kaggle/" 19 | name: kaggle-credentials 20 | readOnly: true 21 | resources: 22 | requests: 23 | google.com/tpu: 4 24 | limits: 25 | google.com/tpu: 4 26 | nodeSelector: 27 | cloud.google.com/gke-tpu-topology: 2x2 28 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 29 | volumes: 30 | - name: kaggle-credentials 31 | secret: 32 | defaultMode: 0400 33 | secretName: kaggle-secret -------------------------------------------------------------------------------- /benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl: -------------------------------------------------------------------------------- 1 | %{ for hugging_face_token_secret in hugging_face_token_secret_list ~} 2 | 3 | apiVersion: secrets-store.csi.x-k8s.io/v1 4 | kind: SecretProviderClass 5 | metadata: 6 | name: gcp-secret-provider 7 | namespace: ${namespace} 8 | spec: 9 | provider: gcp 10 | parameters: 11 | secrets: | 12 | - resourceName: "${hugging_face_token_secret}" 13 | fileName: "secret.txt" 14 | secretObjects: 15 | - data: 16 | - key: HF_TOKEN 17 | objectName: secret.txt 18 | secretName: hf-token # name of the Kubernetes Secret object 19 | type: Opaque # 
name of the mounted content to sync. this could be the object name or the object alias 20 | 21 | %{ endfor ~} 22 | -------------------------------------------------------------------------------- /benchmarks/inference-server/text-generation-inference/autoscaling.md: -------------------------------------------------------------------------------- 1 | # Autoscaling TGI 2 | 3 | ## tl;dr 4 | 5 | Recommendation: TODO 6 | 7 | ## Autoscaling Options 8 | 9 | ### CPU 10 | 11 | CPU scaling is a poor choice for this workload - the TGI workload starts up, 12 | pulls the model weights, and then spends a minute or two's worth of CPU time 13 | crunching some numbers. This causes the HPA to add a replica, which then spends 14 | more CPU time, which causes the HPA to add another replica, etc. Eventually, things 15 | settle, and the HPA scales down the replicas. This whole process could take up to 16 | an hour. 17 | 18 | ### Custom Metrics 19 | 20 | Workload/custom metrics can be viewed in 21 | https://console.cloud.google.com/monitoring/metrics-explorer. (Just search for 22 | the metric name, e.g. "tgi_batch_current_size". 
The full name should be 23 | "prometheus/tgi_batch_current_size/gauge") 24 | 25 | #### `tgi_batch_current_size` 26 | 27 | TODO 28 | 29 | ### External Metrics 30 | 31 | TODO 32 | -------------------------------------------------------------------------------- /benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v1 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | name: tgi 5 | namespace: ${namespace} 6 | spec: 7 | scaleTargetRef: 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | name: tgi 11 | minReplicas: ${hpa_min_replicas} 12 | maxReplicas: ${hpa_max_replicas} 13 | targetCPUUtilizationPercentage: ${hpa_averagevalue_target} 14 | -------------------------------------------------------------------------------- /benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | name: tgi 5 | namespace: ${namespace} 6 | spec: 7 | scaleTargetRef: 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | name: tgi 11 | minReplicas: ${hpa_min_replicas} 12 | maxReplicas: ${hpa_max_replicas} 13 | metrics: 14 | %{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 } 15 | - type: External 16 | external: 17 | metric: 18 | name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown 19 | target: 20 | type: AverageValue 21 | averageValue: ${hpa_averagevalue_target} 22 | %{ else } 23 | - type: Pods 24 | pods: 25 | metric: 26 | name: prometheus.googleapis.com|${custom_metric_name}|gauge 27 | target: 28 | type: AverageValue 29 | averageValue: ${hpa_averagevalue_target} 30 | %{ endif } 31 | -------------------------------------------------------------------------------- 
/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference-svc.tftpl: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: tgi 19 | namespace: ${namespace} 20 | labels: 21 | app: tgi 22 | spec: 23 | type: LoadBalancer 24 | ports: 25 | - port: 80 26 | targetPort: 80 27 | protocol: TCP 28 | selector: 29 | app: tgi 30 | -------------------------------------------------------------------------------- /benchmarks/inference-server/text-generation-inference/monitoring-templates/tgi-podmonitoring.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: "tgi-podmonitoring" 5 | namespace: ${namespace} 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: tgi 10 | endpoints: 11 | - port: 80 12 | interval: 15s 13 | -------------------------------------------------------------------------------- /benchmarks/inference-server/triton/sample-terraform.tfvars: -------------------------------------------------------------------------------- 1 | credentials_config = { 2 | fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" 3 | } 4 | 5 | 
namespace = "benchmark" 6 | ksa = "benchmark-ksa" 7 | model_id = "meta-llama/Llama-2-7b-chat-hf" 8 | gpu_count = 1 9 | gcs_model_path = "" -------------------------------------------------------------------------------- /benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | name: vllm 5 | namespace: ${namespace} 6 | spec: 7 | scaleTargetRef: 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | name: vllm 11 | minReplicas: ${hpa_min_replicas} 12 | maxReplicas: ${hpa_max_replicas} 13 | metrics: 14 | - type: Pods 15 | pods: 16 | metric: 17 | name: prometheus.googleapis.com|${custom_metric_name}|gauge 18 | target: 19 | type: AverageValue 20 | averageValue: ${hpa_averagevalue_target} 21 | -------------------------------------------------------------------------------- /benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: vllm 19 | namespace: ${namespace} 20 | labels: 21 | app: vllm 22 | spec: 23 | type: LoadBalancer 24 | ports: 25 | - port: 80 26 | targetPort: 80 27 | protocol: TCP 28 | selector: 29 | app: vllm 30 | -------------------------------------------------------------------------------- /benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: "vllm-podmonitoring" 5 | namespace: ${namespace} 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: vllm 10 | endpoints: 11 | - path: /metrics 12 | port: 8000 13 | interval: 15s 14 | -------------------------------------------------------------------------------- /benchmarks/inference-server/vllm/sample-terraform.tfvars: -------------------------------------------------------------------------------- 1 | credentials_config = { 2 | fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" 3 | } 4 | 5 | namespace = "benchmark" 6 | ksa = "benchmark-ksa" 7 | model_id = "tiiuae/falcon-7b" 8 | gpu_count = 1 9 | project_id = "" 10 | 11 | # How to (horizontally) scale the workload. Allowed values are: 12 | # - Workload metrics (i.e. custom metrics): 13 | # - "vllm:gpu_cache_usage_perc" 14 | # - "vllm:num_requests_waiting" 15 | # - Other possibilities coming soon... 16 | # 17 | # See `autoscaling.md` for more details and recommendations. 18 | # hpa_type = "vllm:gpu_cache_usage_perc" 19 | 20 | # Sets the averagevalue target of the hpa metric. 
21 | # hpa_averagevalue_target = 0.95 22 | 23 | # Adjust these if you want different min/max values 24 | # hpa_min_replicas = 1 25 | # hpa_max_replicas = 5 -------------------------------------------------------------------------------- /benchmarks/infra/65k-cpu-cluster/provider.tf: -------------------------------------------------------------------------------- 1 | provider "google" { 2 | project = var.project_name 3 | } 4 | -------------------------------------------------------------------------------- /benchmarks/infra/65k-cpu-cluster/sample-tfvars/65k-sample.tfvars: -------------------------------------------------------------------------------- 1 | project_name = "$PROJECT_ID" 2 | cluster_name = "gke-benchmark" 3 | region = "us-central1" 4 | min_master_version = "1.31.2" 5 | vpc_network = "$NETWORK" 6 | node_locations = ["us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"] 7 | datapath_provider = "ADVANCED_DATAPATH" 8 | master_ipv4_cidr_block = "172.16.0.0/28" 9 | ip_cidr_range = "10.0.0.0/9" 10 | cluster_ipv4_cidr_block = "/10" 11 | services_ipv4_cidr_block = "/18" 12 | node_pool_count = 16 13 | node_pool_size = 1000 14 | initial_node_count = 250 15 | node_pool_create_timeout = "60m" -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-1/modules/gke-infra/filestore.tf: -------------------------------------------------------------------------------- 1 | resource "google_filestore_instance" "instance" { 2 | for_each = var.filestore_storage 3 | name = each.value.name 4 | 5 | project = module.project.project_id 6 | 7 | location = var.gke_location 8 | tier = each.value.tier 9 | 10 | file_shares { 11 | capacity_gb = each.value.capacity_gb 12 | name = "filestore_share" 13 | } 14 | 15 | networks { 16 | network = local.cluster_vpc.network 17 | modes = ["MODE_IPV4"] 18 | connect_mode = "DIRECT_PEERING" 19 | } 20 | } 
-------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-1/sample-tfvars/gpu-sample.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "$PROJECT_ID" 2 | cluster_name = "ai-benchmark" 3 | region = "us-central1" 4 | gke_location = "us-central1-a" 5 | prefix = "ai-benchmark" 6 | 7 | vpc_create = { 8 | name = "ai-benchmark" 9 | enable_cloud_nat = true 10 | } 11 | 12 | cluster_options = { 13 | enable_gcs_fuse_csi_driver = false 14 | enable_gcp_filestore_csi_driver = false 15 | enable_gce_persistent_disk_csi_driver = false 16 | } 17 | 18 | nodepools = { 19 | nodepool-cpu = { 20 | machine_type = "n2-standard-2", 21 | }, 22 | nodepool-gpu = { 23 | ephemeral_ssd_block_config = { 24 | ephemeral_ssd_count = 1 25 | } 26 | machine_type = "g2-standard-16", 27 | guest_accelerator = { 28 | type = "nvidia-l4", 29 | count = 1, 30 | gpu_driver = { 31 | version = "LATEST" 32 | } 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-1/sample-tfvars/jetstream-sample.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "PROJECT_ID" 2 | cluster_name = "ai-benchmark" 3 | region = "us-east1" 4 | gke_location = "us-east1-c" 5 | prefix = "ai-benchmark" 6 | spot_vms = true 7 | 8 | vpc_create = { 9 | name = "ai-benchmark" 10 | enable_cloud_nat = true 11 | } 12 | 13 | cluster_options = { 14 | enable_gcs_fuse_csi_driver = false 15 | enable_gcp_filestore_csi_driver = false 16 | enable_gce_persistent_disk_csi_driver = false 17 | } 18 | 19 | nodepools = { 20 | nodepool-tpu = { 21 | machine_type = "ct5lp-hightpu-4t", 22 | spot = true, 23 | }, 24 | nodepool-cpu = { 25 | machine_type = "n2-standard-2", 26 | }, 27 | } 28 | -------------------------------------------------------------------------------- 
/benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/gcs-fuse/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | output "created_resources" { 18 | description = "IDs of the resources created, if any." 19 | value = merge( 20 | var.bucket_create ? { # report the bucket only when bucket_create is set 21 | bucket_name = module.gcs-fuse-bucket.name 22 | bucket_location = module.gcs-fuse-bucket.location 23 | } : {} 24 | ) 25 | } 26 | -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/output-benchmark/outputs.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | output "created_resources" { 18 | description = "IDs of the resources created, if any." 19 | value = merge( 20 | { 21 | bucket_name = module.gcs-result-bucket.name 22 | benchmark_tool_runner_endpoint = resource.google_compute_address.benchmark-tool-runner-endpoint.address 23 | } 24 | ) 25 | } -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/csi-driver/csidriver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: storage.k8s.io/v1 2 | kind: CSIDriver 3 | metadata: 4 | name: secrets-store.csi.k8s.io 5 | spec: 6 | podInfoOnMount: true 7 | attachRequired: false 8 | volumeLifecycleModes: 9 | - Ephemeral -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/csi-driver/rbac-secretprovidersyncing.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: secretprovidersyncing-role 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - secrets 10 | verbs: 11 | - create 12 | - delete 13 | - get 14 | - list 15 | - patch 16 | - update 17 | - watch 18 | --- 19 | apiVersion: rbac.authorization.k8s.io/v1 20 | kind: ClusterRoleBinding 21 | metadata: 22 | name: secretprovidersyncing-rolebinding 23 | roleRef: 24 | apiGroup: rbac.authorization.k8s.io 25 | kind: ClusterRole 26 | name: secretprovidersyncing-role 27 | subjects: 28 | - kind: ServiceAccount 29 | name: secrets-store-csi-driver 30 | namespace: kube-system -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/modules/secret-manager/outputs.tf: 
-------------------------------------------------------------------------------- 1 | output "created_resources" { 2 | description = "IDs of the resources created, if any." 3 | value = { 4 | secret = module.secret-manager.ids 5 | } 6 | } -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/modules/gke-setup/outputs.tf: -------------------------------------------------------------------------------- 1 | output "created_resources" { 2 | description = "IDs of the resources created, if any." 3 | value = merge( 4 | var.secret_create == true ? module.secret-manager[0].created_resources : {}, 5 | #var.gcs_fuse_create == true ? module.gcs-fuse[0].created_resources : {}, 6 | var.workload_identity_create == true ? module.workload-identity[0].created_resources : {}, 7 | #var.nvidia_dcgm_create == true ? module.nvidia-dcgm.created_resources : {} 8 | module.output-benchmark.created_resources, 9 | ) 10 | } 11 | -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/outputs.tf: -------------------------------------------------------------------------------- 1 | output "created_resources" { 2 | description = "Created resources" 3 | value = module.gke-setup 4 | } 5 | -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/sample-tfvars/gpu-sample.tfvars: -------------------------------------------------------------------------------- 1 | # can be obtained from stage-1 by running: 2 | # terraform output -json | jq '."fleet_host".value' 3 | credentials_config = { 4 | fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" 5 | } 6 | 7 | # can be obtained from stage-1 by running: 8 | # terraform output -json | jq '."project_id".value' 9 | project_id = "$PROJECT_ID" 10 | 11 | bucket_name = 
"${PROJECT_ID}-ai-gke-benchmark-fuse" 12 | bucket_location = "US" 13 | 14 | output_bucket_name = "${PROJECT_ID}-benchmark-output" 15 | output_bucket_location = "US" 16 | 17 | google_service_account = "benchmark-sa" 18 | kubernetes_service_account = "benchmark-ksa" 19 | 20 | benchmark_runner_google_service_account = "sample-runner-sa" 21 | benchmark_runner_kubernetes_service_account = "sample-runner-ksa" 22 | -------------------------------------------------------------------------------- /benchmarks/infra/accelerator-cluster/stage-2/sample-tfvars/jetstream-sample.tfvars: -------------------------------------------------------------------------------- 1 | # can be obtained from stage-1 by running: 2 | # terraform output -json | jq '."fleet_host".value' 3 | credentials_config = { 4 | fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" 5 | } 6 | 7 | # can be obtained from stage-1 by running: 8 | # terraform output -json | jq '."project_id".value' 9 | project_id = "PROJECT_ID" 10 | 11 | bucket_name = "${PROJECT_ID}-model-repo-bucket-01" 12 | bucket_location = "US" 13 | 14 | output_bucket_name = "${PROJECT_ID}-benchmark-output-bucket-01" 15 | output_bucket_location = "US" 16 | 17 | google_service_account = "benchmark-sa-01" 18 | kubernetes_service_account = "benchmark-sa" 19 | 20 | benchmark_runner_google_service_account = "sample-runner-sa-01" 21 | benchmark_runner_kubernetes_service_account = "sample-runner-sa" 22 | 23 | nvidia_dcgm_create = "false" 24 | namespace = "default" 25 | namespace_create = false 26 | gcs_fuse_create = true 27 | 28 | -------------------------------------------------------------------------------- /benchmarks/orchestration/README.md: -------------------------------------------------------------------------------- 1 | # AI on GKE Benchmark Framework Orchestration 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 
5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 7 | 8 | ## Pre-requisites 9 | * terraform 10 | * jq 11 | * sed 12 | 13 | ### Configuration 14 | Configuration is split across two folders: `config`, which holds the files you need to modify, and `templates`, which holds files that are filled in automatically based on outputs from previous stages. 15 | 16 | ### Running scripts 17 | After you have filled in the configuration in the `config` folder, run ``text-generation-inference-apply.sh``, which will run the stage-1, stage-2 and text-generation-inference stages. 18 | 19 | To destroy the resources that have been created, run ``text-generation-inference-destroy.sh``, which will destroy text-generation-inference, stage-2 and stage-1, in that order. 20 | -------------------------------------------------------------------------------- /benchmarks/orchestration/config/stage-1.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "example-project-id" 2 | cluster_name = "test-00" 3 | region = "us-central1" 4 | gke_location = "us-central1-a" 5 | enable_private_endpoint = false 6 | 7 | vpc_create = { 8 | enable_cloud_nat = true 9 | } 10 | 11 | cluster_options = { 12 | enable_gcs_fuse_csi_driver = true 13 | enable_gcp_filestore_csi_driver = true 14 | enable_gce_persistent_disk_csi_driver = true 15 | } 16 | 17 | nodepools = { 18 | nodepool-cpu = { 19 | machine_type = "n2-standard-2", 20 | }, 21 | nodepool-gpu = { 22 | machine_type = "g2-standard-4", 23 | guest_accelerator = { 24 | type = "nvidia-l4", 25 | count = 1, 26 | gpu_driver = { 27 | version = "LATEST" 28 | } 29 | } 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /benchmarks/orchestration/config/stage-2.tfvars: -------------------------------------------------------------------------------- 1 | bucket_name = "ai-gke-benchmark-fuse-demo" 2 | bucket_location = "US" 3 | 4 | 
secret_name = "hugging_face_secret" 5 | secret_location = "us-central1" 6 | -------------------------------------------------------------------------------- /benchmarks/orchestration/config/text-generation-inference.tfvars: -------------------------------------------------------------------------------- 1 | model_id = "bigscience/bloom-560m" 2 | hugging_face_secret_version = "1" 3 | -------------------------------------------------------------------------------- /benchmarks/orchestration/templates/stage-2.auto.tfvars.tpl: -------------------------------------------------------------------------------- 1 | # can be obtained from stage-1 by running: 2 | # terraform output -json | jq '."fleet_host".value' 3 | credentials_config = { 4 | fleet_host = FLEET_HOST 5 | } 6 | 7 | #terraform output -json | jq '."project_id".value' 8 | project_id = PROJECT_ID 9 | -------------------------------------------------------------------------------- /benchmarks/orchestration/templates/text-generation-inference.auto.tfvars.tpl: -------------------------------------------------------------------------------- 1 | credentials_config = { 2 | fleet_host = FLEET_HOST 3 | } 4 | 5 | #terraform output -json | jq '."project_id".value' 6 | project_id = PROJECT_ID 7 | 8 | hugging_face_secret = HUGGING_FACE_SECRET 9 | 10 | namespace = NAMESPACE_NAME 11 | ksa = KSA_NAME 12 | -------------------------------------------------------------------------------- /benchmarks/orchestration/text-generation-inference-destroy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | cd ../inference-server/text-generation-inference/ || exit 6 | terraform destroy -auto-approve 7 | 8 | cd ../../infra/accelerator-cluster/stage-2/ || exit # infra stages live under infra/accelerator-cluster/ 9 | terraform destroy -auto-approve 10 | 11 | cd ../stage-1/ || exit 12 | terraform destroy -auto-approve 13 | -------------------------------------------------------------------------------- 
/best-practices/gke-batch-refarch/README.md: -------------------------------------------------------------------------------- 1 | # Reference Architecture: Batch Processing Platform on GKE 2 | 3 | >[!WARNING] 4 | >The files for the Batch Processing Platform on GKE example have been moved to the [AI-on-GKE/batch-reference-architecture](https://github.com/ai-on-gke/batch-reference-architecture) repository. Please refer to that repository for the latest updates and instructions. 5 | -------------------------------------------------------------------------------- /best-practices/ml-platform/README.md: -------------------------------------------------------------------------------- 1 | # Moved to the [GoogleCloudPlatform/accelerated-platforms](https://github.com/GoogleCloudPlatform/accelerated-platforms/blob/main/docs/platforms/gke-aiml/README.md) repository which is included as a submodule in the [/best-practices](/best-practices) folder 2 | 3 | ``` 4 | git clone --recurse-submodules https://github.com/GoogleCloudPlatform/ai-on-gke.git 5 | cd ai-on-gke/best-practices/accelerated-platforms 6 | ``` 7 | -------------------------------------------------------------------------------- /charts/gmp-engine/charts/gmp-frontend/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ .Values.name }} 5 | labels: 6 | app: {{ .Values.name }} 7 | spec: 8 | clusterIP: None 9 | ports: 10 | - name: web 11 | port: 9090 12 | selector: 13 | app: {{ .Values.name }} 14 | -------------------------------------------------------------------------------- /charts/gmp-engine/charts/gmp-frontend/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for gmp-frontend. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | name: "gmp-frontend" 6 | projectID: "" 7 | serviceAccount: "" 8 | 9 | image: 10 | repository: gke.gcr.io/prometheus-engine/frontend 11 | pullPolicy: IfNotPresent 12 | tag: "v0.5.0-gke.0" 13 | 14 | replicaCount: 2 15 | 16 | cpu: "1m" 17 | memory: "5Mi" 18 | -------------------------------------------------------------------------------- /charts/gmp-engine/templates/podmonitoring.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | {{- range $pm := .Values.podMonitoring }} 17 | apiVersion: monitoring.googleapis.com/v1 18 | kind: PodMonitoring 19 | metadata: 20 | name: {{ $pm.name}} 21 | spec: 22 | selector: 23 | matchLabels: 24 | {{- $pm.selector | toYaml | nindent 8 }} 25 | endpoints: 26 | - port: {{ $pm.port }} 27 | interval: {{ $pm.interval }} 28 | --- 29 | {{- end }} 30 | 31 | -------------------------------------------------------------------------------- /charts/gmp-engine/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for gmp-engine. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | podMonitoring: [] 6 | 7 | gmp-frontend: 8 | enabled: false 9 | projectID: "" 10 | serviceAccount: "" 11 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2025 NVIDIA CORPORATION 2 | 3 | This product includes software developed at 4 | NVIDIA CORPORATION (https://nvidia.com). 5 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "nvidia-dra-driver-gpu.name" . }}-role-binding 6 | namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} 7 | subjects: 8 | - kind: ServiceAccount 9 | name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }} 10 | namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} 11 | roleRef: 12 | kind: ClusterRole 13 | name: {{ include "nvidia-dra-driver-gpu.name" . 
}}-role 14 | apiGroup: rbac.authorization.k8s.io 15 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.resources.computeDomains.enabled }} 2 | --- 3 | apiVersion: resource.k8s.io/v1beta1 4 | kind: DeviceClass 5 | metadata: 6 | name: compute-domain-daemon.nvidia.com 7 | spec: 8 | selectors: 9 | - cel: 10 | expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'" 11 | {{- end }} 12 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.resources.computeDomains.enabled }} 2 | --- 3 | apiVersion: resource.k8s.io/v1beta1 4 | kind: DeviceClass 5 | metadata: 6 | name: compute-domain-default-channel.nvidia.com 7 | spec: 8 | selectors: 9 | - cel: 10 | expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0" 11 | {{- end }} 12 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/deviceclass-gpu.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.resources.gpus.enabled }} 2 | --- 3 | apiVersion: resource.k8s.io/v1beta1 4 | kind: DeviceClass 5 | metadata: 6 | name: gpu.nvidia.com 7 | spec: 8 | selectors: 9 | - cel: 10 | expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" 11 | {{- end }} 12 | -------------------------------------------------------------------------------- 
/charts/nvidia-dra-driver-gpu/templates/deviceclass-mig.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.resources.gpus.enabled }} 2 | --- 3 | apiVersion: resource.k8s.io/v1beta1 4 | kind: DeviceClass 5 | metadata: 6 | name: mig.nvidia.com 7 | spec: 8 | selectors: 9 | - cel: 10 | expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'" 11 | {{- end }} 12 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/openshiftprivilegedrolebinging.yaml: -------------------------------------------------------------------------------- 1 | # Apply only when running on OpenShift to let the kubelet plugin run privileged 2 | {{- if .Capabilities.APIVersions.Has "security.openshift.io/v1/SecurityContextConstraints" -}} 3 | --- 4 | apiVersion: rbac.authorization.k8s.io/v1 5 | kind: RoleBinding 6 | metadata: 7 | name: {{ include "nvidia-dra-driver-gpu.name" . }}-openshift-privileged-role-binding 8 | namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} 9 | subjects: 10 | - kind: ServiceAccount 11 | name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }} 12 | namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} 13 | roleRef: 14 | kind: ClusterRole 15 | name: system:openshift:scc:privileged 16 | apiGroup: rbac.authorization.k8s.io 17 | {{- end }} 18 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }} 6 | namespace: {{ include "nvidia-dra-driver-gpu.namespace" . }} 7 | labels: 8 | {{- include "nvidia-dra-driver-gpu.labels" . 
| nindent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /charts/nvidia-dra-driver-gpu/templates/validatingadmissionpolicybinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }} 5 | spec: 6 | policyName: resourceslices-policy-{{ include "nvidia-dra-driver-gpu.name" . }} 7 | validationActions: [Deny] 8 | # All ResourceSlices are matched. 9 | -------------------------------------------------------------------------------- /charts/tpu-dra-driver/README.md: -------------------------------------------------------------------------------- 1 | # TPU DRA driver 2 | 3 | This Helm chart runs the TPU DRA driver on GKE. The driver is currently in Private Preview. 4 | 5 | ## Overview 6 | 7 | The TPU DRA driver is only supported on GKE cluster version 1.32+. 8 | Make sure to disable the default tpu-device-plugin on the nodes. This can be done by adding the node labels 9 | `gke-no-default-tpu-device-plugin=true` and `gke-no-default-tpu-dra-plugin=true` when creating the node pool. 10 | 11 | Run `./install-tpu-dra-driver.sh` to install tpu-dra-driver on your GKE cluster 12 | nodes with TPU resources. -------------------------------------------------------------------------------- /charts/tpu-dra-driver/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "tpu-dra-driver.fullname" . }}-role 6 | namespace: {{ include "tpu-dra-driver.namespace" . 
}} 7 | rules: 8 | - apiGroups: ["resource.k8s.io"] 9 | resources: ["resourceclaims"] 10 | verbs: ["get"] 11 | - apiGroups: [""] 12 | resources: ["nodes"] 13 | verbs: ["get"] 14 | - apiGroups: ["resource.k8s.io"] 15 | resources: ["resourceslices"] 16 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 17 | -------------------------------------------------------------------------------- /charts/tpu-dra-driver/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "tpu-dra-driver.fullname" . }}-role-binding 6 | namespace: {{ include "tpu-dra-driver.namespace" . }} 7 | subjects: 8 | - kind: ServiceAccount 9 | name: {{ include "tpu-dra-driver.serviceAccountName" . }} 10 | namespace: {{ include "tpu-dra-driver.namespace" . }} 11 | roleRef: 12 | kind: ClusterRole 13 | name: {{ include "tpu-dra-driver.fullname" . }}-role 14 | apiGroup: rbac.authorization.k8s.io 15 | -------------------------------------------------------------------------------- /charts/tpu-dra-driver/templates/deviceclass.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: resource.k8s.io/v1beta1 2 | kind: DeviceClass 3 | metadata: 4 | name: tpu.google.com 5 | spec: 6 | selectors: 7 | - cel: 8 | expression: device.driver == "tpu.google.com" -------------------------------------------------------------------------------- /charts/tpu-dra-driver/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "tpu-dra-driver.serviceAccountName" . }} 6 | namespace: {{ include "tpu-dra-driver.namespace" . }} 7 | labels: 8 | {{- include "tpu-dra-driver.labels" . 
| nindent 4 }} 9 | {{- with .Values.serviceAccount.annotations }} 10 | annotations: 11 | {{- toYaml . | nindent 4 }} 12 | {{- end }} 13 | {{- end }} 14 | -------------------------------------------------------------------------------- /charts/tpu-dra-driver/templates/validatingadmissionpolicybinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: admissionregistration.k8s.io/v1 2 | kind: ValidatingAdmissionPolicyBinding 3 | metadata: 4 | name: resourceslices-policy-{{ include "tpu-dra-driver.fullname" . }} 5 | spec: 6 | policyName: resourceslices-policy-{{ include "tpu-dra-driver.fullname" . }} 7 | validationActions: [Deny] 8 | # All ResourceSlices are matched. 9 | -------------------------------------------------------------------------------- /gke-batch-refarch/README.md: -------------------------------------------------------------------------------- 1 | # Moved to [best-practices/gke-batch-refarch](/best-practices/gke-batch-refarch) 2 | -------------------------------------------------------------------------------- /infrastructure/backend.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # terraform { 16 | # backend "gcs" { 17 | # bucket = "BUCKET_NAME" 18 | # prefix = "terraform/state" 19 | # } 20 | # } -------------------------------------------------------------------------------- /infrastructure/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | # Creating Autopilot using GKE submodule is broken in v6.2.0. 23 | version = ">= 5.40.0, <= 6.1.0" 24 | } 25 | helm = { 26 | source = "hashicorp/helm" 27 | version = "~> 2.8.0" 28 | } 29 | kubernetes = { 30 | source = "hashicorp/kubernetes" 31 | version = "2.18.1" 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /jupyter-on-gke: -------------------------------------------------------------------------------- 1 | applications/jupyter -------------------------------------------------------------------------------- /modules/cloudsql/README.md: -------------------------------------------------------------------------------- 1 | # CloudSQL 2 | This module contains a Terraform template for creating a CloudSQL instance. 3 | 4 | ## Usage 5 | 6 | 1. Edit `variables.tf` with your GCP settings. 7 | 2. 
Run `terraform init` and `terraform apply` 8 | 3. Create an IAM service account and grant it the Cloud SQL Client role: 9 | ``` 10 | gcloud projects add-iam-policy-binding {PROJECT_ID} \ 11 | --member=serviceAccount:{SA_ACCOUNT}.iam.gserviceaccount.com \ 12 | --role="roles/cloudsql.client" 13 | ``` 14 | 15 | Note: Ensure that the regional subnet that is used (referenced by `network_name`) has [Private Service Connect](https://cloud.google.com/vpc/docs/private-service-connect) enabled. 16 | 17 | See the [sample RAG application](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md) for example usage of the created instance. -------------------------------------------------------------------------------- /modules/cloudsql/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | kubernetes = { 21 | source = "hashicorp/kubernetes" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.custom.metrics.k8s.io.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: apiregistration.k8s.io/v1 2 | kind: APIService 3 | metadata: 4 | name: v1beta1.custom.metrics.k8s.io 5 | spec: 6 | insecureSkipTLSVerify: true 7 | group: custom.metrics.k8s.io 8 | groupPriorityMinimum: 100 9 | versionPriority: 100 10 | service: 11 | name: custom-metrics-stackdriver-adapter 12 | namespace: custom-metrics 13 | version: v1beta1 14 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta1.external.metrics.k8s.io.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: apiregistration.k8s.io/v1 2 | kind: APIService 3 | metadata: 4 | name: v1beta1.external.metrics.k8s.io 5 | spec: 6 | insecureSkipTLSVerify: true 7 | group: external.metrics.k8s.io 8 | groupPriorityMinimum: 100 9 | versionPriority: 100 10 | service: 11 | name: custom-metrics-stackdriver-adapter 12 | namespace: custom-metrics 13 | version: v1beta1 14 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/apiservice_v1beta2.custom.metrics.k8s.io.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: apiregistration.k8s.io/v1 2 | kind: APIService 3 | metadata: 4 | name: v1beta2.custom.metrics.k8s.io 5 | spec: 6 | insecureSkipTLSVerify: true 7 | group: custom.metrics.k8s.io 8 | groupPriorityMinimum: 100 9 | versionPriority: 200 10 | service: 11 | 
name: custom-metrics-stackdriver-adapter 12 | namespace: custom-metrics 13 | version: v1beta2 14 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/clusterrole_custom-metrics-resource-reader.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: custom-metrics-resource-reader 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - "pods" 10 | - "nodes" 11 | - "nodes/stats" 12 | verbs: 13 | - list 14 | - get 15 | - watch 16 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics-resource-reader.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: custom-metrics-resource-reader 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: view 9 | subjects: 10 | - kind: ServiceAccount 11 | name: ${cmsa-serviceaccount-name} 12 | namespace: custom-metrics 13 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_custom-metrics:system:auth-delegator.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: custom-metrics:system:auth-delegator 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: system:auth-delegator 9 | subjects: 10 | - kind: ServiceAccount 11 | name: ${cmsa-serviceaccount-name} 12 | namespace: custom-metrics 13 | -------------------------------------------------------------------------------- 
/modules/custom-metrics-stackdriver-adapter/templates/clusterrolebinding_external-metrics-reader.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: external-metrics-reader 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: external-metrics-reader 9 | subjects: 10 | - kind: ServiceAccount 11 | name: horizontal-pod-autoscaler 12 | namespace: kube-system 13 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/rolebinding_custom-metrics-auth-reader.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: custom-metrics-auth-reader 5 | namespace: kube-system 6 | roleRef: 7 | apiGroup: rbac.authorization.k8s.io 8 | kind: Role 9 | name: extension-apiserver-authentication-reader 10 | subjects: 11 | - kind: ServiceAccount 12 | name: ${cmsa-serviceaccount-name} 13 | namespace: custom-metrics 14 | -------------------------------------------------------------------------------- /modules/custom-metrics-stackdriver-adapter/templates/service_custom-metrics-stackdriver-adapter.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | run: custom-metrics-stackdriver-adapter 6 | k8s-app: custom-metrics-stackdriver-adapter 7 | kubernetes.io/cluster-service: 'true' 8 | kubernetes.io/name: Adapter 9 | name: custom-metrics-stackdriver-adapter 10 | namespace: custom-metrics 11 | spec: 12 | ports: 13 | - port: 443 14 | protocol: TCP 15 | targetPort: 443 16 | selector: 17 | run: custom-metrics-stackdriver-adapter 18 | k8s-app: custom-metrics-stackdriver-adapter 19 | type: ClusterIP 
-------------------------------------------------------------------------------- /modules/gcp-network/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "network_name" { 16 | value = google_compute_network.network.name 17 | } 18 | 19 | output "subnets_names" { 20 | value = [for sb in google_compute_subnetwork.subnetwork : sb.name] 21 | } 22 | 23 | output "subnets_ips" { 24 | value = [for sb in google_compute_subnetwork.subnetwork : sb.ip_cidr_range] 25 | } -------------------------------------------------------------------------------- /modules/gcp-network/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /modules/gcs/README.md: -------------------------------------------------------------------------------- 1 | # GCS bucket used in the RAG on GKE demo 2 | 3 | This module contains a Terraform template for creating the GCS bucket used 4 | in the RAG on GKE demo. 5 | -------------------------------------------------------------------------------- /modules/gcs/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | resource "google_storage_bucket" "static" { 16 | name = var.bucket_name 17 | location = var.region 18 | storage_class = "STANDARD" 19 | uniform_bucket_level_access = true 20 | force_destroy = true 21 | public_access_prevention = "enforced" 22 | } 23 | -------------------------------------------------------------------------------- /modules/gcs/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_id" { 16 | type = string 17 | description = "GCP project id" 18 | } 19 | 20 | variable "region" { 21 | type = string 22 | description = "GCS bucket region" 23 | default = "us-central1" 24 | } 25 | 26 | variable "bucket_name" { 27 | type = string 28 | description = "GCS bucket name" 29 | } -------------------------------------------------------------------------------- /modules/gcs/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /modules/gke-autopilot-private-cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "cluster" { 16 | value = module.gke 17 | } 18 | 19 | output "endpoint" { 20 | value = module.gke.endpoint 21 | } 22 | 23 | output "ca_certificate" { 24 | value = module.gke.ca_certificate 25 | } 26 | 27 | output "service_account" { 28 | value = module.gke.service_account 29 | } 30 | -------------------------------------------------------------------------------- /modules/gke-autopilot-private-cluster/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /modules/gke-autopilot-public-cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "cluster" { 16 | value = module.gke 17 | } 18 | 19 | output "endpoint" { 20 | value = module.gke.endpoint 21 | } 22 | 23 | output "ca_certificate" { 24 | value = module.gke.ca_certificate 25 | } 26 | 27 | output "service_account" { 28 | value = module.gke.service_account 29 | } 30 | -------------------------------------------------------------------------------- /modules/gke-standard-private-cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "cluster" { 16 | value = module.gke 17 | } 18 | 19 | output "endpoint" { 20 | value = module.gke.endpoint 21 | } 22 | 23 | output "ca_certificate" { 24 | value = module.gke.ca_certificate 25 | } 26 | 27 | output "service_account" { 28 | value = module.gke.service_account 29 | } 30 | -------------------------------------------------------------------------------- /modules/gke-standard-private-cluster/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /modules/gke-standard-public-cluster/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "cluster" { 16 | value = module.gke 17 | } 18 | 19 | output "endpoint" { 20 | value = module.gke.endpoint 21 | } 22 | 23 | output "ca_certificate" { 24 | value = module.gke.ca_certificate 25 | } 26 | 27 | output "service_account" { 28 | value = module.gke.service_account 29 | } 30 | -------------------------------------------------------------------------------- /modules/iap/charts/iap/templates/backend-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: cloud.google.com/v1 16 | kind: BackendConfig 17 | metadata: 18 | name: {{ .Values.iap.backendConfig.name }} 19 | spec: 20 | iap: 21 | enabled: true 22 | oauthclientCredentials: 23 | secretName: {{ .Values.iap.secret.name }} 24 | -------------------------------------------------------------------------------- /modules/iap/charts/iap/templates/iap-secret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | apiVersion: v1 17 | kind: Secret 18 | metadata: 19 | name: {{ .Values.iap.secret.name }} 20 | data: 21 | client_id: {{ .Values.iap.secret.client_id }} 22 | client_secret: {{ .Values.iap.secret.client_secret }} -------------------------------------------------------------------------------- /modules/iap/charts/iap/templates/managed-cert.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ## ignore template if domain is empty 15 | {{- if .Values.iap.managedCertificate.domain }} 16 | 17 | apiVersion: networking.gke.io/v1 18 | kind: ManagedCertificate 19 | metadata: 20 | name: {{ .Values.iap.managedCertificate.name }} 21 | spec: 22 | domains: 23 | - {{ .Values.iap.managedCertificate.domain }} 24 | 25 | {{- end -}} 26 | -------------------------------------------------------------------------------- /modules/iap/charts/iap/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for iap_jupyter. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 
4 | 5 | 6 | iap: 7 | backendConfig: 8 | name: "iap-config-default" 9 | 10 | secret: 11 | name: "iap-secret" 12 | client_id: "" 13 | client_secret: "" 14 | 15 | managedCertificate: 16 | name: "iap-managed-cert" 17 | domain: "" 18 | 19 | ingress: 20 | name: "iap-ingress" 21 | staticIpName: "xyz" 22 | backendServiceName: "proxy-public" 23 | backendServicePort: 80 24 | 25 | 26 | -------------------------------------------------------------------------------- /modules/iap/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "domain" { 16 | value = local.domain 17 | } 18 | 19 | output "ip_address" { 20 | value = google_compute_global_address.ip_address.address 21 | } -------------------------------------------------------------------------------- /modules/iap/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | } 23 | kubernetes = { 24 | source = "hashicorp/kubernetes" 25 | } 26 | } 27 | } -------------------------------------------------------------------------------- /modules/inference-service/README.md: -------------------------------------------------------------------------------- 1 | # Inference Service 2 | This module is currently designed specifically for the Mistral-7B-Instruct-v0.1 model. Future developments will expand the module to support the creation of customized models more broadly. 3 | -------------------------------------------------------------------------------- /modules/inference-service/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | } 23 | kubernetes = { 24 | source = "hashicorp/kubernetes" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /modules/jetstream-maxtext-deployment/templates/podmonitoring-tpu.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: tpu-metrics-exporter 5 | namespace: kube-system 6 | labels: 7 | k8s-app: tpu-device-plugin 8 | spec: 9 | endpoints: 10 | - port: 2112 11 | interval: ${metrics_scrape_interval}s 12 | selector: 13 | matchLabels: 14 | k8s-app: tpu-device-plugin -------------------------------------------------------------------------------- /modules/jetstream-maxtext-deployment/templates/podmonitoring.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.googleapis.com/v1 2 | kind: PodMonitoring 3 | metadata: 4 | name: jetstream-podmonitoring 5 | namespace: default 6 | spec: 7 | endpoints: 8 | - interval: ${metrics_scrape_interval}s 9 | path: "/" 10 | port: ${metrics_port} 11 | targetLabels: 12 | metadata: 13 | - pod 14 | - container 15 | - node -------------------------------------------------------------------------------- /modules/jetstream-maxtext-deployment/templates/prometheus-adapter/hpa.jetstream.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: autoscaling/v2 2 | kind: HorizontalPodAutoscaler 3 | metadata: 4 | name: jetstream-hpa 5 | namespace: default 6 | spec: 7 | scaleTargetRef: 8 | apiVersion: apps/v1 9 | kind: Deployment 10 | name: maxengine-server 11 | minReplicas: ${hpa_min_replicas} 12 | maxReplicas: ${hpa_max_replicas} 13 | metrics: 14 | %{ for rule in rules } 15 | - 
type: External 16 | external: 17 | metric: 18 | name: ${rule.target_query} 19 | target: 20 | type: AverageValue 21 | averageValue: ${rule.average_value_target} 22 | %{ endfor ~} -------------------------------------------------------------------------------- /modules/jetstream-maxtext-deployment/templates/service.yaml.tftpl: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: jetstream-svc 5 | namespace: default 6 | spec: 7 | selector: 8 | app: maxengine-server 9 | ports: 10 | - protocol: TCP 11 | name: jetstream-http 12 | port: 8000 13 | targetPort: 8000 14 | - protocol: TCP 15 | name: jetstream-grpc 16 | port: 9000 17 | targetPort: 9000 -------------------------------------------------------------------------------- /modules/jupyter/authentication/authenticator/gcpiapjwtauthenticator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# to build, run `gcloud builds submit --config cloudbuild.yaml .` in directory
steps:
# Pull the base image before building. The tag is kept in step with the FROM
# line of the Dockerfile in this directory (jupyterhub/k8s-hub:3.3.0); it was
# previously 3.0.0, an image the build never uses.
- name: 'gcr.io/cloud-builders/docker'
  args: [ 'pull', 'docker.io/jupyterhub/k8s-hub:3.3.0' ]
# NOTE(review): the '/' image references below look like unfilled
# repository/image placeholders; set them before submitting the build.
- name: 'gcr.io/cloud-builders/docker'
  args: [ 'build', '-t', '/', '.' ]
images:
- '/'
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/modules/jupyter/images/oauth_consent_screenshot.png -------------------------------------------------------------------------------- /modules/jupyter/jupyter_image/notebook_image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/tensorflow-notebook:python-3.10 2 | COPY requirements.txt ./requirements.txt 3 | RUN pip install --no-cache-dir -r ./requirements.txt 4 | -------------------------------------------------------------------------------- /modules/jupyter/jupyter_image/notebook_image/README.md: -------------------------------------------------------------------------------- 1 | To build a new jupyter notebook image and use it for the RAG QSS: 2 | 1. Update the cloudbuild.yaml with the new image tag. 3 | 4 | The image tag should follow the pattern `sample-public-image-v-rag`. The prefix `sample-public-image-` is needed so that the images are internally considered vulnerability-remediated and no more bugs are filed for them. 5 | 2. Then in this path, run: 6 | 7 | `gcloud config set project ai-on-gke` 8 | 9 | `gcloud builds submit --config cloudbuild.yaml .` 10 | 11 | This will build and push the new image to the registry `us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke` 12 | 3. Update the `notebook_image_tag` in `/applications/rag/main.tf` to the new image tag. 13 | -------------------------------------------------------------------------------- /modules/jupyter/jupyter_image/notebook_image/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
# to build, run `gcloud builds submit --config cloudbuild.yaml .` in directory
# Before building, append the new image tag after the trailing ':' in both image
# references below (see README.md in this directory for the tag pattern).
steps:
# Pull the base image named in this directory's Dockerfile.
- name: 'gcr.io/cloud-builders/docker'
  args: [ 'pull', 'docker.io/jupyter/tensorflow-notebook:python-3.10' ]
# Build the notebook image from the current directory.
- name: 'gcr.io/cloud-builders/docker'
  args: [ 'build', '-t', 'us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image:', '.' ]
# Image pushed to the registry when the build completes.
images:
- 'us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image:'
import sys

import yaml

# Choose which JupyterHub config file to patch: the Autopilot variant is used
# only when the script receives exactly one argument and that argument is "true".
wants_autopilot = len(sys.argv) == 2 and sys.argv[1] == "true"
if wants_autopilot:
    config_file = "../jupyter_config/config-selfauth-autopilot.yaml"
else:
    config_file = "../jupyter_config/config-selfauth.yaml"

# Load the whole YAML document into memory.
with open(config_file, "r") as source:
    settings = yaml.safe_load(source)

# Replace the DummyAuthenticator password with the fixed value "dummy".
settings["hub"]["config"]["DummyAuthenticator"]["password"] = "dummy"

# Write the modified document back over the same file.
with open(config_file, "w") as target:
    yaml.dump(settings, target)
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | google-beta = { 21 | source = "hashicorp/google-beta" 22 | } 23 | helm = { 24 | source = "hashicorp/helm" 25 | version = "~> 2.8.0" 26 | } 27 | kubernetes = { 28 | source = "hashicorp/kubernetes" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /modules/kuberay-cluster/kuberay_image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rayproject/ray:2.9.3-py310-gpu 2 | COPY requirements.txt ./requirements.txt 3 | RUN pip install --no-cache-dir -r ./requirements.txt 4 | -------------------------------------------------------------------------------- /modules/kuberay-cluster/kuberay_image/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # to build, run `gcloud builds submit --config cloudbuild.yaml .` in directory 16 | steps: 17 | - name: 'gcr.io/cloud-builders/docker' 18 | args: [ 'pull', 'docker.io/rayproject/ray:2.9.3-py310-gpu' ] 19 | - name: 'gcr.io/cloud-builders/docker' 20 | args: [ 'build', '-t', '/', '.' 
] 21 | images: 22 | - '/' -------------------------------------------------------------------------------- /modules/kuberay-cluster/kuberay_image/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.1.9 2 | transformers==4.38.1 3 | sentence-transformers==2.5.1 4 | pyarrow 5 | datasets==2.18.0 6 | torch==2.0.1 7 | cloud-sql-python-connector[pg8000]==1.7.0 8 | SQLAlchemy==2.0.7 9 | huggingface_hub==0.21.3 -------------------------------------------------------------------------------- /modules/kuberay-cluster/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# IP of the first load-balancer ingress entry on the service read by
# data.kubernetes_service.example, or "" when Grafana is disabled or no address
# is available yet.
output "grafana_uri" {
  # try() returns "" whenever the lookup fails: a null status, a null
  # load_balancer, or an ingress list that is still empty. The previous nested
  # null checks covered the first two cases only and raised an index error on
  # an empty ingress list.
  value = var.enable_grafana_on_ray_dashboard ? try(data.kubernetes_service.example[0].status[0].load_balancer[0].ingress[0].ip, "") : ""
}
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Helm Chart 16 | resource "helm_release" "app-namespace" { 17 | name = "app-namespace" 18 | chart = "${path.module}/charts/namespace/" 19 | namespace = var.namespace 20 | create_namespace = var.create_namespace 21 | } 22 | -------------------------------------------------------------------------------- /modules/kubernetes-namespace/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "namespace" { 16 | value = var.namespace 17 | } -------------------------------------------------------------------------------- /modules/kubernetes-namespace/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
variable "namespace" {
  type        = string
  description = "Kubernetes namespace where resources are deployed"
}

# Passed straight through to the helm_release `create_namespace` argument in
# this module's main.tf.
variable "create_namespace" {
  type        = bool
  description = "Whether the Helm release should create the namespace if it does not already exist"
}
2 | 3 | ## Installation via bash and helm 4 | 5 | Ensure the following environment variables are set: 6 | - PROJECT_ID: GKE Project ID 7 | - (optional) PROMETHEUS_HELM_VALUES_FILE: Values file to pass when deploying `prometheus-community/prometheus-adapter` chart 8 | 9 | ``` 10 | curl https://raw.githubusercontent.com/GoogleCloudPlatform/prometheus-engine/v0.10.0/examples/frontend.yaml | envsubst | kubectl apply -f - 11 | 12 | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 13 | helm repo update 14 | 15 | if [ -z "$PROMETHEUS_HELM_VALUES_FILE" ]; then 16 | helm install example-release prometheus-community/prometheus-adapter 17 | else 18 | helm install example-release prometheus-community/prometheus-adapter -f "$PROMETHEUS_HELM_VALUES_FILE" 19 | fi 20 | ``` 21 | -------------------------------------------------------------------------------- /ray-on-gke/README.md: -------------------------------------------------------------------------------- 1 | # Running Ray on GKE 2 | 3 | >[!WARNING] 4 | >The files for the Ray on GKE Guide have been moved to the [AI-on-GKE/quick-start-guides](https://github.com/ai-on-gke/quick-start-guides) repository. For more information, please refer to the [Ray on GKE](https://gke-ai-labs.dev/docs/blueprints/ray-on-gke).
5 | -------------------------------------------------------------------------------- /ray-on-gke/examples/tfvars: -------------------------------------------------------------------------------- 1 | ../../applications/ray/tfvars_examples -------------------------------------------------------------------------------- /ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-cluster-on-gke.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-cluster-on-gke.png -------------------------------------------------------------------------------- /ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-head-resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-head-resources.png -------------------------------------------------------------------------------- /ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-worker-resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/ray-on-gke/guides/raytrain-with-gcsfusecsi/images/ray-worker-resources.png -------------------------------------------------------------------------------- /ray-on-gke/tpu/kuberay-tpu-webhook/README.md: -------------------------------------------------------------------------------- 1 | # Running KubeRay with TPUs on GKE 2 | 3 | >[!WARNING] 4 | >The files for the KubeRay TPU webhook have been moved to the [AI-on-GKE/kuberay-tpu-webhook](https://github.com/ai-on-gke/kuberay-tpu-webhook) repository. 
#!/bin/bash
#
# Wait until at least one pod exists in a Kubernetes namespace.
#
# Usage: wait_for_pods.sh <namespace> [timeout_seconds]
#   namespace        Namespace polled with `kubectl get pods`.
#   timeout_seconds  Maximum time to wait; defaults to 300 when omitted.
#
# Exits 0 as soon as a pod is listed, 1 on bad arguments or on timeout.

# Define the namespace to watch
NAMESPACE=$1
# An omitted timeout used to compare as 0 and fail right after the first poll,
# so fall back to a default instead.
TIMEOUT=${2:-300}
POLL_INTERVAL=30
START_TIME=$(date +%s)

# Check if namespace is provided
if [[ -z "$NAMESPACE" ]]; then
  echo "Usage: $0 <namespace> [timeout_seconds]" >&2
  exit 1
fi

# Reject non-numeric timeouts up front instead of failing inside the loop.
if ! [[ "$TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "Invalid timeout '$TIMEOUT': expected a whole number of seconds." >&2
  exit 1
fi
# Force base 10 so values with leading zeros (e.g. 080) are not read as octal.
TIMEOUT=$((10#$TIMEOUT))

echo "Waiting for any pod to exist in the namespace '$NAMESPACE' (timeout: ${TIMEOUT}s)..."

# Loop until a pod exists in the namespace or timeout occurs
while true; do
  # stderr is discarded, so a failing kubectl call simply counts as zero pods.
  POD_COUNT=$(kubectl get pods -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l)

  if [[ "$POD_COUNT" -gt 0 ]]; then
    echo "Pod(s) found in the namespace '$NAMESPACE'."
    break
  fi

  CURRENT_TIME=$(date +%s)
  ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

  if [[ "$ELAPSED_TIME" -ge "$TIMEOUT" ]]; then
    echo "Timeout reached after ${TIMEOUT} seconds. No pods found in the namespace '$NAMESPACE'."
    exit 1
  fi

  echo "No pods found yet in the namespace '$NAMESPACE'. Checking again in ${POLL_INTERVAL} seconds..."
  sleep "$POLL_INTERVAL"
done
See go/gke-shipshape#capabilities for more details", 8 | "policyName": "capabilities", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "DaemonSet", 12 | "name": "continuous-image-puller", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/continuous-image-puller/readonlyrootfs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "pause" 6 | }, 7 | "message": "container \"pause\" in DaemonSet \"continuous-image-puller\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", 8 | "policyName": "readonlyrootfs", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "DaemonSet", 12 | "name": "continuous-image-puller", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/continuous-image-puller/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in DaemonSet \"continuous-image-puller\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "DaemonSet", 8 | "name": "continuous-image-puller", 9 | "namespace": ".*", 10 | "version": "v1" 11 | } 12 | } 13 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/capabilities.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub" 6 | }, 7 | "message": "container \"hub\" in Deployment \"hub\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details", 8 | "policyName": "capabilities", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "hub", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/distroless.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub", 6 | "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202" 7 | }, 8 | "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" built from non-distroless base image \"Debian GNU/Linux 11 (bullseye)\". 
See: go/gke-distroless for more details", 9 | "policyName": "distroless", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "hub", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/imagedigest.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub", 6 | "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202" 7 | }, 8 | "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e", 9 | "policyName": "imagedigest", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "hub", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/imagefreshness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub", 6 | "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202" 7 | }, 8 | "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" that does not have a valid digest.", 9 | "policyName": "imagefreshness", 10 | 
"resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "hub", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/imagepath.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub", 6 | "image": "us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202" 7 | }, 8 | "message": "container \"hub\" in Deployment \"hub\" has an image \"us-docker.pkg.dev/ai-on-gke/jupyterhub-authentication-class/jupyter-auth-class:sample-public-image-1741648202\" with an invalid path. See go/gke-shipshape#imagepath for valid image paths.", 9 | "policyName": "imagepath", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "hub", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/readonlyrootfs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "hub" 6 | }, 7 | "message": "container \"hub\" in Deployment \"hub\" does not set readOnlyRootFilesystem: true in its securityContext. 
This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", 8 | "policyName": "readonlyrootfs", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "hub", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/hub/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in Deployment \"hub\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "Deployment", 8 | "name": "hub", 9 | "namespace": ".*", 10 | "version": "v1" 11 | } 12 | } 13 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/mistral-7b-instruct/allowprivilegeescalation.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "mistral-7b-instruct" 6 | }, 7 | "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not set allowPrivilegeEscalation: false in its securityContext. 
See go/gke-shipshape#allowprivilegeescalation for more details", 8 | "policyName": "allowprivilegeescalation", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "mistral-7b-instruct", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/mistral-7b-instruct/capabilities.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "mistral-7b-instruct" 6 | }, 7 | "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details", 8 | "policyName": "capabilities", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "mistral-7b-instruct", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/mistral-7b-instruct/readonlyrootfs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "mistral-7b-instruct" 6 | }, 7 | "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" does not set readOnlyRootFilesystem: true in its securityContext. 
This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", 8 | "policyName": "readonlyrootfs", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "mistral-7b-instruct", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/mistral-7b-instruct/rootless.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "mistral-7b-instruct" 6 | }, 7 | "message": "container \"mistral-7b-instruct\" in Deployment \"mistral-7b-instruct\" is running as root. Update the container to run as non-root. See go/gke-shipshape#rootless for more details", 8 | "policyName": "rootless", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "mistral-7b-instruct", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/mistral-7b-instruct/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in Deployment \"mistral-7b-instruct\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "Deployment", 8 | "name": "mistral-7b-instruct", 9 | "namespace": ".*", 10 | "version": "v1" 11 | } 12 | } 13 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/capabilities.json: -------------------------------------------------------------------------------- 1 | 
[ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp" 6 | }, 7 | "message": "container \"chp\" in Deployment \"proxy\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details", 8 | "policyName": "capabilities", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "proxy", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/distroless.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp", 6 | "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1" 7 | }, 8 | "message": "image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" could not be found on gcr.io", 9 | "policyName": "distroless", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "proxy", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/imagedigest.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp", 6 | "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1" 7 | }, 8 | "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e", 9 | "policyName": "imagedigest", 10 | "resourceKey": { 11 | 
"group": "apps", 12 | "kind": "Deployment", 13 | "name": "proxy", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/imagefreshness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp", 6 | "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1" 7 | }, 8 | "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" that does not have a valid digest.", 9 | "policyName": "imagefreshness", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "proxy", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/imagepath.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1", 6 | "containerName": "chp" 7 | }, 8 | "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with an invalid path. 
See go/gke-shipshape#imagepath for valid image paths.", 9 | "policyName": "imagepath", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "proxy", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/readonlyrootfs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp" 6 | }, 7 | "message": "container \"chp\" in Deployment \"proxy\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", 8 | "policyName": "readonlyrootfs", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "proxy", 13 | "namespace": ".*", 14 | "version": "v1" 15 | } 16 | } 17 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/sbom.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "chp", 6 | "image": "quay.io/jupyterhub/configurable-http-proxy:4.6.1" 7 | }, 8 | "message": "container \"chp\" in Deployment \"proxy\" has an image \"quay.io/jupyterhub/configurable-http-proxy:4.6.1\" with no digest specified. 
Unable to find digest from registry.", 9 | "policyName": "sbom", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "proxy", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/proxy/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in Deployment \"proxy\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "Deployment", 8 | "name": "proxy", 9 | "namespace": ".*", 10 | "version": "v1" 11 | } 12 | } 13 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/rag-frontend/distroless.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "rag-frontend", 6 | "image": "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/frontend@sha256:2b14a3a95f433cc394087ba0d6376d160d8080b62f485f1a119c52b8a6119368" 7 | }, 8 | "message": "container \"rag-frontend\" in Deployment \"rag-frontend\" has an image \"us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/frontend@sha256:2b14a3a95f433cc394087ba0d6376d160d8080b62f485f1a119c52b8a6119368\" built from non-distroless base image \"Debian GNU/Linux 12 (bookworm)\". 
See: go/gke-distroless for more details", 9 | "policyName": "distroless", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "rag-frontend", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/rag-frontend/imagedigest.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "image": "gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0", 6 | "containerName": "cloud-sql-proxy" 7 | }, 8 | "message": "container \"cloud-sql-proxy\" in Deployment \"rag-frontend\" has an image \"gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e", 9 | "policyName": "imagedigest", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "rag-frontend", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/rag-frontend/imagefreshness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "cloud-sql-proxy", 6 | "image": "gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0" 7 | }, 8 | "message": "container \"cloud-sql-proxy\" in Deployment \"rag-frontend\" has an image \"gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.8.0\" that does not have a valid digest.", 9 | "policyName": "imagefreshness", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "rag-frontend", 14 | "namespace": ".*", 15 | "version": "v1" 16 | } 17 | } 18 
| ] -------------------------------------------------------------------------------- /security_test/allowlist/category/cluster/rag-frontend/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in Deployment \"rag-frontend\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "Deployment", 8 | "name": "rag-frontend", 9 | "namespace": ".*", 10 | "version": "v1" 11 | } 12 | } 13 | ] -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/iap/defaultnamespace.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "Ingress \"iap-ingress\" is in the default namespace, which is not allowed.", 4 | "policyName": "defaultnamespace", 5 | "resourceKey": { 6 | "group": "networking.k8s.io", 7 | "kind": "Ingress", 8 | "name": "iap-ingress", 9 | "version": "v1" 10 | } 11 | }, 12 | { 13 | "message": "Secret \"iap-secret\" is in the default namespace, which is not allowed.", 14 | "policyName": "defaultnamespace", 15 | "resourceKey": { 16 | "kind": "Secret", 17 | "name": "iap-secret", 18 | "version": "v1" 19 | } 20 | } 21 | ] 22 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/allowprivilegeescalation.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook" 6 | }, 7 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set allowPrivilegeEscalation: false in its securityContext. 
See go/gke-shipshape#allowprivilegeescalation for more details", 8 | "policyName": "allowprivilegeescalation", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "kuberay-tpu-webhook", 13 | "namespace": "ray-system", 14 | "version": "v1" 15 | } 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/capabilities.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook" 6 | }, 7 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not drop all capabilities in its securityContext. See go/gke-shipshape#capabilities for more details", 8 | "policyName": "capabilities", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "kuberay-tpu-webhook", 13 | "namespace": "ray-system", 14 | "version": "v1" 15 | } 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/imagedigest.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook", 6 | "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0" 7 | }, 8 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with no digest; valid image format: image[:tag]@sha256:\u003cdigest\u003e", 9 | "policyName": "imagedigest", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": 
"Deployment", 13 | "name": "kuberay-tpu-webhook", 14 | "namespace": "ray-system", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/imagefreshness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook", 6 | "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0" 7 | }, 8 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" that does not have a valid digest.", 9 | "policyName": "imagefreshness", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "kuberay-tpu-webhook", 14 | "namespace": "ray-system", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/imagepath.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "image": "us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0", 6 | "containerName": "kuberay-tpu-webhook" 7 | }, 8 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" has an image \"us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/tpu-webhook:v1.2.1-gke.0\" with an invalid path. 
See go/gke-shipshape#imagepath for valid image paths.", 9 | "policyName": "imagepath", 10 | "resourceKey": { 11 | "group": "apps", 12 | "kind": "Deployment", 13 | "name": "kuberay-tpu-webhook", 14 | "namespace": "ray-system", 15 | "version": "v1" 16 | } 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/readonlyrootfs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook" 6 | }, 7 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" does not set readOnlyRootFilesystem: true in its securityContext. This setting is encouraged because it can prevent attackers from writing malicious binaries into runnable locations in the container filesystem.", 8 | "policyName": "readonlyrootfs", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "kuberay-tpu-webhook", 13 | "namespace": "ray-system", 14 | "version": "v1" 15 | } 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/rootless.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "details": { 4 | "@type": "type.googleapis.com/google.internal.kubernetes.security.validation.v1.ContainerDetails", 5 | "containerName": "kuberay-tpu-webhook" 6 | }, 7 | "message": "container \"kuberay-tpu-webhook\" in Deployment \"kuberay-tpu-webhook\" is running as root. Update the container to run as non-root. 
See go/gke-shipshape#rootless for more details", 8 | "policyName": "rootless", 9 | "resourceKey": { 10 | "group": "apps", 11 | "kind": "Deployment", 12 | "name": "kuberay-tpu-webhook", 13 | "namespace": "ray-system", 14 | "version": "v1" 15 | } 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /security_test/allowlist/category/helm/kuberay-tpu-webhook/seccompprofile.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "message": "pod in Deployment \"kuberay-tpu-webhook\" must set securityContext.seccompProfile.type to value RuntimeDefault", 4 | "policyName": "seccompprofile", 5 | "resourceKey": { 6 | "group": "apps", 7 | "kind": "Deployment", 8 | "name": "kuberay-tpu-webhook", 9 | "namespace": "ray-system", 10 | "version": "v1" 11 | } 12 | } 13 | ] 14 | -------------------------------------------------------------------------------- /slurm-on-gke/README.md: -------------------------------------------------------------------------------- 1 | # Slurm on GKE 2 | 3 | >[!WARNING] 4 | >The files for the Slurm on GKE example have been moved to the [AI-on-GKE/slurm-on-gke](https://github.com/ai-on-gke/slurm-on-gke) repository. For more information, please refer to the [Slurm on GKE](https://gke-ai-labs.dev/docs/blueprints/slurm-on-gke/). -------------------------------------------------------------------------------- /tools/dcgm-on-gke/README.md: -------------------------------------------------------------------------------- 1 | # DCGM on GKE 2 | 3 | >[!WARNING] 4 | >This tool (DCGM on GKE) is now deprecated and is no longer being maintained. 5 | > 6 | >The files for this tool have been removed from this repository and will not be migrated to the new AI-on-GKE GitHub organization. 
7 | -------------------------------------------------------------------------------- /tools/gke-disk-image-builder/README.md: -------------------------------------------------------------------------------- 1 | # GKE Disk Image Builder 2 | 3 | >[!WARNING] 4 | >The files for the GKE Disk Image Builder have been moved to the [AI-on-GKE/tools](https://github.com/ai-on-gke/tools/tree/main/gke-disk-image-builder) repository. 5 | -------------------------------------------------------------------------------- /tools/saxml-on-gke/README.md: -------------------------------------------------------------------------------- 1 | # SaxML on GKE 2 | 3 | >[!WARNING] 4 | >This tool (SaxML on GKE) is now deprecated and is no longer being maintained. 5 | > 6 | >The files for this tool have been removed from this repository and will not be migrated to the new AI-on-GKE GitHub organization. 7 | -------------------------------------------------------------------------------- /tpu-provisioner/.dockerignore: -------------------------------------------------------------------------------- 1 | # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file 2 | # Ignore build and test binaries. 
3 | bin/ 4 | testbin/ 5 | -------------------------------------------------------------------------------- /tpu-provisioner/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Binaries for programs and plugins 3 | *.exe 4 | *.exe~ 5 | *.dll 6 | *.so 7 | *.dylib 8 | bin 9 | testbin/* 10 | Dockerfile.cross 11 | 12 | # Test binary, build with `go test -c` 13 | *.test 14 | 15 | # Output of the go coverage tool, specifically when used with LiteIDE 16 | *.out 17 | 18 | # Kubernetes Generated files - skip generated files, except for vendored files 19 | 20 | !vendor/**/zz_generated.* 21 | 22 | # editor and IDE paraphernalia 23 | .idea 24 | *.swp 25 | *.swo 26 | *~ 27 | -------------------------------------------------------------------------------- /tpu-provisioner/PROJECT: -------------------------------------------------------------------------------- 1 | # Code generated by tool. DO NOT EDIT. 2 | # This file is used to track the info used to scaffold your project 3 | # and allow the plugins to work properly.
4 | # More info: https://book.kubebuilder.io/reference/project-config.html 5 | domain: google.com 6 | layout: 7 | - go.kubebuilder.io/v4-alpha 8 | projectName: tpu-provisioner 9 | repo: github.com/GoogleCloudPlatform/ai-on-gke/tpu-provisioner 10 | resources: [] 11 | version: "3" 12 | -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/.gitignore: -------------------------------------------------------------------------------- 1 | # don't add certificates 2 | certificates/*.crt 3 | certificates/*.key 4 | 5 | __pycache__/ 6 | .pytest_cache/ -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-buster 2 | WORKDIR /webhook 3 | COPY requirements.txt /webhook 4 | COPY admission_controller.py /webhook 5 | RUN pip install --no-cache-dir --upgrade -r /webhook/requirements.txt 6 | CMD ["uvicorn", "admission_controller:app", "--host", "0.0.0.0", "--port", "5000","--ssl-keyfile=/certs/tls.key", "--ssl-certfile=/certs/tls.crt"] 7 | -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/__init__.py -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/certificates/README.md: -------------------------------------------------------------------------------- 1 | Two files are required in this directory: 2 | 3 | 1. `certificate.crt` 4 | 2. `private.key` 5 | 6 | 7 | These are used to configure TLS for network communication to/from the webhook. 
-------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/requirements.txt -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/skaffold.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: skaffold/v4beta11 2 | kind: Config 3 | metadata: 4 | name: admission-controller 5 | build: 6 | local: {} 7 | artifacts: 8 | - image: example.com/tpu-provisioner/admission-controller 9 | docker: 10 | dockerfile: Dockerfile 11 | manifests: 12 | rawYaml: 13 | - manifests/manifest.yaml 14 | -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/admission_controller/test/__init__.py -------------------------------------------------------------------------------- /tpu-provisioner/admission_controller/test/e2e/manifests/test-nonjobset-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: test-nonjobset-job 5 | spec: 6 | template: 7 | spec: 8 | containers: 9 | - name: sleeper 10 | image: ubuntu 11 | command: ["sleep", "10000"] 12 | restartPolicy: Never 13 | backoffLimit: 0 -------------------------------------------------------------------------------- /tpu-provisioner/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | 
# 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | - id: 'tpu provisioner tests' 17 | name: 'golang:1.23' 18 | dir: /workspace/tpu-provisioner 19 | entrypoint: 'bash' 20 | args: 21 | - '-c' 22 | - | 23 | set -e 24 | make test 25 | allowFailure: false 26 | 27 | options: 28 | substitutionOption: 'ALLOW_LOOSE' 29 | machineType: 'E2_HIGHCPU_8' 30 | timeout: 600s 31 | -------------------------------------------------------------------------------- /tpu-provisioner/config/default/manager_config_patch.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: controller-manager 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: manager 11 | -------------------------------------------------------------------------------- /tpu-provisioner/config/manager/configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: manager 5 | data: {} 6 | -------------------------------------------------------------------------------- /tpu-provisioner/config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | - configmap.yaml 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | 
-------------------------------------------------------------------------------- /tpu-provisioner/config/prometheus/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - monitor.yaml 3 | -------------------------------------------------------------------------------- /tpu-provisioner/config/prometheus/monitor.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Prometheus Monitor Service (Metrics) 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: ServiceMonitor 5 | metadata: 6 | labels: 7 | control-plane: controller-manager 8 | app.kubernetes.io/name: servicemonitor 9 | app.kubernetes.io/instance: controller-manager-metrics-monitor 10 | app.kubernetes.io/component: metrics 11 | app.kubernetes.io/created-by: tpu-provisioner 12 | app.kubernetes.io/part-of: tpu-provisioner 13 | app.kubernetes.io/managed-by: kustomize 14 | name: controller-manager-metrics-monitor 15 | namespace: system 16 | spec: 17 | endpoints: 18 | - path: /metrics 19 | port: https 20 | scheme: https 21 | bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 22 | tlsConfig: 23 | insecureSkipVerify: true 24 | selector: 25 | matchLabels: 26 | control-plane: controller-manager 27 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/auth_proxy_client_clusterrole.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: metrics-reader 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: metrics-reader 12 | rules: 13 | - nonResourceURLs: 14 | - "/metrics" 15 | verbs: 16 | - get 17 | 
-------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/auth_proxy_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrole 6 | app.kubernetes.io/instance: proxy-role 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-role 12 | rules: 13 | - apiGroups: 14 | - authentication.k8s.io 15 | resources: 16 | - tokenreviews 17 | verbs: 18 | - create 19 | - apiGroups: 20 | - authorization.k8s.io 21 | resources: 22 | - subjectaccessreviews 23 | verbs: 24 | - create 25 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/auth_proxy_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: proxy-rolebinding 7 | app.kubernetes.io/component: kube-rbac-proxy 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: proxy-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: proxy-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/auth_proxy_service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | control-plane: controller-manager 
6 | app.kubernetes.io/name: service 7 | app.kubernetes.io/instance: controller-manager-metrics-service 8 | app.kubernetes.io/component: kube-rbac-proxy 9 | app.kubernetes.io/created-by: tpu-provisioner 10 | app.kubernetes.io/part-of: tpu-provisioner 11 | app.kubernetes.io/managed-by: kustomize 12 | name: controller-manager-metrics-service 13 | namespace: system 14 | spec: 15 | ports: 16 | - name: https 17 | port: 8443 18 | protocol: TCP 19 | targetPort: https 20 | selector: 21 | control-plane: controller-manager 22 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | # All RBAC will be applied under this service account in 3 | # the deployment namespace. You may comment out this resource 4 | # if your manager will use a service account that exists at 5 | # runtime. Be sure to update RoleBinding and ClusterRoleBinding 6 | # subjects if changing service account names. 7 | - service_account.yaml 8 | - role.yaml 9 | - role_binding.yaml 10 | - leader_election_role.yaml 11 | - leader_election_role_binding.yaml 12 | # Comment the following 4 lines if you want to disable 13 | # the auth proxy (https://github.com/brancz/kube-rbac-proxy) 14 | # which protects your /metrics endpoint. 15 | - auth_proxy_service.yaml 16 | - auth_proxy_role.yaml 17 | - auth_proxy_role_binding.yaml 18 | - auth_proxy_client_clusterrole.yaml 19 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- 1 | # permissions to do leader election. 
2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: Role 4 | metadata: 5 | labels: 6 | app.kubernetes.io/name: role 7 | app.kubernetes.io/instance: leader-election-role 8 | app.kubernetes.io/component: rbac 9 | app.kubernetes.io/created-by: tpu-provisioner 10 | app.kubernetes.io/part-of: tpu-provisioner 11 | app.kubernetes.io/managed-by: kustomize 12 | name: leader-election-role 13 | rules: 14 | - apiGroups: 15 | - "" 16 | resources: 17 | - configmaps 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | - create 23 | - update 24 | - patch 25 | - delete 26 | - apiGroups: 27 | - coordination.k8s.io 28 | resources: 29 | - leases 30 | verbs: 31 | - get 32 | - list 33 | - watch 34 | - create 35 | - update 36 | - patch 37 | - delete 38 | - apiGroups: 39 | - "" 40 | resources: 41 | - events 42 | verbs: 43 | - create 44 | - patch 45 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: rolebinding 6 | app.kubernetes.io/instance: leader-election-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: leader-election-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: Role 15 | name: leader-election-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | labels: 5 | 
app.kubernetes.io/name: clusterrolebinding 6 | app.kubernetes.io/instance: manager-rolebinding 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: manager-rolebinding 12 | roleRef: 13 | apiGroup: rbac.authorization.k8s.io 14 | kind: ClusterRole 15 | name: manager-role 16 | subjects: 17 | - kind: ServiceAccount 18 | name: controller-manager 19 | namespace: system 20 | -------------------------------------------------------------------------------- /tpu-provisioner/config/rbac/service_account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | labels: 5 | app.kubernetes.io/name: serviceaccount 6 | app.kubernetes.io/instance: controller-manager-sa 7 | app.kubernetes.io/component: rbac 8 | app.kubernetes.io/created-by: tpu-provisioner 9 | app.kubernetes.io/part-of: tpu-provisioner 10 | app.kubernetes.io/managed-by: kustomize 11 | name: controller-manager 12 | namespace: system 13 | -------------------------------------------------------------------------------- /tpu-provisioner/docs/cleanup.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/docs/cleanup.excalidraw.png -------------------------------------------------------------------------------- /tpu-provisioner/docs/provisioning.excalidraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tpu-provisioner/docs/provisioning.excalidraw.png -------------------------------------------------------------------------------- /tpu-provisioner/internal/auth/gcp/README.md: 
-------------------------------------------------------------------------------- 1 | # GCP Auth (v1.26+) 2 | 3 | See: https://github.com/kubernetes/cloud-provider-gcp/tree/master/pkg/clientauthplugin -------------------------------------------------------------------------------- /tpu-provisioner/internal/cloud/mock.go: -------------------------------------------------------------------------------- 1 | package cloud 2 | 3 | import ( 4 | corev1 "k8s.io/api/core/v1" 5 | "sigs.k8s.io/controller-runtime/pkg/client" 6 | ) 7 | 8 | var _ Provider = &Mock{} 9 | 10 | // Mock is useful for local development or debugging purposes to understand what 11 | // the controller would do without it doing anything. 12 | type Mock struct{} 13 | 14 | // TODO: Find a better mock node pool label key. 15 | func (m *Mock) NodePoolLabelKey() string { return "kubernetes.io/os" } 16 | func (m *Mock) EnsureNodePoolForPod(*corev1.Pod, string) error { return nil } 17 | func (m *Mock) DeleteNodePoolForNode(*corev1.Node, string) error { return nil } 18 | func (m *Mock) DeleteNodePool(string, client.Object, string) error { return nil } 19 | func (m *Mock) ListNodePools() ([]NodePoolRef, error) { return nil, nil } 20 | -------------------------------------------------------------------------------- /tutorials-and-examples/flyte/README.md: -------------------------------------------------------------------------------- 1 | # Running Flyte on GKE 2 | 3 | >[!WARNING] 4 | >The files for the Flyte in GKE cluster Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Flyte in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/flyte/). 
-------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/README.md: -------------------------------------------------------------------------------- 1 | # E2E GenAI application with Langchain, Ray, Flask API backend, React frontend 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 7 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/backend_ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/backend_ip.png -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_app.png -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_ip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/frontend_ip.png -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/open_jupyter.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/open_jupyter.png -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/backend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | # Set working directory 4 | WORKDIR /app 5 | 6 | # Copy requirements 7 | COPY requirements.txt requirements.txt 8 | 9 | # Install dependencies 10 | RUN pip install --require-hashes --no-cache-dir -r requirements.txt 11 | 12 | # Copy all files 13 | COPY . . 14 | 15 | # Expose port for Flask 16 | EXPOSE 5000 17 | 18 | # Run main.py 19 | CMD ["python", "main.py"] 20 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/backend/requirements.in: -------------------------------------------------------------------------------- 1 | ray==2.43.0 2 | ray[serve] 3 | requests 4 | transformers 5 | langchain 6 | torch 7 | flask 8 | Flask-CORS 9 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production build output 5 | /dist 6 | 7 | # IDEs and editors 8 | /.idea 9 | .vscode/ 10 | *.swp 11 | *.swo 12 | 13 | # OS generated 14 | .DS_Store 15 | Thumbs.db 16 | 17 | # TypeScript 18 | *.tsbuildinfo 19 | 20 | # Log files 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | 25 | # Temporary files 26 | *.tmp 27 | *.tmp.json 28 | 29 | # Debug logs from ESLint, stylelint etc. 
30 | *.log 31 | 32 | # Environment variables 33 | .env 34 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a Node.js image 2 | FROM node:16-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Copy the build files from the local system to the container 8 | COPY . ./ 9 | RUN npm install && \ 10 | npm run build && \ 11 | npm install serve && \ 12 | npm cache clean --force 13 | 14 | # Command to run the application 15 | CMD ["npx", "serve", "-s", "dist", "-l", "3000"] 16 | 17 | # Expose the port the app runs on 18 | EXPOSE 3000 19 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "webpack serve --open", 9 | "build": "webpack" 10 | }, 11 | "keywords": [], 12 | "devDependencies": { 13 | "@types/faker": "^5.5.3", 14 | "@types/react": "^18.2.23", 15 | "@types/react-dom": "^18.2.8", 16 | "css-loader": "^6.8.1", 17 | "html-webpack-plugin": "^5.5.3", 18 | "postcss-loader": "^7.3.3", 19 | "react": "^18.2.0", 20 | "react-dom": "^18.2.0", 21 | "style-loader": "^3.3.3", 22 | "ts-loader": "^9.4.4", 23 | "typescript": "^5.2.2", 24 | "webpack": "^5.88.2", 25 | "webpack-cli": "^5.1.4", 26 | "webpack-dev-server": "^4.15.1" 27 | }, 28 | "dependencies": { 29 | "bootstrap": "^5.3.2", 30 | "faker": "^5.5.3", 31 | "reactstrap": "^9.2.0" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- 
/tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/src/index.html: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | React App 21 | 22 | 23 |
24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/e2e-genai-langchain-app/src/frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "./dist/", 4 | "sourceMap": true, 5 | "noImplicitAny": true, 6 | "module": "commonjs", 7 | "target": "es6", 8 | "jsx": "react", 9 | "esModuleInterop": true, 10 | "allowSyntheticDefaultImports": true, 11 | "moduleResolution": "node", 12 | "typeRoots": ["./node_modules/@types", "./types"] 13 | }, 14 | "include": [ 15 | "./src/**/*" 16 | ] 17 | } -------------------------------------------------------------------------------- /tutorials-and-examples/genAI-LLM/finetuning-gemma-2b-on-l4/README.md: -------------------------------------------------------------------------------- 1 | # Finetuning Gemma 3-1B-it on L4 2 | 3 | >[!WARNING] 4 | >The files for the Finetuning Gemma 3-1B-it on L4 guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Finetuning Gemma 3-1B-it on L4](https://gke-ai-labs.dev/docs/tutorials/finetuning-gemma-3-1b-it-on-l4/). -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/a100-jax/README.md: -------------------------------------------------------------------------------- 1 | # JAX 'Hello World' on GKE + A100-80GB 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. 
-------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/online-serving-single-gpu/README.md: -------------------------------------------------------------------------------- 1 | # Serve a model with a GPU on GKE Autopilot 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/README.md: -------------------------------------------------------------------------------- 1 | # Train a model with GPUs on GKE Standard mode 2 | 3 | Please follow the Quick Start at https://cloud.google.com/kubernetes-engine/docs/quickstarts/train-model-gpus-standard 4 | -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/0.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/1.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/2.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/3.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/4.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/5.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/6.png 
-------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/7.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/8.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/gpu-examples/training-single-gpu/data/mnist_predict/9.png -------------------------------------------------------------------------------- /tutorials-and-examples/gpu-examples/training-single-gpu/src/tensorflow-mnist-example/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-datasets -------------------------------------------------------------------------------- /tutorials-and-examples/hf-tgi/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Text Generation Inference (TGI) 2 | 3 | >[!WARNING] 4 | >The files for the Hugging Face TGI example have been moved to the 
[AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Hugging Face TGI tutorial](https://gke-ai-labs.dev/docs/tutorials/hf-tgi/). -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/checkpoints/README.md: -------------------------------------------------------------------------------- 1 | # Creating Inference Checkpoints 2 | 3 | >[!WARNING] 4 | >The files for the Creating Inference Checkpoints on GKE example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Creating Inference Checkpoints](https://gke-ai-labs.dev/docs/tutorials/inference-servers/checkpoints/). -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/README.md: -------------------------------------------------------------------------------- 1 | # Serve an LLM using TPUs on GKE with JetStream 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. Methods for deploying LLMs on GKE with TPUs may have changed. 5 | > 6 | >Please refer to the **official Google Cloud documentation** for the latest practices: 7 | >[Serve LLMs on GKE with TPUs using JetStream and PyTorch](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-llm-tpu-jetstream-pytorch) 8 | -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/http-server/Dockerfile: -------------------------------------------------------------------------------- 1 | # Ubuntu:22.04 2 | # Use Ubuntu 22.04 from Docker Hub. 
3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04 4 | FROM ubuntu:22.04 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV JETSTREAM_VERSION=v0.2.2 8 | 9 | RUN apt -y update && apt install -y --no-install-recommends \ 10 | ca-certificates \ 11 | git \ 12 | python3.10 \ 13 | python3-pip 14 | 15 | RUN update-alternatives --install \ 16 | /usr/bin/python3 python3 /usr/bin/python3.10 1 17 | 18 | RUN git clone https://github.com/google/JetStream.git && \ 19 | cd /JetStream && \ 20 | git checkout ${JETSTREAM_VERSION} && \ 21 | pip install -e . 22 | 23 | RUN pip3 install uvicorn 24 | RUN pip3 install fastapi 25 | RUN pip3 install pydantic 26 | ENV PYTHONDONTWRITEBYTECODE=1 27 | 28 | COPY http_server.py /httpserver/ 29 | WORKDIR /httpserver 30 | 31 | CMD ["uvicorn", "http_server:app", "--host=0.0.0.0", "--port=8000"] -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/maxengine-server/Dockerfile: -------------------------------------------------------------------------------- 1 | # Ubuntu:22.04 2 | # Use Ubuntu 22.04 from Docker Hub. 
3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04 4 | FROM ubuntu:22.04 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV MAXTEXT_VERSION=jetstream-v0.2.2 8 | 9 | RUN apt -y update && apt install -y --no-install-recommends \ 10 | ca-certificates \ 11 | git \ 12 | python3.10 \ 13 | python3-pip 14 | 15 | RUN update-alternatives --install \ 16 | /usr/bin/python3 python3 /usr/bin/python3.10 1 17 | 18 | RUN git clone https://github.com/google/maxtext.git 19 | 20 | RUN cd maxtext/ && \ 21 | git checkout ${MAXTEXT_VERSION} && \ 22 | bash setup.sh 23 | 24 | COPY maxengine_server_entrypoint.sh /usr/bin/ 25 | 26 | RUN chmod +x /usr/bin/maxengine_server_entrypoint.sh 27 | 28 | ENTRYPOINT ["/usr/bin/maxengine_server_entrypoint.sh"] 29 | -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/maxengine-server/maxengine_server_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Start maxengine_server.py with base.yml, forwarding every container argument unchanged ("$@" is quoted so arguments containing spaces or glob characters are not re-split). 3 | cd /maxtext 4 | python3 MaxText/maxengine_server.py \ 5 | MaxText/configs/base.yml "$@" -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/checkpoint-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: data-loader-7b 5 | spec: 6 | ttlSecondsAfterFinished: 30 7 | template: 8 | spec: 9 | restartPolicy: Never 10 | containers: 11 | - name: inference-checkpoint 12 | image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.2 13 | args: 14 | - -b=BUCKET_NAME 15 | - -m=google/gemma/maxtext/7b-it/2 16 | volumeMounts: 17 | - mountPath: "/kaggle/" 18 | name: kaggle-credentials 19 | readOnly: true 20 | resources: 21 | requests: 22 | google.com/tpu: 8 23 | limits: 24 | google.com/tpu: 8 25 | nodeSelector: 26 | 
cloud.google.com/gke-tpu-topology: 2x4 27 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 28 | volumes: 29 | - name: kaggle-credentials 30 | secret: 31 | defaultMode: 0400 32 | secretName: kaggle-secret -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2024 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | module "maxengine" { 18 | count = 1 19 | source = "../../../../../../modules/jetstream-maxtext-deployment" 20 | cluster_name = var.cluster_name 21 | project_id = var.project_id 22 | maxengine_deployment_settings = var.maxengine_deployment_settings 23 | hpa_config = var.hpa_config 24 | } -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars: -------------------------------------------------------------------------------- 1 | maxengine_deployment_settings = { 2 | metrics = { 3 | server = { 4 | port = 9100 5 | scrape_interval : 10 6 | } 7 | } 8 | 9 | accelerator_selectors = { 10 | topology = "2x4" 11 | accelerator = "tpu-v5-lite-podslice" 12 | chip_count : 8 13 | } 14 | } 15 | 16 | # Demonstrating autoscaling with jetstream_prefill_backlog_size, change as desired. 17 | # For jetstream_prefill_backlog_size. (experiment with this to determine optimal values). 18 | 19 | # hpa_config = { 20 | # metrics_adapter = "prometheus-adapter" 21 | # max_replicas = 5 22 | # min_replicas = 1 23 | # rules = [{ 24 | # target_query = "jetstream_prefill_backlog_size" 25 | # average_value_target = 5 26 | # }] 27 | # } -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_providers { 17 | google = { 18 | source = "hashicorp/google" 19 | } 20 | kubernetes = { 21 | source = "hashicorp/kubernetes" 22 | } 23 | kubectl = { 24 | source = "hashicorp/kubectl" 25 | } 26 | helm = { 27 | source = "hashicorp/helm" 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/versions_override.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_providers { 17 | kubectl = { 18 | source = "gavinbunney/kubectl" 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/Dockerfile: -------------------------------------------------------------------------------- 1 | # Ubuntu:22.04 2 | # Use Ubuntu 22.04 from Docker Hub. 3 | # https://hub.docker.com/_/ubuntu/tags?page=1&name=22.04 4 | FROM ubuntu:22.04 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV PYTORCH_JETSTREAM_VERSION=jetstream-v0.2.3 8 | 9 | RUN apt-get -y update && apt-get install -y --no-install-recommends \ 10 | ca-certificates \ 11 | git \ 12 | python3.10 \ 13 | python3-pip 14 | 15 | RUN python3 -m pip install --upgrade pip 16 | 17 | RUN update-alternatives --install \ 18 | /usr/bin/python3 python3 /usr/bin/python3.10 1 19 | 20 | RUN git clone https://github.com/google/jetstream-pytorch.git && \ 21 | cd /jetstream-pytorch && \ 22 | git checkout ${PYTORCH_JETSTREAM_VERSION} && \ 23 | bash install_everything.sh 24 | 25 | COPY jetstream_pytorch_server_entrypoint.sh /usr/bin/ 26 | 27 | RUN chmod +x /usr/bin/jetstream_pytorch_server_entrypoint.sh 28 | 29 | ENTRYPOINT ["/usr/bin/jetstream_pytorch_server_entrypoint.sh"] -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/pytorch/jetstream-pytorch-server/jetstream_pytorch_server_entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run from the repository cloned by the Dockerfile; abort if it is missing. 3 | cd /jetstream-pytorch || exit 1 4 | # exec replaces the shell so the server receives container signals directly; "$@" keeps each argument intact. 5 | exec python3 -m run_server "$@" -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/jetstream/pytorch/single-host-inference/storage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: 
checkpoint-pv 5 | spec: 6 | capacity: 7 | storage: 100G 8 | accessModes: 9 | - ReadWriteOnce 10 | gcePersistentDisk: 11 | pdName: jetstream-pytorch-ckpt 12 | fsType: ext4 13 | --- 14 | apiVersion: v1 15 | kind: PersistentVolumeClaim 16 | metadata: 17 | name: checkpoint-pvc 18 | spec: 19 | storageClassName: "" 20 | volumeName: checkpoint-pv 21 | accessModes: 22 | - ReadWriteOnce 23 | resources: 24 | requests: 25 | storage: 100G -------------------------------------------------------------------------------- /tutorials-and-examples/inference-servers/maxdiffusion/README.md: -------------------------------------------------------------------------------- 1 | # High-performance diffusion model inference on GKE and TPU using MaxDiffusion 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. Methods for deploying diffusion model inference on GKE and TPU using MaxDiffusion may have changed. 5 | > 6 | >Please refer to the **official Google Cloud documentation** for the latest practices: 7 | >[Serve Stable Diffusion XL (SDXL) using TPUs on GKE with MaxDiffusion](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-sdxl-tpu) -------------------------------------------------------------------------------- /tutorials-and-examples/kserve/README.md: -------------------------------------------------------------------------------- 1 | # KServe on GKE Autopilot 2 | 3 | >[!WARNING] 4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [KServe on GKE Autopilot](https://gke-ai-labs.dev/docs/tutorials/inference-servers/kserve/). 
5 | -------------------------------------------------------------------------------- /tutorials-and-examples/langchain-chatbot/README.md: -------------------------------------------------------------------------------- 1 | # Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP 2 | 3 | >[!WARNING] 4 | >The files for the Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Deploying a Persistent Chatbot on Google Cloud Platform with LangChain, Streamlit, and IAP](https://gke-ai-labs.dev/docs/tutorials/langchain-chatbot/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/llamaindex/rag/README.md: -------------------------------------------------------------------------------- 1 | # Llamaindex in GKE cluster 2 | 3 | >[!WARNING] 4 | >The files for the Llamaindex in GKE cluster Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Llamaindex in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/llamaindex/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/metaflow/README.md: -------------------------------------------------------------------------------- 1 | # Fine-Tuning Gemma 2-9B on GKE using Metaflow and Argo Workflows 2 | 3 | >[!WARNING] 4 | >The files for the Metaflow in GKE cluster Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Metaflow in GKE cluster](https://gke-ai-labs.dev/docs/tutorials/metaflow/). 
-------------------------------------------------------------------------------- /tutorials-and-examples/mlflow/finetune-gemma/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tune gemma-2-9b and track as an experiment in MLFlow 2 | 3 | >[!WARNING] 4 | >The files for the Fine-tune gemma-2-9b and track as an experiment in MLFlow Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Fine-tune gemma-2-9b and track as an experiment in MLFlow](https://gke-ai-labs.dev/docs/tutorials/mlflow/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/models-as-oci/README.md: -------------------------------------------------------------------------------- 1 | # Package and Deploy from Hugging Face to Artifact Registry and GKE 2 | 3 | >[!WARNING] 4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Package and Deploy from Hugging Face to Artifact Registry and GKE](https://gke-ai-labs.dev/docs/tutorials/models-as-oci/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/nvidia-bionemo/README.md: -------------------------------------------------------------------------------- 1 | ### Pretraining and Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework 2.0 2 | 3 | >[!WARNING] 4 | >The files for the Pretraining and Fine-tuning ESM-2 LLM on GKE using BioNeMo Framework have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/bionemo/README.md) repository. For more information, please refer to the [NVIDIA BioNeMo tutorial](https://gke-ai-labs.dev/docs/blueprints/bionemo/). 
5 | -------------------------------------------------------------------------------- /tutorials-and-examples/nvidia-nim/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA NIM on GKE 2 | 3 | >[!WARNING] 4 | >The files for the NVIDIA Inference Microservices (NIM) on GKE have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/nim/quickstart) repository. For more information, please refer to the [NVIDIA NIM & Blueprints on GKE tutorial](https://gke-ai-labs.dev/docs/blueprints/nims-on-gke/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/nvidia-nim/blueprints/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA NIM Blueprints on GKE 2 | 3 | >[!WARNING] 4 | >The files for the NIM Blueprints on GKE have been moved to the [AI-on-GKE/nvidia-ai-solutions](https://github.com/ai-on-gke/nvidia-ai-solutions/blob/main/nim/blueprints) repository. For more information, please refer to the [NVIDIA NIM on GKE tutorial](https://gke-ai-labs.dev/docs/blueprints/nims-on-gke/). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/skypilot/README.md: -------------------------------------------------------------------------------- 1 | # GKE cross region capacity chasing with SkyPilot 2 | >[!WARNING] 3 | >The files for the GKE cross region capacity chasing with SkyPilot Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [GKE cross region capacity chasing with SkyPilot](https://gke-ai-labs.dev/docs/tutorials/skypilot/cross-region-capacity-chasing/). 
-------------------------------------------------------------------------------- /tutorials-and-examples/skypilot/dws-and-kueue/README.md: -------------------------------------------------------------------------------- 1 | # Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE 2 | 3 | >[!WARNING] 4 | >The files for the Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE Guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Efficient GPU Resource Management for ML Workloads using SkyPilot, Kueue on GKE](https://gke-ai-labs.dev/docs/tutorials/skypilot/resource-management-using-kueue/). -------------------------------------------------------------------------------- /tutorials-and-examples/storage/hyperdisk-ml/README.md: -------------------------------------------------------------------------------- 1 | ## Populate a Hyperdisk ML Disk from Google Cloud Storage 2 | 3 | >[!WARNING] 4 | >The files for this guide have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Populate a Hyperdisk ML Disk from Google Cloud Storage](https://gke-ai-labs.dev/docs/tutorials/hyperdisk-ml/). -------------------------------------------------------------------------------- /tutorials-and-examples/storage/parallelstore-backup-and-recovery/README.md: -------------------------------------------------------------------------------- 1 | # Data backup and recovery for Parallelstore 2 | 3 | >[!WARNING] 4 | >The files for the Data backup and recovery for Parallelstore example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. 
For more information, please refer to the [Data backup and recovery for Parallelstore tutorial](https://github.com/ai-on-gke/tutorials-and-examples/tree/main/storage/parallelstore-backup-and-recovery). 5 | -------------------------------------------------------------------------------- /tutorials-and-examples/storage/parallelstore-backup-and-recovery/parallelstore-sa.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Service Account that has access to Parallelstore and GCS 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: parallelstore-sa 7 | namespace: default 8 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/jax/bert/loadbalancer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | labels: 19 | run: tf-bert-service 20 | name: tf-bert-service 21 | spec: 22 | ports: 23 | - name: grpc 24 | port: 8500 25 | protocol: TCP 26 | targetPort: 8500 27 | - name: http 28 | port: 8501 29 | protocol: TCP 30 | targetPort: 8501 31 | selector: 32 | app: tf-bert-server 33 | type: LoadBalancer 34 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/jax/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | numpy 3 | tensorflow 4 | pillow 5 | tensorflow-serving-api 6 | transformers 7 | diffusers 8 | flask -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/jax/stable-diffusion/README.md: -------------------------------------------------------------------------------- 1 | ## Serve (online inference) a model using a single TPU and GKE 2 | 3 | To better understand how TPUs work on GKE, please read the doc 4 | [TPUs in GKE introduction](https://cloud.google.com/tpu/docs/tpus-in-gke). 5 | 6 | This directory contains files for [JAX Model inference and serving](https://cloud.google.com/tpu/docs/tpus-in-gke#jax-model). You can find step-by-step instructions in the quickstart guide. 7 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/jax/stable-diffusion/loadbalancer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | labels: 19 | run: tf-stable-diffusion-service 20 | name: tf-stable-diffusion-service 21 | spec: 22 | ports: 23 | - name: grpc 24 | port: 8500 25 | protocol: TCP 26 | targetPort: 8500 27 | - name: http 28 | port: 8501 29 | protocol: TCP 30 | targetPort: 8501 31 | selector: 32 | app: tf-stable-diffusion-server 33 | type: LoadBalancer 34 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/pt/densenet161/loadbalancer.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: densenet161-service 5 | spec: 6 | type: LoadBalancer 7 | ports: 8 | - name: densenet161-http-inference 9 | port: 8080 10 | protocol: TCP 11 | targetPort: 8080 12 | - name: densenet161-http-management 13 | port: 8081 14 | protocol: TCP 15 | targetPort: 8081 16 | - name: densenet161-http-metrics 17 | port: 8082 18 | protocol: TCP 19 | targetPort: 8082 20 | selector: 21 | app: densenet161-server 22 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/pt/densenet161/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/banana.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/banana.jpeg -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/loadbalancer.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: resnet-service 5 | spec: 6 | type: LoadBalancer 7 | ports: 8 | - name: resnet-grpc 9 | port: 8500 10 | protocol: TCP 11 | targetPort: 8500 12 | - name: resnet-http 13 | port: 8501 14 | protocol: TCP 15 | targetPort: 8501 16 | selector: 17 | app: resnet-server 18 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/single-host-inference/tf/resnet50/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | numpy 3 | tensorflow 4 | pillow -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/diffusion/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | 3 | RUN git clone https://github.com/huggingface/diffusers.git 4 | 5 | WORKDIR diffusers 6 | 7 | RUN pip install "jax[tpu]" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html 8 | RUN pip install . 
9 | RUN pip install tensorflow clu 10 | RUN pip install -U -r examples/text_to_image/requirements_flax.txt -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/gpt/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm 2 | 3 | RUN apt-get update -y 4 | 5 | RUN apt-get install libomp5 -y 6 | RUN pip3 install mkl mkl-include 7 | RUN pip3 install tf-nightly tb-nightly tbp-nightly 8 | RUN pip3 install numpy 9 | RUN apt-get install numactl libopenblas-dev -y 10 | 11 | RUN ln -s /usr/local/lib/libmkl_intel_ilp64.so.2 /usr/local/lib/libmkl_intel_ilp64.so.1 12 | 13 | RUN rm -rf transformers 14 | RUN git clone https://github.com/huggingface/transformers.git 15 | 16 | WORKDIR transformers 17 | 18 | RUN git checkout -q ebdb185befaa821304d461ed6aa20a17e4dc3aa2 19 | RUN pip3 install -e . 20 | RUN pip3 install datasets 21 | RUN pip3 install evaluate 22 | RUN pip3 install scikit-learn 23 | 24 | COPY . . 
25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/gpt/fsdp_config.json: -------------------------------------------------------------------------------- 1 | {"fsdp_transformer_layer_cls_to_wrap":["GPT2Block", "GPT2MLP", "GPT2Attention"], 2 | "xla":true, 3 | "xla_fsdp_settings":{"compute_dtype":"bfloat16", 4 | "shard_param_on_dim_0":true, 5 | "pin_layout_in_collective_ops":true 6 | }, 7 | "xla_fsdp_grad_ckpt":true 8 | } 9 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/gpt/my_config_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function": "gelu_new", 3 | "architectures": [ 4 | "GPT2LMHeadModel" 5 | ], 6 | "attn_pdrop": 0.1, 7 | "bos_token_id": 50256, 8 | "embd_pdrop": 0.1, 9 | "eos_token_id": 50256, 10 | "initializer_range": 0.02, 11 | "layer_norm_epsilon": 1e-05, 12 | "model_type": "gpt2", 13 | "n_embd": 3072, 14 | "n_head": 24, 15 | "n_layer": 18, 16 | "n_inner": 12288, 17 | "n_positions": 1024, 18 | "resid_pdrop": 0.1, 19 | "summary_activation": null, 20 | "summary_first_dropout": 0.1, 21 | "summary_proj_to_labels": true, 22 | "summary_type": "cls_index", 23 | "summary_use_proj": true, 24 | "task_specific_params": { 25 | "text-generation": { 26 | "do_sample": true, 27 | "max_length": 50 28 | } 29 | }, 30 | "vocab_size": 50257 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/README.md: -------------------------------------------------------------------------------- 1 | # Deploy TPUs on GKE 2 | 3 | To deploy TPU workloads on GKE, see the following pages: 4 | 5 | * [Deploy TPU workloads on GKE Autopilot mode](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus-autopilot) 6 | * [Deploy 
TPU workloads on GKE Standard mode](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus) 7 | -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/0.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/1.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/2.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/3.png -------------------------------------------------------------------------------- 
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/4.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/5.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/6.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/7.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/8.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/ai-on-gke/bb96e0e2b9fa8cdeb6bb6bb7a04f8d3c38f4b048/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/data/mnist_predict/9.png -------------------------------------------------------------------------------- /tutorials-and-examples/tpu-examples/training/mnist-single-tpu/src/gke-config/standard-tensorflow-bash-v5e.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: test-tensorflow-pod 5 | annotations: 6 | gke-gcsfuse/volumes: "true" 7 | spec: 8 | nodeSelector: 9 | cloud.google.com/gke-tpu-topology: 2x2 # target topology 10 | cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice 11 | containers: 12 | - name: tensorflow 13 | image: tensorflow/tensorflow:2.14.0 14 | securityContext: 15 | privileged: true 16 | command: ["/bin/bash", "-c", "--"] 17 | args: ["while true; do sleep infinity; done;"] 18 | resources: 19 | requests: 20 | google.com/tpu: "4" # TPU chip request 21 | limits: 22 | google.com/tpu: "4" # TPU chip limit 23 | volumeMounts: 24 | - name: gcs-fuse-csi-vol 25 | mountPath: /data 26 | readOnly: false 27 | volumes: 28 | - name: gcs-fuse-csi-vol 29 | csi: 30 | driver: gcsfuse.csi.storage.gke.io 31 | readOnly: false 32 | volumeAttributes: 33 | bucketName: $BUCKET_NAME 34 | mountOptions: "implicit-dirs" 35 | -------------------------------------------------------------------------------- 
/tutorials-and-examples/tpu-examples/training/mnist-single-tpu/src/tensorflow-mnist-example/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-datasets -------------------------------------------------------------------------------- /tutorials-and-examples/vector-databases/readme.md: -------------------------------------------------------------------------------- 1 | # Vector Database Repo 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. -------------------------------------------------------------------------------- /tutorials-and-examples/workflow-orchestration/dws-examples/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Workload Scheduler examples 2 | 3 | 4 | >[!WARNING] 5 | >The files for the Kueue with DWS and GKE autopilot example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Kueue with DWS and GKE autopilot tutorial](https://gke-ai-labs.dev/docs/tutorials/workflow-orchestration/dws/s). 6 | -------------------------------------------------------------------------------- /tutorials-and-examples/workflow-orchestration/dws-multiclusters-example/README.md: -------------------------------------------------------------------------------- 1 | # Multikueue-dws-integration 2 | 3 | >[!WARNING] 4 | >The files for the Multikueue with DWS and GKE autopilot example have been moved to the [AI-on-GKE/tutorials-and-examples](https://github.com/ai-on-gke/tutorials-and-examples) repository. For more information, please refer to the [Multikueue with DWS and GKE autopilot tutorial](https://gke-ai-labs.dev/docs/tutorials/workflow-orchestration/multikueue-dws). 
-------------------------------------------------------------------------------- /tutorials-and-examples/workflow-orchestration/indexed-job/README.md: -------------------------------------------------------------------------------- 1 | # Running distributed ML training workloads on GKE using Indexed Jobs 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. -------------------------------------------------------------------------------- /tutorials-and-examples/workflow-orchestration/jobset/pytorch/README.md: -------------------------------------------------------------------------------- 1 | # Running distributed ML training workloads on GKE using the JobSet API 2 | 3 | >[!WARNING] 4 | >This guide and associated code are **deprecated** and no longer maintained. 5 | > 6 | >Please refer to the [GKE AI Labs website](https://gke-ai-labs.dev) for the latest tutorials and quick start solutions. --------------------------------------------------------------------------------