├── .gitignore ├── envs └── staging │ ├── variables.sh │ ├── cluster-spec.yml │ └── kubeflow-us-east-1 │ └── kfctl_istio.yaml ├── deploy_kubeflow.sh ├── variables.sh ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.gif 2 | *.kubeconfig 3 | envs/*/*/.cache/* 4 | envs/*/*/aws_config/* 5 | envs/*/*/kustomize/* 6 | terminal-recordings/* -------------------------------------------------------------------------------- /envs/staging/variables.sh: -------------------------------------------------------------------------------- 1 | ## Environment specific values. 2 | set -x 3 | # when replicating, ensure to change the variable as needed. 4 | export ENVIRONMENT=staging 5 | 6 | # kubectl 7 | export KUBECONFIG="$(pwd)/envs/$ENVIRONMENT/.kubeconfig" 8 | 9 | # Use binaries in .bin 10 | export PATH="$(pwd)/.bin:$PATH" 11 | 12 | set +x -------------------------------------------------------------------------------- /deploy_kubeflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail # stop at the first failed step or unset variable instead of continuing with an empty KF_DIR 3 | source variables.sh # defines CONFIG_URI, KF_NAME, BASE_DIR and KF_DIR (relative paths: run from the repository root) 4 | mkdir -p "${KF_DIR}" 5 | 6 | # Download the config file and change the default login credentials. 7 | # wget -O "${KF_DIR}/kfctl_istio.yaml" "$CONFIG_URI" 8 | export CONFIG_FILE="${KF_DIR}/kfctl_istio.yaml" 9 | # NOTE(review): the bundled kfctl_istio.yaml has no dex-auth application (it configures OIDC under aws/istio-ingress); confirm the note below still applies. 10 | # Credentials for the default user are admin@kubeflow.org:12341234 11 | # To change them, please edit the dex-auth application parameters 12 | # inside the KfDef file. 13 | vim "$CONFIG_FILE" 14 | 15 | kfctl apply -V -f "${CONFIG_FILE}" 16 | -------------------------------------------------------------------------------- /variables.sh: -------------------------------------------------------------------------------- 1 | # Set the following kfctl configuration file: 2 | 3 | ## Environment specific values. 
4 | export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/v1.0.1/kfdef/kfctl_aws_cognito.v1.0.1.yaml" 5 | 6 | # Set KF_NAME to the name of your Kubeflow deployment. You also use this 7 | # value as directory name when creating your configuration directory. 8 | # For example, your deployment name can be 'my-kubeflow' or 'kf-test'. 9 | # This name should match the kubeflow cluster name that can be found under envs/$ENVIRONMENT/cluster-spec.yml > metadata.name 10 | # If the name doesn't match, you will face issues with issuing an ELB. 11 | export KF_NAME=kubeflow-us-east-1 12 | 13 | # Set the path to the base directory where you want to store one or more 14 | # Kubeflow deployments. For example, /opt. 15 | # Then set the Kubeflow application directory for this deployment. 16 | export BASE_DIR=envs/staging 17 | export KF_DIR=${BASE_DIR}/${KF_NAME} 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Arjun Sunil 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kubeflow on Spot instances 2 | 3 | 4 | 5 | ![Kubeflow Logo](https://i.postimg.cc/SKNrtQ19/kubeflow-logo-1024x390.png) 6 | 7 | 8 | ![Kubernetes](http://img.shields.io/badge/kubernetes-1.14-blue?style=for-the-badge&logo=kubernetes) 9 | ![GitHub issues](https://img.shields.io/github/issues/arjun921/aws-spot-instances-kubeflow?style=for-the-badge) 10 | ![GitHub forks](https://img.shields.io/github/forks/arjun921/aws-spot-instances-kubeflow?style=for-the-badge) 11 | ![GitHub Stars](https://img.shields.io/github/stars/arjun921/aws-spot-instances-kubeflow?style=for-the-badge) 12 | ![GitHub License](https://img.shields.io/github/license/arjun921/aws-spot-instances-kubeflow?style=for-the-badge) 13 | 14 | 15 | 16 | 17 | Config files for setting up Multitenant Kubeflow on AWS with spot instances 18 | Repo contains supporting code for [How we reduced our ML training costs by 78%](https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf) 19 | 20 | 21 | ## By the end of this tutorial, you will have: 22 | - An EKS cluster with Kubernetes 1.14 on AWS 23 | - Autoscaling with Nodegroup autodiscovery enabled 24 | - GPU nodes 25 | - With scale-down-to-zero at no workload 26 | - Spot Instance purchase enabled by default 27 | - Kubeflow 1.0.1 running on the cluster with only GPU requesting resources running on GPU nodes 28 | 29 | # TLDR; 30 | ![](https://cdn-images-1.medium.com/max/800/1*77giv9ZqFwXytwXRQJ4ayw.gif) 31 | ```bash 32 | # setup environment 33 | export ENVIRONMENT=staging 34 | export AWS_PROFILE= 
35 | source envs/$ENVIRONMENT/variables.sh 36 | # Create cluster 37 | eksctl create cluster -f envs/$ENVIRONMENT/cluster-spec.yml 38 | kubectl cluster-info # to check if the cluster is connected 39 | # set executable 40 | chmod a+x *.sh 41 | # Deploy Kubeflow 42 | ./deploy_kubeflow.sh 43 | ``` 44 | 45 | 46 | 47 | 48 | ## Prerequisites 49 | ### AWS 50 | - [CLI Programmatic Access Keys](https://www.youtube.com/watch?v=l9kkdRiDFQw) 51 | - Keys to manipulate resources on AWS 52 | ### CLI 53 | - [eksctl](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-eksctl.html) 54 | - To create the cluster 55 | - [aws-cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 56 | - eksctl dependency 57 | - [aws-iam-authenticator](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) 58 | - eksctl dependency 59 | - [kubectl](https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html) 60 | - To manage the kubernetes cluster 61 | - [helm3](https://helm.sh/docs/intro/install/) 62 | - To deploy helm charts 63 | 64 | ## Cluster Spec 65 | The cluster that gets spun up will have the following specs: 66 | - **ng-1** 67 | - m5a.2xlarge 68 | - min nodes: 0 69 | - max nodes: 3 70 | - vol: 100 GB 71 | - **ng-2** 72 | - m5a.2xlarge 73 | - min: 0 74 | - max: 10 75 | - vol: 20 GB 76 | - **1-gpu-spot-p2-xlarge** 77 | - p2.xlarge 78 | - min nodes: 0 79 | - max nodes: 10 80 | - max price: $1.2 81 | - **1-gpu-spot-p3-2xlarge** 82 | - p3.2xlarge 83 | - min nodes: 0 84 | - max nodes: 10 85 | - max price: $1.2 86 | - **4-gpu-spot-p3-8xlarge** 87 | - p3.8xlarge 88 | - min nodes: 0 89 | - max nodes: 4 90 | - max price: OnDemand 91 | - **8-gpu-spot-p3dn-24xlarge** -- *Disabled by default* 92 | - p3dn.24xlarge 93 | - min nodes: 0 94 | - max nodes: 1 95 | - max price: $11 96 | -------------------------------------------------------------------------------- /envs/staging/cluster-spec.yml: 
-------------------------------------------------------------------------------- 1 | # Author: Arjun Sunil 2 | # Connect with me: https://arjunsunil.com 3 | # Supporting tutorial can be found at the following link: https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf 4 | # EKS Cluster Spec with Spot GPU Nodes for Kubeflow with necessary flags enabled for Autoscaling and GPU nodes scale down to 0 at no workload. 5 | # Built in an effort to reduce the training costs of ML workloads. 6 | # This spec creates a cluster on EKS with the following active nodes 7 | # = 2x m5a.2xlarge - Accommodates all pods of Kubeflow 8 | # It also creates the following nodegroups with 0 nodes running unless a pod comes along and requests for the node to get spun up 9 | # = m5a.2xlarge -- Max Allowed 10 worker nodes 10 | # = p2.xlarge -- Max Allowed 10 worker nodes 11 | # = p3.2xlarge -- Max Allowed 10 worker nodes 12 | # = p3.8xlarge -- Max Allowed 04 worker nodes 13 | # = p3dn.24xlarge -- Max Allowed 01 worker nodes (nodegroup is commented out by default) 14 | 15 | apiVersion: eksctl.io/v1alpha5 16 | kind: ClusterConfig 17 | 18 | metadata: 19 | # Name of your cluster, change to whatever you find fit. 20 | # if changed, make sure to change all nodegroup tags from 'k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"' 21 | name: kubeflow-us-east-1 22 | # choose your region wisely, this will significantly impact the cost incurred 23 | region: us-east-1 24 | # 1.14 Kubernetes version since Kubeflow 1.0 has been tested with the same. 25 | version: '1.14' 26 | tags: 27 | # Add more cloud tags if needed for billing 28 | environment: staging 29 | 30 | # Add all possible AZs to ensure nodes can be spun up in any AZ later on. 31 | # THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT. 
32 | # This list applies to the whole cluster and isn't specific to nodegroups 33 | availabilityZones: ["us-east-1a", "us-east-1b", "us-east-1d", "us-east-1f"] 34 | 35 | nodeGroups: 36 | - name: ng-1 37 | desiredCapacity: 2 38 | minSize: 0 39 | maxSize: 3 40 | # Set one nodegroup with 100GB volumes for Kubeflow to get deployed. 41 | # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node. 42 | volumeSize: 100 43 | volumeType: gp2 44 | instanceType: m5a.2xlarge 45 | availabilityZones: ["us-east-1a"] 46 | labels: 47 | node-class: "worker-node" 48 | tags: 49 | # EC2 tags required for cluster-autoscaler auto-discovery 50 | k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand 51 | k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "false" 52 | k8s.io/cluster-autoscaler/node-template/label/gpu-count: "0" 53 | k8s.io/cluster-autoscaler/enabled: "true" 54 | k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 55 | iam: 56 | withAddonPolicies: 57 | albIngress: true 58 | autoScaler: true 59 | cloudWatch: true 60 | 61 | - name: ng-2 62 | desiredCapacity: 0 63 | minSize: 0 64 | maxSize: 10 65 | volumeSize: 20 66 | volumeType: gp2 67 | instanceType: m5a.2xlarge 68 | availabilityZones: ["us-east-1a"] 69 | labels: 70 | node-class: "worker-node" 71 | tags: 72 | # EC2 tags required for cluster-autoscaler auto-discovery 73 | k8s.io/cluster-autoscaler/node-template/label/lifecycle: OnDemand 74 | k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "false" 75 | k8s.io/cluster-autoscaler/node-template/label/gpu-count: "0" 76 | k8s.io/cluster-autoscaler/enabled: "true" 77 | k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 78 | iam: 79 | withAddonPolicies: 80 | albIngress: true 81 | autoScaler: true 82 | cloudWatch: true 83 | 84 | - name: 1-gpu-spot-p2-xlarge 85 | minSize: 0 86 | maxSize: 10 87 | instancesDistribution: 88 | # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
89 | # Comment out the field to default to OnDemand as max price. 90 | maxPrice: 1.2 91 | instanceTypes: ["p2.xlarge"] 92 | onDemandBaseCapacity: 0 93 | onDemandPercentageAboveBaseCapacity: 0 94 | spotAllocationStrategy: capacity-optimized 95 | labels: 96 | lifecycle: Ec2Spot 97 | aws.amazon.com/spot: "true" 98 | gpu-count: "1" 99 | # Stick to one AZ for all GPU nodes. In case of termination, this will prevent volumes from being unavailable if the new instance got spun up in another AZ. 100 | availabilityZones: ["us-east-1f"] 101 | taints: 102 | spotInstance: "true:PreferNoSchedule" 103 | tags: 104 | k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot 105 | k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true" 106 | k8s.io/cluster-autoscaler/node-template/label/gpu-count: "1" 107 | k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule" 108 | k8s.io/cluster-autoscaler/enabled: "true" 109 | k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 110 | iam: 111 | withAddonPolicies: 112 | autoScaler: true 113 | cloudWatch: true 114 | albIngress: true 115 | 116 | - name: 1-gpu-spot-p3-2xlarge 117 | minSize: 0 118 | maxSize: 10 119 | instancesDistribution: 120 | # set your own max price. AWS spot instance prices no longer cross OnDemand price. 121 | # Comment out the field to default to OnDemand as max price. 122 | maxPrice: 1.2 123 | instanceTypes: ["p3.2xlarge"] 124 | onDemandBaseCapacity: 0 125 | onDemandPercentageAboveBaseCapacity: 0 126 | spotAllocationStrategy: capacity-optimized 127 | labels: 128 | lifecycle: Ec2Spot 129 | aws.amazon.com/spot: "true" 130 | gpu-count: "1" 131 | # Stick to one AZ for all GPU nodes. In case of termination, this will prevent volumes from being unavailable if the new instance got spun up in another AZ. 
132 | availabilityZones: ["us-east-1f"] 133 | taints: 134 | spotInstance: "true:PreferNoSchedule" 135 | tags: 136 | k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot 137 | k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true" 138 | k8s.io/cluster-autoscaler/node-template/label/gpu-count: "1" 139 | k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule" 140 | k8s.io/cluster-autoscaler/enabled: "true" 141 | k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 142 | iam: 143 | withAddonPolicies: 144 | autoScaler: true 145 | cloudWatch: true 146 | albIngress: true 147 | 148 | - name: 4-gpu-spot-p3-8xlarge 149 | minSize: 0 150 | maxSize: 4 151 | instancesDistribution: 152 | # set your own max price. AWS spot instance prices no longer cross OnDemand price. 153 | # Comment out the field to default to OnDemand as max price. 154 | # maxPrice: 4.4 155 | instanceTypes: ["p3.8xlarge"] 156 | onDemandBaseCapacity: 0 157 | onDemandPercentageAboveBaseCapacity: 0 158 | spotAllocationStrategy: capacity-optimized 159 | labels: 160 | lifecycle: Ec2Spot 161 | aws.amazon.com/spot: "true" 162 | gpu-count: "4" 163 | # Stick to one AZ for all GPU nodes. In case of termination, this will prevent volumes from being unavailable if the new instance got spun up in another AZ. 
164 | availabilityZones: ["us-east-1f"] 165 | taints: 166 | spotInstance: "true:PreferNoSchedule" 167 | tags: 168 | k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot 169 | k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true" 170 | k8s.io/cluster-autoscaler/node-template/label/gpu-count: "4" 171 | k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule" 172 | k8s.io/cluster-autoscaler/enabled: "true" 173 | k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 174 | iam: 175 | withAddonPolicies: 176 | autoScaler: true 177 | cloudWatch: true 178 | albIngress: true 179 | 180 | # - name: 8-gpu-spot-p3dn-24xlarge 181 | # minSize: 0 182 | # maxSize: 1 183 | # instancesDistribution: 184 | # # set your own max price. AWS spot instance prices no longer cross OnDemand price. 185 | # # Comment out the field to default to OnDemand as max price. 186 | # maxPrice: 11 187 | # instanceTypes: ["p3dn.24xlarge"] 188 | # onDemandBaseCapacity: 0 189 | # onDemandPercentageAboveBaseCapacity: 0 190 | # spotAllocationStrategy: capacity-optimized 191 | # labels: 192 | # lifecycle: Ec2Spot 193 | # aws.amazon.com/spot: "true" 194 | # gpu-count: "8" 195 | # availabilityZones: ["us-east-1f"] 196 | # taints: 197 | # spotInstance: "true:PreferNoSchedule" 198 | # tags: 199 | # k8s.io/cluster-autoscaler/node-template/label/lifecycle: Ec2Spot 200 | # k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/spot: "true" 201 | # k8s.io/cluster-autoscaler/node-template/label/gpu-count: "8" 202 | # k8s.io/cluster-autoscaler/node-template/taint/spotInstance: "true:PreferNoSchedule" 203 | # k8s.io/cluster-autoscaler/enabled: "true" 204 | # k8s.io/cluster-autoscaler/kubeflow-us-east-1: "owned" 205 | # iam: 206 | # withAddonPolicies: 207 | # autoScaler: true 208 | # cloudWatch: true 209 | # albIngress: true 210 | -------------------------------------------------------------------------------- /envs/staging/kubeflow-us-east-1/kfctl_istio.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: kfdef.apps.kubeflow.org/v1 2 | kind: KfDef 3 | metadata: 4 | clusterName: kubeflow-us-east-1.us-east-1.eksctl.io 5 | creationTimestamp: null 6 | name: kubeflow-us-east-1 7 | namespace: kubeflow 8 | spec: 9 | applications: 10 | - kustomizeConfig: 11 | parameters: 12 | - name: namespace 13 | value: istio-system 14 | repoRef: 15 | name: manifests 16 | path: istio/istio-crds 17 | name: istio-crds 18 | - kustomizeConfig: 19 | parameters: 20 | - name: namespace 21 | value: istio-system 22 | repoRef: 23 | name: manifests 24 | path: istio/istio-install 25 | name: istio-install 26 | - kustomizeConfig: 27 | parameters: 28 | - name: namespace 29 | value: istio-system 30 | repoRef: 31 | name: manifests 32 | path: istio/cluster-local-gateway 33 | name: cluster-local-gateway 34 | - kustomizeConfig: 35 | parameters: 36 | - name: clusterRbacConfig 37 | value: "ON" 38 | repoRef: 39 | name: manifests 40 | path: istio/istio 41 | name: istio 42 | - kustomizeConfig: 43 | repoRef: 44 | name: manifests 45 | path: application/application-crds 46 | name: application-crds 47 | - kustomizeConfig: 48 | overlays: 49 | - application 50 | repoRef: 51 | name: manifests 52 | path: application/application 53 | name: application 54 | - kustomizeConfig: 55 | parameters: 56 | - name: namespace 57 | value: cert-manager 58 | repoRef: 59 | name: manifests 60 | path: cert-manager/cert-manager-crds 61 | name: cert-manager-crds 62 | - kustomizeConfig: 63 | parameters: 64 | - name: namespace 65 | value: kube-system 66 | repoRef: 67 | name: manifests 68 | path: cert-manager/cert-manager-kube-system-resources 69 | name: cert-manager-kube-system-resources 70 | - kustomizeConfig: 71 | overlays: 72 | - self-signed 73 | - application 74 | parameters: 75 | - name: namespace 76 | value: cert-manager 77 | repoRef: 78 | name: manifests 79 | path: cert-manager/cert-manager 80 | name: cert-manager 81 | - kustomizeConfig: 82 
| repoRef: 83 | name: manifests 84 | path: metacontroller 85 | name: metacontroller 86 | - kustomizeConfig: 87 | overlays: 88 | - istio 89 | - application 90 | repoRef: 91 | name: manifests 92 | path: argo 93 | name: argo 94 | - kustomizeConfig: 95 | repoRef: 96 | name: manifests 97 | path: kubeflow-roles 98 | name: kubeflow-roles 99 | - kustomizeConfig: 100 | overlays: 101 | - istio 102 | - application 103 | parameters: 104 | - name: userid-header 105 | value: kubeflow-userid 106 | repoRef: 107 | name: manifests 108 | path: common/centraldashboard 109 | name: centraldashboard 110 | - kustomizeConfig: 111 | overlays: 112 | - application 113 | repoRef: 114 | name: manifests 115 | path: admission-webhook/webhook 116 | name: webhook 117 | - kustomizeConfig: 118 | overlays: 119 | - application 120 | parameters: 121 | - name: webhookNamePrefix 122 | value: admission-webhook- 123 | repoRef: 124 | name: manifests 125 | path: admission-webhook/bootstrap 126 | name: bootstrap 127 | - kustomizeConfig: 128 | overlays: 129 | - istio 130 | - application 131 | parameters: 132 | - name: userid-header 133 | value: kubeflow-userid 134 | repoRef: 135 | name: manifests 136 | path: jupyter/jupyter-web-app 137 | name: jupyter-web-app 138 | - kustomizeConfig: 139 | overlays: 140 | - application 141 | repoRef: 142 | name: manifests 143 | path: spark/spark-operator 144 | name: spark-operator 145 | - kustomizeConfig: 146 | overlays: 147 | - istio 148 | - application 149 | - db 150 | repoRef: 151 | name: manifests 152 | path: metadata 153 | name: metadata 154 | - kustomizeConfig: 155 | overlays: 156 | - istio 157 | - application 158 | repoRef: 159 | name: manifests 160 | path: jupyter/notebook-controller 161 | name: notebook-controller 162 | - kustomizeConfig: 163 | overlays: 164 | - application 165 | repoRef: 166 | name: manifests 167 | path: pytorch-job/pytorch-job-crds 168 | name: pytorch-job-crds 169 | - kustomizeConfig: 170 | overlays: 171 | - application 172 | repoRef: 173 | name: 
manifests 174 | path: pytorch-job/pytorch-operator 175 | name: pytorch-operator 176 | - kustomizeConfig: 177 | overlays: 178 | - application 179 | parameters: 180 | - name: usageId 181 | value: "7996476243788582144" 182 | - name: reportUsage 183 | value: "true" 184 | repoRef: 185 | name: manifests 186 | path: common/spartakus 187 | name: spartakus 188 | - kustomizeConfig: 189 | overlays: 190 | - istio 191 | repoRef: 192 | name: manifests 193 | path: tensorboard 194 | name: tensorboard 195 | - kustomizeConfig: 196 | overlays: 197 | - application 198 | repoRef: 199 | name: manifests 200 | path: tf-training/tf-job-crds 201 | name: tf-job-crds 202 | - kustomizeConfig: 203 | overlays: 204 | - application 205 | repoRef: 206 | name: manifests 207 | path: tf-training/tf-job-operator 208 | name: tf-job-operator 209 | - kustomizeConfig: 210 | overlays: 211 | - application 212 | repoRef: 213 | name: manifests 214 | path: katib/katib-crds 215 | name: katib-crds 216 | - kustomizeConfig: 217 | overlays: 218 | - application 219 | - istio 220 | repoRef: 221 | name: manifests 222 | path: katib/katib-controller 223 | name: katib-controller 224 | - kustomizeConfig: 225 | overlays: 226 | - application 227 | repoRef: 228 | name: manifests 229 | path: pipeline/api-service 230 | name: api-service 231 | - kustomizeConfig: 232 | overlays: 233 | - application 234 | parameters: 235 | - name: minioPvName 236 | value: minio-pv 237 | - name: minioPvcName 238 | value: minio-pv-claim 239 | repoRef: 240 | name: manifests 241 | path: pipeline/minio 242 | name: minio 243 | - kustomizeConfig: 244 | overlays: 245 | - application 246 | parameters: 247 | - name: mysqlPvName 248 | value: mysql-pv 249 | - name: mysqlPvcName 250 | value: mysql-pv-claim 251 | repoRef: 252 | name: manifests 253 | path: pipeline/mysql 254 | name: mysql 255 | - kustomizeConfig: 256 | overlays: 257 | - application 258 | repoRef: 259 | name: manifests 260 | path: pipeline/persistent-agent 261 | name: persistent-agent 262 | - 
kustomizeConfig: 263 | overlays: 264 | - application 265 | repoRef: 266 | name: manifests 267 | path: pipeline/pipelines-runner 268 | name: pipelines-runner 269 | - kustomizeConfig: 270 | overlays: 271 | - istio 272 | - application 273 | repoRef: 274 | name: manifests 275 | path: pipeline/pipelines-ui 276 | name: pipelines-ui 277 | - kustomizeConfig: 278 | overlays: 279 | - application 280 | repoRef: 281 | name: manifests 282 | path: pipeline/pipelines-viewer 283 | name: pipelines-viewer 284 | - kustomizeConfig: 285 | overlays: 286 | - application 287 | repoRef: 288 | name: manifests 289 | path: pipeline/scheduledworkflow 290 | name: scheduledworkflow 291 | - kustomizeConfig: 292 | overlays: 293 | - application 294 | repoRef: 295 | name: manifests 296 | path: pipeline/pipeline-visualization-service 297 | name: pipeline-visualization-service 298 | - kustomizeConfig: 299 | overlays: 300 | - application 301 | - istio 302 | parameters: 303 | - name: userid-header 304 | value: kubeflow-userid 305 | repoRef: 306 | name: manifests 307 | path: profiles 308 | name: profiles 309 | - kustomizeConfig: 310 | overlays: 311 | - application 312 | repoRef: 313 | name: manifests 314 | path: seldon/seldon-core-operator 315 | name: seldon-core 316 | - kustomizeConfig: 317 | overlays: 318 | - application 319 | repoRef: 320 | name: manifests 321 | path: mpi-job/mpi-operator 322 | name: mpi-operator 323 | - kustomizeConfig: 324 | overlays: 325 | - oidc 326 | parameters: 327 | - name: namespace 328 | value: istio-system 329 | - name: oidcIssuer 330 | value: https://kubeflow-spot.auth0.com/ 331 | - name: oidcAuthorizationEndpoint 332 | value: https://kubeflow-spot.auth0.com/authorize 333 | - name: oidcTokenEndpoint 334 | value: https://kubeflow-spot.auth0.com/oauth/token 335 | - name: oidcUserInfoEndpoint 336 | value: https://kubeflow-spot.auth0.com/userinfo 337 | - name: certArn 338 | value: arn:aws:acm:us-east-1:########:certificate/#######-#####-###-# 339 | - name: oidcSecretName 340 | 
value: alb-oidc-secret 341 | repoRef: 342 | name: manifests 343 | path: aws/istio-ingress 344 | name: istio-ingress 345 | - kustomizeConfig: 346 | overlays: 347 | - application 348 | parameters: 349 | - name: namespace 350 | value: istio-system 351 | - name: origin-header 352 | value: x-amzn-oidc-data 353 | - name: custom-header 354 | value: kubeflow-userid 355 | repoRef: 356 | name: manifests 357 | path: aws/aws-istio-authz-adaptor 358 | name: aws-istio-authz-adaptor 359 | - kustomizeConfig: 360 | overlays: 361 | - application 362 | parameters: 363 | - name: clusterName 364 | value: kubeflow-us-east-1 365 | repoRef: 366 | name: manifests 367 | path: aws/aws-alb-ingress-controller 368 | name: aws-alb-ingress-controller 369 | - kustomizeConfig: 370 | overlays: 371 | - application 372 | repoRef: 373 | name: manifests 374 | path: aws/nvidia-device-plugin 375 | name: nvidia-device-plugin 376 | plugins: 377 | - kind: KfAwsPlugin 378 | metadata: 379 | creationTimestamp: null 380 | name: aws 381 | spec: 382 | auth: 383 | enablePodIamPolicy: true 384 | oidc: 385 | certArn: arn:aws:acm:us-east-1:########:certificate/#######-#####-###-# 386 | oAuthClientId: ENTER_YOUR_CLIENT_ID 387 | oAuthClientSecret: ENTER_YOUR_SECRET 388 | oidcAuthorizationEndpoint: https://kubeflow-spot.auth0.com/authorize 389 | oidcIssuer: https://kubeflow-spot.auth0.com/ 390 | oidcTokenEndpoint: https://kubeflow-spot.auth0.com/oauth/token 391 | oidcUserInfoEndpoint: https://kubeflow-spot.auth0.com/userinfo 392 | region: us-east-1 393 | repos: 394 | - name: manifests 395 | uri: https://github.com/kubeflow/manifests/archive/v1.0-branch.tar.gz --------------------------------------------------------------------------------